#include "kmp_error.h"
#include "kmp_stats.h"
#if KMP_USE_X87CONTROL
#include <float.h>
#endif
#include "kmp_dispatch.h"
#if KMP_USE_HIER_SCHED
#include "kmp_dispatch_hier.h"
#endif

#if OMPT_SUPPORT
#include "ompt-specific.h"
#endif
void __kmp_dispatch_deo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
  kmp_info_t *th;

  KMP_DEBUG_ASSERT(gtid_ref);
  if (__kmp_env_consistency_check) {
    th = __kmp_threads[*gtid_ref];
    if (th->th.th_root->r.r_active &&
        (th->th.th_dispatch->th_dispatch_pr_current->pushed_ws != ct_none)) {
#if KMP_USE_DYNAMIC_LOCK
      __kmp_push_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref, NULL, 0);
#else
      __kmp_push_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref, NULL);
#endif
    }
  }
}
void __kmp_dispatch_dxo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
  kmp_info_t *th;

  if (__kmp_env_consistency_check) {
    th = __kmp_threads[*gtid_ref];
    if (th->th.th_dispatch->th_dispatch_pr_current->pushed_ws != ct_none) {
      __kmp_pop_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref);
    }
  }
}
// Returns either SCHEDULE_MONOTONIC or SCHEDULE_NONMONOTONIC, picking up the
// monotonic/nonmonotonic modifier bits from the scheduling type.
static inline int __kmp_get_monotonicity(enum sched_type schedule,
                                         bool use_hier = false) {
  int monotonicity = SCHEDULE_MONOTONIC; // default to monotonic
  if (SCHEDULE_HAS_NONMONOTONIC(schedule))
    monotonicity = SCHEDULE_NONMONOTONIC;
  else if (SCHEDULE_HAS_MONOTONIC(schedule))
    monotonicity = SCHEDULE_MONOTONIC;
  return monotonicity;
}
template <typename T>
void __kmp_dispatch_init_algorithm(ident_t *loc, int gtid,
                                   dispatch_private_info_template<T> *pr,
                                   enum sched_type schedule, T lb, T ub,
                                   typename traits_t<T>::signed_t st,
                                   kmp_uint64 *cur_chunk,
                                   typename traits_t<T>::signed_t chunk,
                                   T nproc, T tid) {
  typedef typename traits_t<T>::unsigned_t UT;
  typedef typename traits_t<T>::floating_t DBL;

  int active;
  T tc;
  kmp_info_t *th;
  kmp_team_t *team;
  int monotonicity;
  bool use_hier;

#ifdef KMP_DEBUG
  typedef typename traits_t<T>::signed_t ST;
  {
    char *buff;
    // create format specifiers before the debug output
    buff = __kmp_str_format(
        "__kmp_dispatch_init_algorithm: T#%%d called "
        "pr:%%p lb:%%%s ub:%%%s st:%%%s "
        "schedule:%%d chunk:%%%s nproc:%%%s tid:%%%s\n",
        traits_t<T>::spec, traits_t<T>::spec,
        traits_t<ST>::spec, traits_t<ST>::spec,
        traits_t<T>::spec, traits_t<T>::spec);
    KD_TRACE(10, (buff, gtid, pr, lb, ub, st, schedule, chunk, nproc, tid));
    __kmp_str_free(&buff);
  }
#endif
  /* setup data */
  th = __kmp_threads[gtid];
  team = th->th.th_team;
  active = !team->t.t_serialized;

  int itt_need_metadata_reporting =
      __itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
      KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL &&
      team->t.t_active_level == 1;
#if KMP_USE_HIER_SCHED
  use_hier = pr->flags.use_hier;
#else
  use_hier = false;
#endif

  /* Pick up the nonmonotonic/monotonic bits from the scheduling type */
  monotonicity = __kmp_get_monotonicity(schedule, use_hier);
  schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);

  /* Pick up the nomerge/ordered bits from the scheduling type */
  if ((schedule >= kmp_nm_lower) && (schedule < kmp_nm_upper)) {
    pr->flags.nomerge = TRUE;
  } else {
    pr->flags.nomerge = FALSE;
  }
  pr->type_size = traits_t<T>::type_size; // remember the size of variables
  if (kmp_ord_lower & schedule) {
    pr->flags.ordered = TRUE;
  } else {
    pr->flags.ordered = FALSE;
  }
  // Ordered overrides nonmonotonic
  if (pr->flags.ordered) {
    monotonicity = SCHEDULE_MONOTONIC;
  }

  if (schedule == kmp_sch_static) {
    schedule = __kmp_static;
  } else if (schedule == kmp_sch_runtime) {
    // Use the schedule specified by OMP_SCHEDULE (or the default if not
    // specified)
    schedule = team->t.t_sched.r_sched_type;
    monotonicity = __kmp_get_monotonicity(schedule, use_hier);
    schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);
    // Detail the schedule if needed
    if (schedule == kmp_sch_guided_chunked) {
      schedule = __kmp_guided;
    } else if (schedule == kmp_sch_static) {
      schedule = __kmp_static;
    }
    // Use the chunk size specified by OMP_SCHEDULE (or the default)
    chunk = team->t.t_sched.chunk;
#ifdef KMP_DEBUG
    {
      char *buff;
      // create format specifiers before the debug output
      buff = __kmp_str_format("__kmp_dispatch_init_algorithm: T#%%d new: "
                              "schedule:%%d chunk:%%%s\n",
                              traits_t<ST>::spec);
      KD_TRACE(10, (buff, gtid, schedule, chunk));
      __kmp_str_free(&buff);
    }
#endif

    schedule = __kmp_guided;

    chunk = KMP_DEFAULT_CHUNK;

    schedule = __kmp_auto;

#ifdef KMP_DEBUG
    {
      char *buff;
      // create format specifiers before the debug output
      buff = __kmp_str_format(
          "__kmp_dispatch_init_algorithm: kmp_sch_auto: T#%%d new: "
          "schedule:%%d chunk:%%%s\n",
          traits_t<ST>::spec);
      KD_TRACE(10, (buff, gtid, schedule, chunk));
      __kmp_str_free(&buff);
    }
#endif
#if KMP_STATIC_STEAL_ENABLED
  // map nonmonotonic:dynamic to static steal
  if (schedule == kmp_sch_dynamic_chunked) {
    if (monotonicity == SCHEDULE_NONMONOTONIC)
      schedule = kmp_sch_static_steal;
  }
#endif

  /* guided analytical is not safe for too many threads */
  if (schedule == kmp_sch_guided_analytical_chunked && nproc > 1 << 20) {
    schedule = kmp_sch_guided_iterative_chunked;
    KMP_WARNING(DispatchManyThreads);
  }

  if (schedule == kmp_sch_runtime_simd) {
    // compiler provides the simd width in the chunk parameter
    schedule = team->t.t_sched.r_sched_type;
    monotonicity = __kmp_get_monotonicity(schedule, use_hier);
    schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);
    if (schedule == kmp_sch_static || schedule == kmp_sch_auto ||
        schedule == __kmp_static) {
      schedule = kmp_sch_static_balanced_chunked;
    } else {
      if (schedule == kmp_sch_guided_chunked || schedule == __kmp_guided) {
        schedule = kmp_sch_guided_simd;
      }
      chunk = team->t.t_sched.chunk * chunk;
    }

#ifdef KMP_DEBUG
    {
      char *buff;
      // create format specifiers before the debug output
      buff = __kmp_str_format(
          "__kmp_dispatch_init_algorithm: T#%%d new: schedule:%%d"
          " chunk:%%%s\n",
          traits_t<ST>::spec);
      KD_TRACE(10, (buff, gtid, schedule, chunk));
      __kmp_str_free(&buff);
    }
#endif

    pr->u.p.parm1 = chunk;
  }
  KMP_ASSERT2((kmp_sch_lower < schedule && schedule < kmp_sch_upper),
              "unknown scheduling type");
  if (__kmp_env_consistency_check) {
    if (st == 0) {
      __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited,
                            (pr->flags.ordered ? ct_pdo_ordered : ct_pdo), loc);
    }
  }

  // compute the trip count
    // negative stride: cast to unsigned so the division is done unsigned
    // regardless of the result type
    tc = (UT)(lb - ub) / (-st) + 1;

    // positive stride
    tc = (UT)(ub - lb) / st + 1;
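  // Illustrative sketch (not part of the runtime): the same trip-count
  // arithmetic for a generic loop (lb, ub, st), assuming a non-zero stride
  // and a non-empty iteration range. E.g. lb=0, ub=9, st=2 gives
  // (9 - 0) / 2 + 1 = 5 iterations; lb=10, ub=1, st=-3 gives
  // (10 - 1) / 3 + 1 = 4 iterations.
  //
  //   unsigned long long trip_count(long long lb, long long ub, long long st) {
  //     if (st > 0)
  //       return (unsigned long long)(ub - lb) / st + 1;
  //     return (unsigned long long)(lb - ub) / (-st) + 1;
  //   }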
#if KMP_STATS_ENABLED
  if (KMP_MASTER_GTID(gtid)) {
    KMP_COUNT_VALUE(OMP_loop_dynamic_total_iterations, tc);
  }
#endif

  pr->u.p.last_upper = ub + st;

  if (pr->flags.ordered) {
    pr->ordered_bumped = 0;
    pr->u.p.ordered_lower = 1;
    pr->u.p.ordered_upper = 0;
  }
#if (KMP_STATIC_STEAL_ENABLED)
  case kmp_sch_static_steal: {
    T ntc, init;

    KD_TRACE(
        100,
        ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_steal case\n",
         gtid));

    ntc = (tc % chunk ? 1 : 0) + tc / chunk;
    if (nproc > 1 && ntc >= nproc) {
      T id = tid;
      T small_chunk, extras;

      small_chunk = ntc / nproc;
      extras = ntc % nproc;

      init = id * small_chunk + (id < extras ? id : extras);
      pr->u.p.count = init;
      pr->u.p.ub = init + small_chunk + (id < extras ? 1 : 0);

      // parm3 is the number of times to attempt stealing; parm4 remembers
      // the neighbour tid to try first.
      pr->u.p.parm3 = KMP_MIN(small_chunk + extras, nproc);
      pr->u.p.parm4 = (id + 1) % nproc;
      if (traits_t<T>::type_size > 4) {
        // Use a dynamically allocated per-thread lock; it is freed in
        // __kmp_dispatch_next when the loop is finished.
        KMP_DEBUG_ASSERT(pr->u.p.th_steal_lock == NULL);
        pr->u.p.th_steal_lock =
            (kmp_lock_t *)__kmp_allocate(sizeof(kmp_lock_t));
        __kmp_init_lock(pr->u.p.th_steal_lock);
      }
    } else {
      /* too few chunks: switching to kmp_sch_dynamic_chunked */
      schedule = kmp_sch_dynamic_chunked;
      KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d switching to "
                     "kmp_sch_dynamic_chunked\n",
                     gtid));
      if (pr->u.p.parm1 <= 0)
        pr->u.p.parm1 = KMP_DEFAULT_CHUNK;
    }
  } break;
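  // Illustrative arithmetic for the initial split above (not part of the
  // runtime): with tc = 100 iterations, chunk = 3 and nproc = 4 there are
  // ntc = 34 chunks; small_chunk = 8, extras = 2, so threads 0..3 start with
  // chunk ranges [0,9), [9,18), [18,26), [26,34) (9, 9, 8 and 8 chunks),
  // and parm4 points each thread at its right neighbour as the first
  // stealing victim.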
  case kmp_sch_static_balanced: {
    T id = tid;
    T init, limit;

    KD_TRACE(
        100,
        ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_balanced case\n",
         gtid));

      pr->u.p.parm1 = (id == tc - 1); /* parm1 stores *plastiter */

      pr->u.p.parm1 = FALSE;

      T small_chunk = tc / nproc;
      T extras = tc % nproc;
      init = id * small_chunk + (id < extras ? id : extras);
      limit = init + small_chunk - (id < extras ? 0 : 1);
      pr->u.p.parm1 = (id == nproc - 1);

      pr->u.p.parm1 = TRUE;

      pr->u.p.parm1 = FALSE;

    // Calculate the chunk for the metadata report
    if (itt_need_metadata_reporting)
      *cur_chunk = limit - init + 1;

    if (st == 1) {
      pr->u.p.lb = lb + init;
      pr->u.p.ub = lb + limit;
    } else {
      // calculated upper bound, "ub" is the user-defined upper bound
      T ub_tmp = lb + limit * st;
      pr->u.p.lb = lb + init * st;
      // adjust the upper bound to "ub" if needed, so lastprivate matches it
      if (st > 0) {
        pr->u.p.ub = (ub_tmp + st > ub ? ub : ub_tmp);
      } else {
        pr->u.p.ub = (ub_tmp + st < ub ? ub : ub_tmp);
      }
    }
    if (pr->flags.ordered) {
      pr->u.p.ordered_lower = init;
      pr->u.p.ordered_upper = limit;
    }
  } break;
  case kmp_sch_static_balanced_chunked: {
    // similar to balanced, but the chunk is adjusted to the simd width
    T nth = nproc;
    KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d runtime(simd:static)"
                   " -> falling-through to static_greedy\n",
                   gtid));
    schedule = kmp_sch_static_greedy;
    if (nth > 1)
      pr->u.p.parm1 = ((tc + nth - 1) / nth + chunk - 1) & ~(chunk - 1);
    else
      pr->u.p.parm1 = tc;
  } break;
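    // Illustrative arithmetic (assuming chunk is the SIMD width, a power of
    // two): tc = 100, nth = 4, chunk = 8 gives
    //   per_thread = (100 + 3) / 4 = 25
    //   parm1 = (25 + 7) & ~7 = 32
    // i.e. the per-thread block is rounded up to the next multiple of the
    // SIMD width.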
  case kmp_sch_guided_iterative_chunked: {
    KD_TRACE(
        100,
        ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_guided_iterative_chunked"
         " case\n",
         gtid));

      if ((2L * chunk + 1) * nproc >= tc) {
        /* chunk size too large, switch to dynamic */
        schedule = kmp_sch_dynamic_chunked;
      } else {
        // when the remaining iterations drop below parm2, switch to dynamic
        pr->u.p.parm2 = guided_int_param * nproc * (chunk + 1);
        *(double *)&pr->u.p.parm3 =
            guided_flt_param / nproc; // may occupy parm3 and parm4
      }
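      // Illustrative sketch of how parm3 is used later in
      // __kmp_dispatch_next_algorithm: each grab takes roughly
      //   next_chunk = remaining * (guided_flt_param / nproc)
      // e.g. if guided_flt_param were 0.5 with nproc = 4 and 1000 iterations
      // left, the next chunk would be about 1000 * 0.125 = 125 iterations,
      // shrinking geometrically as the loop drains.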
      KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d falling-through to "
                     "kmp_sch_static_greedy\n",
                     gtid));
      schedule = kmp_sch_static_greedy;
      /* single thread: guided is meaningless, use static_greedy */
      KD_TRACE(
          100,
          ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_greedy case\n",
           gtid));
  case kmp_sch_guided_analytical_chunked: {
    KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d "
                   "kmp_sch_guided_analytical_chunked case\n",
                   gtid));

      if ((2L * chunk + 1) * nproc >= tc) {
        /* chunk size too large, switch to dynamic */
        schedule = kmp_sch_dynamic_chunked;
      } else {
#if KMP_USE_X87CONTROL
        // On IA-32 the x87 control word must be switched to 64-bit precision
        // for the long double computation below; it is restored afterwards.
        unsigned int oldFpcw = _control87(0, 0);
        _control87(_PC_64, _MCW_PC);
#endif
        /* value used for comparison in the solver for the cross-over point */
        long double target = ((long double)chunk * 2 + 1) * nproc / tc;

        /* commonly used term: (2 nproc - 1)/(2 nproc) */
        x = (long double)1.0 - (long double)0.5 / nproc;

        ptrdiff_t natural_alignment =
            (ptrdiff_t)&t.b - (ptrdiff_t)&t - (ptrdiff_t)1;
        KMP_DEBUG_ASSERT(
            (((ptrdiff_t)&pr->u.p.parm3) & (natural_alignment)) == 0);

        /* save the term in the thread-private dispatch structure */
        *(DBL *)&pr->u.p.parm3 = x;

        /* exponential search for an upper bound on the cross-over point */
          p = __kmp_pow<UT>(x, right);
        } while (p > target && right < (1 << 27));

        /* bisection to refine the cross-over point */
        while (left + 1 < right) {
          mid = (left + right) / 2;
          if (__kmp_pow<UT>(x, mid) > target) {

        /* assert sanity of the computed crossover point */
        KMP_ASSERT(cross && __kmp_pow<UT>(x, cross - 1) > target &&
                   __kmp_pow<UT>(x, cross) <= target);

        /* save the crossover point in the thread-private dispatch structure */
        pr->u.p.parm2 = cross;
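        // Illustration (not part of the runtime): parm2 ("cross") is the
        // smallest chunk index at which the analytically guided chunk size
        // falls below the user chunk, i.e. the smallest c with
        //   x^c <= target,  where x = 1 - 1/(2*nproc).
        // The exponential search brackets it and the bisection narrows it,
        // e.g. for nproc = 8 (x = 0.9375) and target = 0.1, cross ends up
        // at 36 because 0.9375^35 ~ 0.104 > 0.1 and 0.9375^36 ~ 0.098 <= 0.1.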
#if ((KMP_OS_LINUX || KMP_OS_WINDOWS) && KMP_ARCH_X86) && (!defined(KMP_I8))
        /* IA-32: use the value saved in parm3 to work around x87 rounding */
#define GUIDED_ANALYTICAL_WORKAROUND (*(DBL *)&pr->u.p.parm3)
#else
#define GUIDED_ANALYTICAL_WORKAROUND (x)
#endif
        /* dynamic-style scheduling offset */
        pr->u.p.count = tc - __kmp_dispatch_guided_remaining(
                                 tc, GUIDED_ANALYTICAL_WORKAROUND, cross) -
                        cross * chunk;
#if KMP_USE_X87CONTROL
        // restore the x87 control word
        _control87(oldFpcw, _MCW_PC);
#endif
      }
    } else {
      KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d falling-through to "
                     "kmp_sch_static_greedy\n",
                     gtid));
      schedule = kmp_sch_static_greedy;
    }
  case kmp_sch_static_greedy:
    KD_TRACE(
        100,
        ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_greedy case\n",
         gtid));
    pr->u.p.parm1 = (nproc > 1) ? (tc + nproc - 1) / nproc : tc;
    break;
  case kmp_sch_static_chunked:
  case kmp_sch_dynamic_chunked:
    if (pr->u.p.parm1 <= 0) {
      pr->u.p.parm1 = KMP_DEFAULT_CHUNK;
    }
    KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d "
                   "kmp_sch_static_chunked/kmp_sch_dynamic_chunked cases\n",
                   gtid));
    break;
  case kmp_sch_trapezoidal: {
    /* TSS: trapezoid self-scheduling, minimum chunk size = parm1 */
    T parm1, parm2, parm3, parm4;
    KD_TRACE(100,
             ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_trapezoidal case\n",
              gtid));

    parm1 = chunk; // L: size of the last (smallest) chunk

    /* F: size of the first chunk */
    parm2 = (tc / (2 * nproc));

    /* make sure the last chunk is not larger than the first one */
    if (parm1 < 1) {
      parm1 = 1;
    } else if (parm1 > parm2) {
      parm1 = parm2;
    }

    /* N: number of chunks */
    parm3 = (parm2 + parm1);
    parm3 = (2 * tc + parm3 - 1) / parm3;
    if (parm3 < 2) {
      parm3 = 2;
    }

    /* sigma: decrement of the chunk size per chunk */
    parm4 = (parm3 - 1);
    parm4 = (parm2 - parm1) / parm4;

    pr->u.p.parm1 = parm1;
    pr->u.p.parm2 = parm2;
    pr->u.p.parm3 = parm3;
    pr->u.p.parm4 = parm4;
  } break;
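    // Worked example (illustrative): tc = 1000, nproc = 4, chunk = 1 gives
    // parm2 = 1000 / 8 = 125 (first chunk), parm1 = 1 (last chunk),
    // parm3 = (2000 + 125) / 126 = 16 chunks, parm4 = (125 - 1) / 15 = 8,
    // so successive chunks are 125, 117, 109, ... shrinking by 8 toward parm1.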
  default: {
    __kmp_fatal(KMP_MSG(UnknownSchedTypeDetected), // Primary message
                KMP_HNT(GetNewerLibrary), // Hint
                __kmp_msg_null // Variadic argument list terminator
    );
  } break;
  } // switch
  pr->schedule = schedule;
}
#if KMP_USE_HIER_SCHED
template <typename T>
inline void __kmp_dispatch_init_hier_runtime(ident_t *loc, T lb, T ub,
                                             typename traits_t<T>::signed_t st);
template <>
inline void
__kmp_dispatch_init_hier_runtime<kmp_int32>(ident_t *loc, kmp_int32 lb,
                                            kmp_int32 ub, kmp_int32 st) {
  __kmp_dispatch_init_hierarchy<kmp_int32>(
      loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
      __kmp_hier_scheds.scheds, __kmp_hier_scheds.small_chunks, lb, ub, st);
}
template <>
inline void
__kmp_dispatch_init_hier_runtime<kmp_uint32>(ident_t *loc, kmp_uint32 lb,
                                             kmp_uint32 ub, kmp_int32 st) {
  __kmp_dispatch_init_hierarchy<kmp_uint32>(
      loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
      __kmp_hier_scheds.scheds, __kmp_hier_scheds.small_chunks, lb, ub, st);
}
template <>
inline void
__kmp_dispatch_init_hier_runtime<kmp_int64>(ident_t *loc, kmp_int64 lb,
                                            kmp_int64 ub, kmp_int64 st) {
  __kmp_dispatch_init_hierarchy<kmp_int64>(
      loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
      __kmp_hier_scheds.scheds, __kmp_hier_scheds.large_chunks, lb, ub, st);
}
template <>
inline void
__kmp_dispatch_init_hier_runtime<kmp_uint64>(ident_t *loc, kmp_uint64 lb,
                                             kmp_uint64 ub, kmp_int64 st) {
  __kmp_dispatch_init_hierarchy<kmp_uint64>(
      loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
      __kmp_hier_scheds.scheds, __kmp_hier_scheds.large_chunks, lb, ub, st);
}

// free all the hierarchy scheduling memory associated with the team
void __kmp_dispatch_free_hierarchies(kmp_team_t *team) {
  int num_disp_buff = team->t.t_max_nproc > 1 ? __kmp_dispatch_num_buffers : 2;
  for (int i = 0; i < num_disp_buff; ++i) {
    // the type does not matter here, so use kmp_int32
    auto sh =
        reinterpret_cast<dispatch_shared_info_template<kmp_int32> volatile *>(
            &team->t.t_disp_buffer[i]);
    if (sh->hier) {
      sh->hier->deallocate();
      __kmp_free(sh->hier);
    }
  }
}
#endif
template <typename T>
static void
__kmp_dispatch_init(ident_t *loc, int gtid, enum sched_type schedule, T lb,
                    T ub, typename traits_t<T>::signed_t st,
                    typename traits_t<T>::signed_t chunk, int push_ws) {
  typedef typename traits_t<T>::unsigned_t UT;

  int active;
  kmp_info_t *th;
  kmp_team_t *team;
  kmp_uint32 my_buffer_index;
  dispatch_private_info_template<T> *pr;
  dispatch_shared_info_template<T> volatile *sh;

  KMP_BUILD_ASSERT(sizeof(dispatch_private_info_template<T>) ==
                   sizeof(dispatch_private_info));
  KMP_BUILD_ASSERT(sizeof(dispatch_shared_info_template<UT>) ==
                   sizeof(dispatch_shared_info));
  __kmp_assert_valid_gtid(gtid);

  if (!TCR_4(__kmp_init_parallel))
    __kmp_parallel_initialize();

  __kmp_resume_if_soft_paused();

#if INCLUDE_SSC_MARKS
  SSC_MARK_DISPATCH_INIT();
#endif
#ifdef KMP_DEBUG
  typedef typename traits_t<T>::signed_t ST;
  {
    char *buff;
    // create format specifiers before the debug output
    buff = __kmp_str_format("__kmp_dispatch_init: T#%%d called: schedule:%%d "
                            "chunk:%%%s lb:%%%s ub:%%%s st:%%%s\n",
                            traits_t<ST>::spec, traits_t<T>::spec,
                            traits_t<T>::spec, traits_t<ST>::spec);
    KD_TRACE(10, (buff, gtid, schedule, chunk, lb, ub, st));
    __kmp_str_free(&buff);
  }
#endif
  /* setup data */
  th = __kmp_threads[gtid];
  team = th->th.th_team;
  active = !team->t.t_serialized;
  th->th.th_ident = loc;

  if (schedule == __kmp_static) {
    KMP_COUNT_BLOCK(OMP_LOOP_STATIC);
  } else {
    KMP_COUNT_BLOCK(OMP_LOOP_DYNAMIC);
  }

#if KMP_USE_HIER_SCHED
  // Hierarchical scheduling does not work with ordered loops, so if ordered
  // is detected, revert back to threaded scheduling.
  bool ordered;
  enum sched_type my_sched = schedule;
  my_buffer_index = th->th.th_dispatch->th_disp_index;
  pr = reinterpret_cast<dispatch_private_info_template<T> *>(
      &th->th.th_dispatch
           ->th_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
  my_sched = SCHEDULE_WITHOUT_MODIFIERS(my_sched);
  ordered = (kmp_ord_lower & my_sched);
  if (pr->flags.use_hier) {
    if (ordered) {
      KD_TRACE(100, ("__kmp_dispatch_init: T#%d ordered loop detected. "
                     "Disabling hierarchical scheduling.\n",
                     gtid));
      pr->flags.use_hier = FALSE;
    }
  }
  if (schedule == kmp_sch_runtime && __kmp_hier_scheds.size > 0) {
    // Don't use hierarchical scheduling for ordered parallel loops, and don't
    // use the runtime hierarchy if one was specified in the program
    if (!ordered && !pr->flags.use_hier)
      __kmp_dispatch_init_hier_runtime<T>(loc, lb, ub, st);
  }
#endif // KMP_USE_HIER_SCHED
  kmp_uint64 cur_chunk = chunk;
  int itt_need_metadata_reporting =
      __itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
      KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL &&
      team->t.t_active_level == 1;
  if (!active) {
    pr = reinterpret_cast<dispatch_private_info_template<T> *>(
        th->th.th_dispatch->th_disp_buffer); /* top of the stack */
  } else {
    KMP_DEBUG_ASSERT(th->th.th_dispatch ==
                     &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);

    my_buffer_index = th->th.th_dispatch->th_disp_index++;

    pr = reinterpret_cast<dispatch_private_info_template<T> *>(
        &th->th.th_dispatch
             ->th_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
    sh = reinterpret_cast<dispatch_shared_info_template<T> volatile *>(
        &team->t.t_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
    KD_TRACE(10, ("__kmp_dispatch_init: T#%d my_buffer_index:%d\n", gtid,
                  my_buffer_index));
  }

  __kmp_dispatch_init_algorithm(loc, gtid, pr, schedule, lb, ub, st,
                                &cur_chunk,
                                chunk, (T)th->th.th_team_nproc,
                                (T)th->th.th_info.ds.ds_tid);
  if (active) {
    if (pr->flags.ordered == 0) {
      th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo_error;
      th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo_error;
    } else {
      th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo<UT>;
      th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo<UT>;
    }
  }

  if (active) {
    /* The buffer is free to use once sh->buffer_index reaches
       my_buffer_index. */
    KD_TRACE(100, ("__kmp_dispatch_init: T#%d before wait: my_buffer_index:%d "
                   "sh->buffer_index:%d\n",
                   gtid, my_buffer_index, sh->buffer_index));
    __kmp_wait<kmp_uint32>(&sh->buffer_index, my_buffer_index,
                           __kmp_eq<kmp_uint32> USE_ITT_BUILD_ARG(NULL));
    // Note: KMP_WAIT() cannot be used here: buffer_index and my_buffer_index
    // are *always* 32-bit integers.
    KD_TRACE(100, ("__kmp_dispatch_init: T#%d after wait: my_buffer_index:%d "
                   "sh->buffer_index:%d\n",
                   gtid, my_buffer_index, sh->buffer_index));
    th->th.th_dispatch->th_dispatch_pr_current = (dispatch_private_info_t *)pr;
    th->th.th_dispatch->th_dispatch_sh_current =
        CCAST(dispatch_shared_info_t *, (volatile dispatch_shared_info_t *)sh);

    if (pr->flags.ordered) {
      __kmp_itt_ordered_init(gtid);
    }
    // Report loop metadata (only by the master of an active team at level 1)
    if (itt_need_metadata_reporting) {
      kmp_uint64 schedtype = 0;
      switch (schedule) {
      case kmp_sch_static_chunked:
      case kmp_sch_static_balanced: // chunk was calculated above
        break;
      case kmp_sch_static_greedy:
        cur_chunk = pr->u.p.parm1;
        break;
      case kmp_sch_dynamic_chunked:
        schedtype = 1;
        break;
      case kmp_sch_guided_iterative_chunked:
      case kmp_sch_guided_analytical_chunked:
        schedtype = 2;
        break;
      default:
        schedtype = 3;
        break;
      }
      __kmp_itt_metadata_loop(loc, schedtype, pr->u.p.tc, cur_chunk);
    }
#if KMP_USE_HIER_SCHED
    if (pr->flags.use_hier) {
      pr->u.p.ub = pr->u.p.lb = pr->u.p.st = pr->u.p.tc = 0;
    }
#endif // KMP_USE_HIER_SCHED
#ifdef KMP_DEBUG
  {
    char *buff;
    // create format specifiers before the debug output
    buff = __kmp_str_format(
        "__kmp_dispatch_init: T#%%d returning: schedule:%%d ordered:%%%s "
        "lb:%%%s ub:%%%s"
        " st:%%%s tc:%%%s count:%%%s\n\tordered_lower:%%%s ordered_upper:%%%s"
        " parm1:%%%s parm2:%%%s parm3:%%%s parm4:%%%s\n",
        traits_t<UT>::spec, traits_t<T>::spec, traits_t<T>::spec,
        traits_t<ST>::spec, traits_t<UT>::spec, traits_t<UT>::spec,
        traits_t<UT>::spec, traits_t<UT>::spec, traits_t<T>::spec,
        traits_t<T>::spec, traits_t<T>::spec, traits_t<T>::spec);
    KD_TRACE(10, (buff, gtid, pr->schedule, pr->flags.ordered, pr->u.p.lb,
                  pr->u.p.ub, pr->u.p.st, pr->u.p.tc, pr->u.p.count,
                  pr->u.p.ordered_lower, pr->u.p.ordered_upper, pr->u.p.parm1,
                  pr->u.p.parm2, pr->u.p.parm3, pr->u.p.parm4));
    __kmp_str_free(&buff);
  }
#endif
#if (KMP_STATIC_STEAL_ENABLED)
  // A per-loop counter; other threads inspect it when searching for a victim,
  // so bump it to signal that stealing from this thread is now allowed.
  if (pr->schedule == kmp_sch_static_steal) {
    volatile T *p = &pr->u.p.static_steal_counter;
    *p = *p + 1;
  }
#endif // ( KMP_STATIC_STEAL_ENABLED )

#if OMPT_SUPPORT && OMPT_OPTIONAL
  if (ompt_enabled.ompt_callback_work) {
    ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
    ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
    ompt_callbacks.ompt_callback(ompt_callback_work)(
        ompt_work_loop, ompt_scope_begin, &(team_info->parallel_data),
        &(task_info->task_data), pr->u.p.tc, OMPT_LOAD_RETURN_ADDRESS(gtid));
  }
#endif
  KMP_PUSH_PARTITIONED_TIMER(OMP_loop_dynamic);
}
/* For ordered loops, either __kmp_dispatch_finish() should be called after
   every iteration, or __kmp_dispatch_finish_chunk() should be called after
   every chunk of iterations, so that the ordered iteration counters are
   advanced and the next thread can proceed. */
template <typename UT>
static void __kmp_dispatch_finish(int gtid, ident_t *loc) {
  typedef typename traits_t<UT>::signed_t ST;
  __kmp_assert_valid_gtid(gtid);
  kmp_info_t *th = __kmp_threads[gtid];

  KD_TRACE(100, ("__kmp_dispatch_finish: T#%d called\n", gtid));
  if (!th->th.th_team->t.t_serialized) {

    dispatch_private_info_template<UT> *pr =
        reinterpret_cast<dispatch_private_info_template<UT> *>(
            th->th.th_dispatch->th_dispatch_pr_current);
    dispatch_shared_info_template<UT> volatile *sh =
        reinterpret_cast<dispatch_shared_info_template<UT> volatile *>(
            th->th.th_dispatch->th_dispatch_sh_current);
    KMP_DEBUG_ASSERT(pr);
    KMP_DEBUG_ASSERT(sh);
    KMP_DEBUG_ASSERT(th->th.th_dispatch ==
                     &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);

    if (pr->ordered_bumped) {
      KD_TRACE(
          1000,
          ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
           gtid));
      pr->ordered_bumped = 0;
    } else {
      UT lower = pr->u.p.ordered_lower;

#ifdef KMP_DEBUG
      {
        char *buff;
        // create format specifiers before the debug output
        buff = __kmp_str_format("__kmp_dispatch_finish: T#%%d before wait: "
                                "ordered_iteration:%%%s lower:%%%s\n",
                                traits_t<UT>::spec, traits_t<UT>::spec);
        KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
        __kmp_str_free(&buff);
      }
#endif

      __kmp_wait<UT>(&sh->u.s.ordered_iteration, lower,
                     __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));

#ifdef KMP_DEBUG
      {
        char *buff;
        // create format specifiers before the debug output
        buff = __kmp_str_format("__kmp_dispatch_finish: T#%%d after wait: "
                                "ordered_iteration:%%%s lower:%%%s\n",
                                traits_t<UT>::spec, traits_t<UT>::spec);
        KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
        __kmp_str_free(&buff);
      }
#endif

      test_then_inc<ST>((volatile ST *)&sh->u.s.ordered_iteration);
    }
  }
  KD_TRACE(100, ("__kmp_dispatch_finish: T#%d returned\n", gtid));
}
#ifdef KMP_GOMP_COMPAT

template <typename UT>
static void __kmp_dispatch_finish_chunk(int gtid, ident_t *loc) {
  typedef typename traits_t<UT>::signed_t ST;
  __kmp_assert_valid_gtid(gtid);
  kmp_info_t *th = __kmp_threads[gtid];

  KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d called\n", gtid));
  if (!th->th.th_team->t.t_serialized) {

    dispatch_private_info_template<UT> *pr =
        reinterpret_cast<dispatch_private_info_template<UT> *>(
            th->th.th_dispatch->th_dispatch_pr_current);
    dispatch_shared_info_template<UT> volatile *sh =
        reinterpret_cast<dispatch_shared_info_template<UT> volatile *>(
            th->th.th_dispatch->th_dispatch_sh_current);
    KMP_DEBUG_ASSERT(pr);
    KMP_DEBUG_ASSERT(sh);
    KMP_DEBUG_ASSERT(th->th.th_dispatch ==
                     &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);

    UT lower = pr->u.p.ordered_lower;
    UT upper = pr->u.p.ordered_upper;
    UT inc = upper - lower + 1;

    if (pr->ordered_bumped == inc) {
      KD_TRACE(
          1000,
          ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
           gtid));
      pr->ordered_bumped = 0;
    } else {
      inc -= pr->ordered_bumped;

#ifdef KMP_DEBUG
      {
        char *buff;
        // create format specifiers before the debug output
        buff = __kmp_str_format(
            "__kmp_dispatch_finish_chunk: T#%%d before wait: "
            "ordered_iteration:%%%s lower:%%%s upper:%%%s\n",
            traits_t<UT>::spec, traits_t<UT>::spec, traits_t<UT>::spec);
        KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower, upper));
        __kmp_str_free(&buff);
      }
#endif

      __kmp_wait<UT>(&sh->u.s.ordered_iteration, lower,
                     __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));

      KD_TRACE(1000, ("__kmp_dispatch_finish_chunk: T#%d resetting "
                      "ordered_bumped to zero\n",
                      gtid));
      pr->ordered_bumped = 0;

#ifdef KMP_DEBUG
      {
        char *buff;
        // create format specifiers before the debug output
        buff = __kmp_str_format(
            "__kmp_dispatch_finish_chunk: T#%%d after wait: "
            "ordered_iteration:%%%s inc:%%%s lower:%%%s upper:%%%s\n",
            traits_t<UT>::spec, traits_t<UT>::spec, traits_t<UT>::spec,
            traits_t<UT>::spec);
        KD_TRACE(1000,
                 (buff, gtid, sh->u.s.ordered_iteration, inc, lower, upper));
        __kmp_str_free(&buff);
      }
#endif

      test_then_add<ST>((volatile ST *)&sh->u.s.ordered_iteration, inc);
    }
  }
  KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d returned\n", gtid));
}

#endif /* KMP_GOMP_COMPAT */
template <typename T>
int __kmp_dispatch_next_algorithm(int gtid,
                                  dispatch_private_info_template<T> *pr,
                                  dispatch_shared_info_template<T> volatile *sh,
                                  kmp_int32 *p_last, T *p_lb, T *p_ub,
                                  typename traits_t<T>::signed_t *p_st, T nproc,
                                  T tid) {
  typedef typename traits_t<T>::unsigned_t UT;
  typedef typename traits_t<T>::signed_t ST;
  typedef typename traits_t<T>::floating_t DBL;
  int status = 0;
  kmp_int32 last = 0;
  T start;
  ST incr;
  UT limit, trip, init;
  kmp_info_t *th = __kmp_threads[gtid];
  kmp_team_t *team = th->th.th_team;

  KMP_DEBUG_ASSERT(th->th.th_dispatch ==
                   &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
  KMP_DEBUG_ASSERT(pr);
  KMP_DEBUG_ASSERT(sh);
  KMP_DEBUG_ASSERT(tid >= 0 && tid < nproc);
#ifdef KMP_DEBUG
  {
    char *buff;
    // create format specifiers before the debug output
    buff =
        __kmp_str_format("__kmp_dispatch_next_algorithm: T#%%d called pr:%%p "
                         "sh:%%p nproc:%%%s tid:%%%s\n",
                         traits_t<T>::spec, traits_t<T>::spec);
    KD_TRACE(10, (buff, gtid, pr, sh, nproc, tid));
    __kmp_str_free(&buff);
  }
#endif

  // zero trip count
  if (pr->u.p.tc == 0) {
    KD_TRACE(10,
             ("__kmp_dispatch_next_algorithm: T#%d early exit trip count is "
              "zero status:%d\n",
              gtid, status));
    return 0;
  }
  switch (pr->schedule) {
#if (KMP_STATIC_STEAL_ENABLED)
  case kmp_sch_static_steal: {
    T chunk = pr->u.p.parm1;

    KD_TRACE(100,
             ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_static_steal case\n",
              gtid));

    trip = pr->u.p.tc - 1;

    if (traits_t<T>::type_size > 4) {
      // use a lock for 8-byte induction variables
      kmp_lock_t *lck = pr->u.p.th_steal_lock;
      KMP_DEBUG_ASSERT(lck != NULL);
      if (pr->u.p.count < (UT)pr->u.p.ub) {
        __kmp_acquire_lock(lck, gtid);
        // try to get own chunk of iterations
        init = (pr->u.p.count)++;
        status = (init < (UT)pr->u.p.ub);
        __kmp_release_lock(lck, gtid);
      } else {
        status = 0; // no own chunks
      }
      if (!status) { // try to steal
        kmp_info_t **other_threads = team->t.t_threads;
        int while_limit = pr->u.p.parm3;
        int while_index = 0;
        T id = pr->u.p.static_steal_counter; // loop id
        int idx = (th->th.th_dispatch->th_disp_index - 1) %
                  __kmp_dispatch_num_buffers; // current loop index
        // note: the victim thread can potentially execute another loop
        while ((!status) && (while_limit != ++while_index)) {
          dispatch_private_info_template<T> *victim;
          T remaining;
          T victimIdx = pr->u.p.parm4; // start the search from this thread
          T oldVictimIdx = victimIdx ? victimIdx - 1 : nproc - 1;
          victim = reinterpret_cast<dispatch_private_info_template<T> *>(
              &other_threads[victimIdx]->th.th_dispatch->th_disp_buffer[idx]);
          KMP_DEBUG_ASSERT(victim);
          while ((victim == pr || id != victim->u.p.static_steal_counter) &&
                 oldVictimIdx != victimIdx) {
            victimIdx = (victimIdx + 1) % nproc;
            victim = reinterpret_cast<dispatch_private_info_template<T> *>(
                &other_threads[victimIdx]->th.th_dispatch->th_disp_buffer[idx]);
            KMP_DEBUG_ASSERT(victim);
          }
          if (victim == pr || id != victim->u.p.static_steal_counter) {
            continue; // try once more (nproc attempts in total)
          }
          if (victim->u.p.count + 2 > (UT)victim->u.p.ub) {
            pr->u.p.parm4 = (victimIdx + 1) % nproc; // shift start tid
            continue; // not enough chunks to steal, go to next victim
          }
          lck = victim->u.p.th_steal_lock;
          KMP_ASSERT(lck != NULL);
          __kmp_acquire_lock(lck, gtid);
          limit = victim->u.p.ub; // keep the initial ub
          if (victim->u.p.count >= limit ||
              (remaining = limit - victim->u.p.count) < 2) {
            __kmp_release_lock(lck, gtid);
            pr->u.p.parm4 = (victimIdx + 1) % nproc; // shift start tid
            continue; // not enough chunks to steal
          }
          // steal 1/4 of the remaining chunks, or 1 of 2-3 remaining
          if (remaining > 3) {
            KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen, remaining >> 2);
            init = (victim->u.p.ub -= (remaining >> 2));
          } else {
            KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen, 1);
            init = (victim->u.p.ub -= 1);
          }
          __kmp_release_lock(lck, gtid);

          KMP_DEBUG_ASSERT(init + 1 <= limit);
          pr->u.p.parm4 = victimIdx; // remember the victim to steal from
          status = 1;
          while_index = 0;
          // now update own count and ub with the stolen range
          __kmp_acquire_lock(pr->u.p.th_steal_lock, gtid);
          pr->u.p.count = init + 1;
          pr->u.p.ub = limit;
          __kmp_release_lock(pr->u.p.th_steal_lock, gtid);
        } // while (search for victim)
      } // if (try to find victim and steal)
    } else {
      // 4-byte induction variable: no lock, claim a chunk with a packed
      // 64-bit CAS over the {count, ub} pair.
      union_i4 vold, vnew;
      vold.b = *(volatile kmp_int64 *)(&pr->u.p.count);
      vnew = vold;
      vnew.p.count++;
      while (!KMP_COMPARE_AND_STORE_ACQ64(
          (volatile kmp_int64 *)&pr->u.p.count,
          *VOLATILE_CAST(kmp_int64 *) & vold.b,
          *VOLATILE_CAST(kmp_int64 *) & vnew.b)) {
        KMP_CPU_PAUSE();
        vold.b = *(volatile kmp_int64 *)(&pr->u.p.count);
        vnew = vold;
        vnew.p.count++;
      }
      vnew = vold;
      init = vnew.p.count;
      status = (init < (UT)vnew.p.ub);
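      // Technique note (illustrative): union_i4 overlays two 32-bit fields
      // {count, ub} on a single 64-bit word, so claiming a chunk (count++)
      // and observing the current upper bound happen in one atomic
      // compare-and-swap; a sketch of the layout, matching the field names
      // used here:
      //   union union_i4 { struct { kmp_int32 count; kmp_int32 ub; } p;
      //                    kmp_int64 b; };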
      if (!status) { // try to steal
        kmp_info_t **other_threads = team->t.t_threads;
        int while_limit = pr->u.p.parm3;
        int while_index = 0;
        T id = pr->u.p.static_steal_counter; // loop id
        int idx = (th->th.th_dispatch->th_disp_index - 1) %
                  __kmp_dispatch_num_buffers; // current loop index
        // note: the victim thread can potentially execute another loop
        while ((!status) && (while_limit != ++while_index)) {
          dispatch_private_info_template<T> *victim;
          union_i4 vold, vnew;
          kmp_int32 remaining;
          T victimIdx = pr->u.p.parm4;
          T oldVictimIdx = victimIdx ? victimIdx - 1 : nproc - 1;
          victim = reinterpret_cast<dispatch_private_info_template<T> *>(
              &other_threads[victimIdx]->th.th_dispatch->th_disp_buffer[idx]);
          KMP_DEBUG_ASSERT(victim);
          while ((victim == pr || id != victim->u.p.static_steal_counter) &&
                 oldVictimIdx != victimIdx) {
            victimIdx = (victimIdx + 1) % nproc;
            victim = reinterpret_cast<dispatch_private_info_template<T> *>(
                &other_threads[victimIdx]->th.th_dispatch->th_disp_buffer[idx]);
            KMP_DEBUG_ASSERT(victim);
          }
          if (victim == pr || id != victim->u.p.static_steal_counter) {
            continue; // try once more (nproc attempts in total)
          }
          pr->u.p.parm4 = victimIdx; // new victim found
          while (1) { // CAS loop if the victim has enough chunks to steal
            vold.b = *(volatile kmp_int64 *)(&victim->u.p.count);
            vnew = vold;

            KMP_DEBUG_ASSERT((vnew.p.ub - 1) * (UT)chunk <= trip);
            if (vnew.p.count >= (UT)vnew.p.ub ||
                (remaining = vnew.p.ub - vnew.p.count) < 2) {
              pr->u.p.parm4 = (victimIdx + 1) % nproc; // shift start victim id
              break; // not enough chunks to steal, go to next victim
            }
            if (remaining > 3) {
              vnew.p.ub -= (remaining >> 2); // steal 1/4 of the remaining
            } else {
              vnew.p.ub -= 1; // steal 1 chunk of 2 or 3 remaining
            }
            KMP_DEBUG_ASSERT((vnew.p.ub - 1) * (UT)chunk <= trip);
            if (KMP_COMPARE_AND_STORE_ACQ64(
                    (volatile kmp_int64 *)&victim->u.p.count,
                    *VOLATILE_CAST(kmp_int64 *) & vold.b,
                    *VOLATILE_CAST(kmp_int64 *) & vnew.b)) {
              // stealing succeeded
              KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen,
                                        vold.p.ub - vnew.p.ub);
              status = 1;
              while_index = 0;
              // now update own count and ub
              init = vnew.p.ub;
              vold.p.count = init + 1;
#if KMP_ARCH_X86
              KMP_XCHG_FIXED64((volatile kmp_int64 *)(&pr->u.p.count), vold.b);
#else
              *(volatile kmp_int64 *)(&pr->u.p.count) = vold.b;
#endif
              break;
            } // if (check CAS result)
            KMP_CPU_PAUSE(); // CAS failed, repeat the attempt
          } // while (try to steal from a particular victim)
        } // while (search for victim)
      } // if (try to find victim and steal)
    } // if (type_size)

    if (!status) {
      *p_lb = 0;
      *p_ub = 0;
      if (p_st != NULL)
        *p_st = 0;
    } else {
      start = pr->u.p.parm2;
      init *= chunk;
      limit = chunk + init - 1;
      incr = pr->u.p.st;
      KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_chunks, 1);

      KMP_DEBUG_ASSERT(init <= trip);
      if ((last = (limit >= trip)) != 0)
        limit = trip;
      if (p_st != NULL)
        *p_st = incr;

      if (incr == 1) {
        *p_lb = start + init;
        *p_ub = start + limit;
      } else {
        *p_lb = start + init * incr;
        *p_ub = start + limit * incr;
      }

      if (pr->flags.ordered) {
        pr->u.p.ordered_lower = init;
        pr->u.p.ordered_upper = limit;
      }
    }
    break;
  } // case kmp_sch_static_steal
#endif // ( KMP_STATIC_STEAL_ENABLED )
  case kmp_sch_static_balanced: {
    KD_TRACE(
        100,
        ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_static_balanced case\n",
         gtid));
    /* check if the thread has any iterations left to do */
    if ((status = !pr->u.p.count) != 0) {
      pr->u.p.count = 1;
      last = pr->u.p.parm1;
    } else { /* no iterations to do */
      pr->u.p.lb = pr->u.p.ub + pr->u.p.st;
    }
  } break;
  case kmp_sch_static_greedy: /* original static_greedy code merged here */
  case kmp_sch_static_chunked: {
    T parm1;

    KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d "
                   "kmp_sch_static_[affinity|chunked] case\n",
                   gtid));
    parm1 = pr->u.p.parm1;

    trip = pr->u.p.tc - 1;
    init = parm1 * (pr->u.p.count + tid);

    if ((status = (init <= trip)) != 0) {
      start = pr->u.p.lb;
      incr = pr->u.p.st;
      limit = parm1 + init - 1;

      if ((last = (limit >= trip)) != 0)
        limit = trip;
      if (p_st != NULL)
        *p_st = incr;

      pr->u.p.count += nproc;

      if (incr == 1) {
        *p_lb = start + init;
        *p_ub = start + limit;
      } else {
        *p_lb = start + init * incr;
        *p_ub = start + limit * incr;
      }

      if (pr->flags.ordered) {
        pr->u.p.ordered_lower = init;
        pr->u.p.ordered_upper = limit;
      }
    }
  } break;
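  // Illustrative arithmetic for the round-robin indexing above: with
  // parm1 = 10 (chunk), nproc = 4 and tid = 1, successive calls see
  // count = 0, 4, 8, ... so this thread claims the iteration indexes
  // [10, 19], [50, 59], [90, 99], ... i.e. every nproc-th chunk, offset
  // by its tid.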
  case kmp_sch_dynamic_chunked: {
    T chunk = pr->u.p.parm1;

    KD_TRACE(
        100,
        ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_dynamic_chunked case\n",
         gtid));

    init = chunk * test_then_inc_acq<ST>((volatile ST *)&sh->u.s.iteration);
    trip = pr->u.p.tc - 1;

    if ((status = (init <= trip)) == 0) {
      *p_lb = 0;
      *p_ub = 0;
      if (p_st != NULL)
        *p_st = 0;
    } else {
      start = pr->u.p.lb;
      limit = chunk + init - 1;
      incr = pr->u.p.st;

      if ((last = (limit >= trip)) != 0)
        limit = trip;
      if (p_st != NULL)
        *p_st = incr;

      if (incr == 1) {
        *p_lb = start + init;
        *p_ub = start + limit;
      } else {
        *p_lb = start + init * incr;
        *p_ub = start + limit * incr;
      }

      if (pr->flags.ordered) {
        pr->u.p.ordered_lower = init;
        pr->u.p.ordered_upper = limit;
      }
    }
  } break;
  case kmp_sch_guided_iterative_chunked: {
    T chunkspec = pr->u.p.parm1;
    KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_guided_chunked "
                   "iterative case\n",
                   gtid));
    trip = pr->u.p.tc;
    // Start atomic part of the computation
    while (1) {
      ST remaining; // signed, because it can become < 0
      init = sh->u.s.iteration; // shared value
      remaining = trip - init;
      if (remaining <= 0) { // nothing to do, don't try the atomic op
        status = 0;
        break;
      }
      if ((T)remaining < pr->u.p.parm2) {
        // fall back to dynamic-style scheduling for the tail:
        // atomically increment the iterations, get the old value
        init = test_then_add<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
                                 (ST)chunkspec);
        remaining = trip - init;
        if (remaining <= 0) {
          status = 0; // all iterations got by other threads
        } else {
          // got some iterations to work on
          status = 1;
          if ((T)remaining > chunkspec) {
            limit = init + chunkspec - 1;
          } else {
            last = 1; // the last chunk
            limit = init + remaining - 1;
          }
        }
        break;
      }
      limit = init +
              (UT)(remaining * *(double *)&pr->u.p.parm3); // guided portion
      if (compare_and_swap<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
                               (ST)init, (ST)limit)) {
        // CAS was successful, chunk obtained
        status = 1;
        --limit;
        break;
      }
    } // while
    if (status != 0) {
      start = pr->u.p.lb;
      incr = pr->u.p.st;
      if (p_st != NULL)
        *p_st = incr;
      *p_lb = start + init * incr;
      *p_ub = start + limit * incr;
      if (pr->flags.ordered) {
        pr->u.p.ordered_lower = init;
        pr->u.p.ordered_upper = limit;
      }
    }
  } break;
  case kmp_sch_guided_simd: {
    // Same as guided_iterative_chunked, but the chunk is kept a multiple of
    // the simd width.
    T chunk = pr->u.p.parm1; // the simd width
    KD_TRACE(100,
             ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_guided_simd case\n",
              gtid));
    trip = pr->u.p.tc;
    // Start atomic part of the computation
    while (1) {
      ST remaining; // signed, because it can become < 0
      init = sh->u.s.iteration; // shared value
      remaining = trip - init;
      if (remaining <= 0) { // nothing to do, don't try the atomic op
        status = 0;
        break;
      }
      KMP_DEBUG_ASSERT(init % chunk == 0);
      if ((T)remaining < pr->u.p.parm2) {
        // fall back to dynamic-style scheduling for the tail
        init = test_then_add<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
                                 (ST)chunk);
        remaining = trip - init;
        if (remaining <= 0) {
          status = 0; // all iterations got by other threads
        } else {
          status = 1;
          if ((T)remaining > chunk) {
            limit = init + chunk - 1;
          } else {
            last = 1; // the last chunk
            limit = init + remaining - 1;
          }
        }
        break;
      }
      // guided portion, rounded up to a multiple of the simd width
      UT span = remaining * (*(double *)&pr->u.p.parm3);
      UT rem = span % chunk;
      if (rem)
        span += chunk - rem;
      limit = init + span;
      if (compare_and_swap<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
                               (ST)init, (ST)limit)) {
        // CAS was successful, chunk obtained
        status = 1;
        --limit;
        break;
      }
    } // while
    if (status != 0) {
      start = pr->u.p.lb;
      incr = pr->u.p.st;
      if (p_st != NULL)
        *p_st = incr;
      *p_lb = start + init * incr;
      *p_ub = start + limit * incr;
      if (pr->flags.ordered) {
        pr->u.p.ordered_lower = init;
        pr->u.p.ordered_upper = limit;
      }
    }
  } break;
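  // Note on the simd-aligned span computation above (illustrative): with
  // remaining = 1000, parm3 = 0.0625 and chunk (simd width) = 8, span is
  // (UT)(1000 * 0.0625) = 62, rem = 62 % 8 = 6, so span is bumped to 64 and
  // the grabbed range stays a multiple of the simd width.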
  case kmp_sch_guided_analytical_chunked: {
    T chunkspec = pr->u.p.parm1;
    UT chunkIdx;
#if KMP_USE_X87CONTROL
    /* for saving the original FPCW value on IA-32 Windows */
    unsigned int oldFpcw;
    unsigned int fpcwSet = 0;
#endif
    KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d "
                   "kmp_sch_guided_analytical_chunked case\n",
                   gtid));

    trip = pr->u.p.tc;

    KMP_DEBUG_ASSERT(nproc > 1);
    KMP_DEBUG_ASSERT((2UL * chunkspec + 1) * (UT)nproc < trip);

    while (1) { /* this while loop is a safeguard against unexpected zero
                   chunk sizes */
      chunkIdx = test_then_inc_acq<ST>((volatile ST *)&sh->u.s.iteration);
      if (chunkIdx >= (UT)pr->u.p.parm2) {
        --trip;
        /* use dynamic-style scheduling */
        init = chunkIdx * chunkspec + pr->u.p.count;
        /* need to verify init > 0 in case of overflow in the calculation */
        if ((status = (init > 0 && init <= trip)) != 0) {
          limit = init + chunkspec - 1;

          if ((last = (limit >= trip)) != 0)
            limit = trip;
        }
        break;
      } else {
        /* use exponential-style scheduling */
#if KMP_USE_X87CONTROL
        /* if not done already, save the FPCW and set 64-bit precision */
        if (!fpcwSet) {
          oldFpcw = _control87(0, 0);
          _control87(_PC_64, _MCW_PC);
          fpcwSet = 0x30000;
        }
#endif
        if (chunkIdx) {
          init = __kmp_dispatch_guided_remaining<T>(
              trip, *(DBL *)&pr->u.p.parm3, chunkIdx);
          KMP_DEBUG_ASSERT(init);
          init = trip - init;
        } else
          init = 0;
        limit = trip - __kmp_dispatch_guided_remaining<T>(
                           trip, *(DBL *)&pr->u.p.parm3, chunkIdx + 1);
        KMP_ASSERT(init <= limit);
        if (init < limit) {
          KMP_DEBUG_ASSERT(limit <= trip);
          --limit;
          status = 1;
          break;
        }
      }
    } // while (1)
#if KMP_USE_X87CONTROL
    /* restore the FPCW if necessary */
    if (fpcwSet && (oldFpcw & fpcwSet))
      _control87(oldFpcw, _MCW_PC);
#endif
    if (status != 0) {
      start = pr->u.p.lb;
      incr = pr->u.p.st;
      if (p_st != NULL)
        *p_st = incr;
      *p_lb = start + init * incr;
      *p_ub = start + limit * incr;
      if (pr->flags.ordered) {
        pr->u.p.ordered_lower = init;
        pr->u.p.ordered_upper = limit;
      }
    }
  } break;
  case kmp_sch_trapezoidal: {
    UT index;
    T parm2 = pr->u.p.parm2;
    T parm3 = pr->u.p.parm3;
    T parm4 = pr->u.p.parm4;
    KD_TRACE(100,
             ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_trapezoidal case\n",
              gtid));

    index = test_then_inc<ST>((volatile ST *)&sh->u.s.iteration);

    init = (index * ((2 * parm2) - (index - 1) * parm4)) / 2;
    trip = pr->u.p.tc - 1;

    if ((status = ((T)index < parm3 && init <= trip)) == 0) {
      *p_lb = 0;
      *p_ub = 0;
      if (p_st != NULL)
        *p_st = 0;
    } else {
      start = pr->u.p.lb;
      limit = ((index + 1) * (2 * parm2 - index * parm4)) / 2 - 1;
      incr = pr->u.p.st;

      if ((last = (limit >= trip)) != 0)
        limit = trip;
      if (p_st != NULL)
        *p_st = incr;

      if (incr == 1) {
        *p_lb = start + init;
        *p_ub = start + limit;
      } else {
        *p_lb = start + init * incr;
        *p_ub = start + limit * incr;
      }

      if (pr->flags.ordered) {
        pr->u.p.ordered_lower = init;
        pr->u.p.ordered_upper = limit;
      }
    }
  } break;
  default: {
    status = 0; // to avoid complaints about uninitialized variable use
    __kmp_fatal(KMP_MSG(UnknownSchedTypeDetected), // Primary message
                KMP_HNT(GetNewerLibrary), // Hint
                __kmp_msg_null // Variadic argument list terminator
    );
  } break;
  } // switch
  if (p_last)
    *p_last = last;
#ifdef KMP_DEBUG
  if (pr->flags.ordered) {
    char *buff;
    // create format specifiers before the debug output
    buff = __kmp_str_format("__kmp_dispatch_next_algorithm: T#%%d "
                            "ordered_lower:%%%s ordered_upper:%%%s\n",
                            traits_t<UT>::spec, traits_t<UT>::spec);
    KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper));
    __kmp_str_free(&buff);
  }
  {
    char *buff;
    // create format specifiers before the debug output
    buff = __kmp_str_format(
        "__kmp_dispatch_next_algorithm: T#%%d exit status:%%d p_last:%%d "
        "p_lb:%%%s p_ub:%%%s p_st:%%%s\n",
        traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
    KD_TRACE(10, (buff, gtid, status, *p_last, *p_lb, *p_ub, *p_st));
    __kmp_str_free(&buff);
  }
#endif
  return status;
}
#if OMPT_SUPPORT && OMPT_OPTIONAL
#define OMPT_LOOP_END                                                          \
  if (status == 0) {                                                           \
    if (ompt_enabled.ompt_callback_work) {                                     \
      ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);              \
      ompt_task_info_t *task_info = __ompt_get_task_info_object(0);            \
      ompt_callbacks.ompt_callback(ompt_callback_work)(                        \
          ompt_work_loop, ompt_scope_end, &(team_info->parallel_data),         \
          &(task_info->task_data), 0, codeptr);                                \
    }                                                                          \
  }
#else
#define OMPT_LOOP_END // no-op
#endif

#if KMP_STATS_ENABLED
#define KMP_STATS_LOOP_END                                                     \
  {                                                                            \
    kmp_int64 u, l, t, i;                                                      \
    l = (kmp_int64)(*p_lb);                                                    \
    u = (kmp_int64)(*p_ub);                                                    \
    i = (kmp_int64)(pr->u.p.st);                                               \
    if (status == 0) {                                                         \
      t = 0;                                                                   \
      KMP_POP_PARTITIONED_TIMER();                                             \
    } else if (i == 1) {                                                       \
      if (u >= l)                                                              \
        t = u - l + 1;                                                         \
      else                                                                     \
        t = 0;                                                                 \
    } else if (i < 0) {                                                        \
      if (l >= u)                                                              \
        t = (l - u) / (-i) + 1;                                                \
      else                                                                     \
        t = 0;                                                                 \
    } else {                                                                   \
      if (u >= l)                                                              \
        t = (u - l) / i + 1;                                                   \
      else                                                                     \
        t = 0;                                                                 \
    }                                                                          \
    KMP_COUNT_VALUE(OMP_loop_dynamic_iterations, t);                           \
  }
#else
#define KMP_STATS_LOOP_END
#endif
template <typename T>
static int __kmp_dispatch_next(ident_t *loc, int gtid, kmp_int32 *p_last,
                               T *p_lb, T *p_ub,
                               typename traits_t<T>::signed_t *p_st
#if OMPT_SUPPORT && OMPT_OPTIONAL
                               ,
                               void *codeptr
#endif
) {
  typedef typename traits_t<T>::unsigned_t UT;
  typedef typename traits_t<T>::signed_t ST;
  // This is potentially slightly misleading, schedule(runtime) will appear
  // here even if the actual runtime schedule is static.
  KMP_TIME_PARTITIONED_BLOCK(OMP_loop_dynamic_scheduling);

  int status;
  dispatch_private_info_template<T> *pr;
  __kmp_assert_valid_gtid(gtid);
  kmp_info_t *th = __kmp_threads[gtid];
  kmp_team_t *team = th->th.th_team;

  KMP_DEBUG_ASSERT(p_lb && p_ub && p_st); // AC: these cannot be NULL
  KD_TRACE(
      1000,
      ("__kmp_dispatch_next: T#%d called p_lb:%p p_ub:%p p_st:%p p_last: %p\n",
       gtid, p_lb, p_ub, p_st, p_last));

  if (team->t.t_serialized) {
    /* NOTE: serialize this dispatch because we are not at the active level */
    pr = reinterpret_cast<dispatch_private_info_template<T> *>(
        th->th.th_dispatch->th_disp_buffer); /* top of the stack */
    KMP_DEBUG_ASSERT(pr);

    if ((status = (pr->u.p.tc != 0)) == 0) {
      *p_lb = 0;
      *p_ub = 0;
      if (p_st != NULL)
        *p_st = 0;
      if (__kmp_env_consistency_check) {
        if (pr->pushed_ws != ct_none) {
          pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
        }
      }
    } else if (pr->flags.nomerge) {
      kmp_int32 last;
      T start;
      UT limit, trip, init;
      ST incr;
      T chunk = pr->u.p.parm1;

      KD_TRACE(100,
               ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n",
                gtid));

      init = chunk * pr->u.p.count++;
      trip = pr->u.p.tc - 1;

      if ((status = (init <= trip)) == 0) {
        *p_lb = 0;
        *p_ub = 0;
        if (p_st != NULL)
          *p_st = 0;
        if (__kmp_env_consistency_check) {
          if (pr->pushed_ws != ct_none) {
            pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
          }
        }
      } else {
        start = pr->u.p.lb;
        limit = chunk + init - 1;
        incr = pr->u.p.st;

        if ((last = (limit >= trip)) != 0) {
          limit = trip;
          pr->u.p.last_upper = pr->u.p.ub;
        }
        if (p_last != NULL)
          *p_last = last;
        if (p_st != NULL)
          *p_st = incr;
        if (incr == 1) {
          *p_lb = start + init;
          *p_ub = start + limit;
        } else {
          *p_lb = start + init * incr;
          *p_ub = start + limit * incr;
        }

        if (pr->flags.ordered) {
          pr->u.p.ordered_lower = init;
          pr->u.p.ordered_upper = limit;
#ifdef KMP_DEBUG
          {
            char *buff;
            // create format specifiers before the debug output
            buff = __kmp_str_format("__kmp_dispatch_next: T#%%d "
                                    "ordered_lower:%%%s ordered_upper:%%%s\n",
                                    traits_t<UT>::spec, traits_t<UT>::spec);
            KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower,
                            pr->u.p.ordered_upper));
            __kmp_str_free(&buff);
          }
#endif
        }
      }
    } else {
      pr->u.p.tc = 0;
      *p_lb = pr->u.p.lb;
      *p_ub = pr->u.p.ub;
      pr->u.p.last_upper = *p_ub;
      if (p_last != NULL)
        *p_last = TRUE;
      if (p_st != NULL)
        *p_st = pr->u.p.st;
    }
#ifdef KMP_DEBUG
    {
      char *buff;
      // create format specifiers before the debug output
      buff = __kmp_str_format(
          "__kmp_dispatch_next: T#%%d serialized case: p_lb:%%%s "
          "p_ub:%%%s p_st:%%%s p_last:%%p %%d returning:%%d\n",
          traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
      KD_TRACE(10, (buff, gtid, *p_lb, *p_ub, *p_st, p_last,
                    (p_last ? *p_last : 0), status));
      __kmp_str_free(&buff);
    }
#endif
#if INCLUDE_SSC_MARKS
    SSC_MARK_DISPATCH_NEXT();
#endif
    OMPT_LOOP_END;
    KMP_STATS_LOOP_END;
    return status;
  } else {
    kmp_int32 last = 0;
    dispatch_shared_info_template<T> volatile *sh;

    KMP_DEBUG_ASSERT(th->th.th_dispatch ==
                     &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);

    pr = reinterpret_cast<dispatch_private_info_template<T> *>(
        th->th.th_dispatch->th_dispatch_pr_current);
    KMP_DEBUG_ASSERT(pr);
    sh = reinterpret_cast<dispatch_shared_info_template<T> volatile *>(
        th->th.th_dispatch->th_dispatch_sh_current);
    KMP_DEBUG_ASSERT(sh);

#if KMP_USE_HIER_SCHED
    if (pr->flags.use_hier)
      status = sh->hier->next(loc, gtid, pr, &last, p_lb, p_ub, p_st);
    else
#endif // KMP_USE_HIER_SCHED
      status = __kmp_dispatch_next_algorithm<T>(gtid, pr, sh, &last, p_lb, p_ub,
                                                p_st, th->th.th_team_nproc,
                                                th->th.th_info.ds.ds_tid);
    if (status == 0) {
      UT num_done;

      num_done = test_then_inc<ST>((volatile ST *)&sh->u.s.num_done);
#ifdef KMP_DEBUG
      {
        char *buff;
        // create format specifiers before the debug output
        buff = __kmp_str_format(
            "__kmp_dispatch_next: T#%%d increment num_done:%%%s\n",
            traits_t<UT>::spec);
        KD_TRACE(10, (buff, gtid, sh->u.s.num_done));
        __kmp_str_free(&buff);
      }
#endif

#if KMP_USE_HIER_SCHED
      pr->flags.use_hier = FALSE;
#endif
      if ((ST)num_done == th->th.th_team_nproc - 1) {
#if (KMP_STATIC_STEAL_ENABLED)
        if (pr->schedule == kmp_sch_static_steal &&
            traits_t<T>::type_size > 4) {
          int i;
          int idx = (th->th.th_dispatch->th_disp_index - 1) %
                    __kmp_dispatch_num_buffers; // current loop index
          kmp_info_t **other_threads = team->t.t_threads;
          // loop complete, safe to destroy the locks used for stealing
          for (i = 0; i < th->th.th_team_nproc; ++i) {
            dispatch_private_info_template<T> *buf =
                reinterpret_cast<dispatch_private_info_template<T> *>(
                    &other_threads[i]->th.th_dispatch->th_disp_buffer[idx]);
            kmp_lock_t *lck = buf->u.p.th_steal_lock;
            KMP_ASSERT(lck != NULL);
            __kmp_destroy_lock(lck);
            __kmp_free(lck);
            buf->u.p.th_steal_lock = NULL;
          }
        }
#endif
        /* NOTE: release this buffer to be reused */

        KMP_MB(); /* Flush all pending memory write invalidates. */

        sh->u.s.num_done = 0;
        sh->u.s.iteration = 0;

        /* TODO replace with a general release procedure */
        if (pr->flags.ordered) {
          sh->u.s.ordered_iteration = 0;
        }

        sh->buffer_index += __kmp_dispatch_num_buffers;
        KD_TRACE(100, ("__kmp_dispatch_next: T#%d change buffer_index:%d\n",
                       gtid, sh->buffer_index));

        KMP_MB(); /* Flush all pending memory write invalidates. */
      } // if

      if (__kmp_env_consistency_check) {
        if (pr->pushed_ws != ct_none) {
          pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
        }
      }

      th->th.th_dispatch->th_deo_fcn = NULL;
      th->th.th_dispatch->th_dxo_fcn = NULL;
      th->th.th_dispatch->th_dispatch_sh_current = NULL;
      th->th.th_dispatch->th_dispatch_pr_current = NULL;
    } // if (status == 0)
#if KMP_OS_WINDOWS
    else if (last) {
      pr->u.p.last_upper = pr->u.p.ub;
    }
#endif /* KMP_OS_WINDOWS */
    if (p_last != NULL && status != 0)
      *p_last = last;
  }

#ifdef KMP_DEBUG
  {
    char *buff;
    // create format specifiers before the debug output
    buff = __kmp_str_format(
        "__kmp_dispatch_next: T#%%d normal case: "
        "p_lb:%%%s p_ub:%%%s p_st:%%%s p_last:%%p (%%d) returning:%%d\n",
        traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
    KD_TRACE(10, (buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last,
                  (p_last ? *p_last : 0), status));
    __kmp_str_free(&buff);
  }
#endif
#if INCLUDE_SSC_MARKS
  SSC_MARK_DISPATCH_NEXT();
#endif
  OMPT_LOOP_END;
  KMP_STATS_LOOP_END;
  return status;
}
template <typename T>
static void __kmp_dist_get_bounds(ident_t *loc, kmp_int32 gtid,
                                  kmp_int32 *plastiter, T *plower, T *pupper,
                                  typename traits_t<T>::signed_t incr) {
  typedef typename traits_t<T>::unsigned_t UT;
  kmp_uint32 team_id;
  kmp_uint32 nteams;
  UT trip_count;
  kmp_team_t *team;
  kmp_info_t *th;

  KMP_DEBUG_ASSERT(plastiter && plower && pupper);
  KE_TRACE(10, ("__kmpc_dist_get_bounds called (%d)\n", gtid));
#ifdef KMP_DEBUG
  typedef typename traits_t<T>::signed_t ST;
  {
    char *buff;
    // create format specifiers before the debug output
    buff = __kmp_str_format("__kmpc_dist_get_bounds: T#%%d liter=%%d "
                            "iter=(%%%s, %%%s, %%%s) signed?<%s>\n",
                            traits_t<T>::spec, traits_t<T>::spec,
                            traits_t<ST>::spec, traits_t<T>::spec);
    KD_TRACE(100, (buff, gtid, *plastiter, *plower, *pupper, incr));
    __kmp_str_free(&buff);
  }
#endif
  if (__kmp_env_consistency_check) {
    if (incr == 0) {
      __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited, ct_pdo,
                            loc);
    }
    if (incr > 0 ? (*pupper < *plower) : (*plower < *pupper)) {
      // The loop is illegal (lower/upper bounds inconsistent with the stride).
      __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrIllegal, ct_pdo, loc);
    }
  }
  __kmp_assert_valid_gtid(gtid);
  th = __kmp_threads[gtid];
  team = th->th.th_team;
  KMP_DEBUG_ASSERT(th->th.th_teams_microtask); // we are in the teams construct
  nteams = th->th.th_teams_size.nteams;
  team_id = team->t.t_master_tid;
  KMP_DEBUG_ASSERT(nteams == (kmp_uint32)team->t.t_parent->t.t_nproc);
  // compute the global trip count
  if (incr == 1) {
    trip_count = *pupper - *plower + 1;
  } else if (incr == -1) {
    trip_count = *plower - *pupper + 1;
  } else if (incr > 0) {
    // upper-lower can exceed the limit of the signed type
    trip_count = (UT)(*pupper - *plower) / incr + 1;
  } else {
    trip_count = (UT)(*plower - *pupper) / (-incr) + 1;
  }

  if (trip_count <= nteams) {
    KMP_DEBUG_ASSERT(
        __kmp_static == kmp_sch_static_greedy ||
        __kmp_static ==
            kmp_sch_static_balanced); // unknown static scheduling type
    // only some teams get a single iteration, the others get nothing
    if (team_id < trip_count) {
      *pupper = *plower = *plower + team_id * incr;
    } else {
      *plower = *pupper + incr; // zero-trip loop
    }
    if (plastiter != NULL)
      *plastiter = (team_id == trip_count - 1);
  } else {
    if (__kmp_static == kmp_sch_static_balanced) {
      UT chunk = trip_count / nteams;
      UT extras = trip_count % nteams;
      *plower +=
          incr * (team_id * chunk + (team_id < extras ? team_id : extras));
      *pupper = *plower + chunk * incr - (team_id < extras ? 0 : incr);
      if (plastiter != NULL)
        *plastiter = (team_id == nteams - 1);
    } else {
      T chunk_inc_count =
          (trip_count / nteams + ((trip_count % nteams) ? 1 : 0)) * incr;
      T upper = *pupper;
      KMP_DEBUG_ASSERT(__kmp_static == kmp_sch_static_greedy);
      *plower += team_id * chunk_inc_count;
      *pupper = *plower + chunk_inc_count - incr;
      // check/correct the bounds if needed
      if (incr > 0) {
        if (*pupper < *plower)
          *pupper = traits_t<T>::max_value;
        if (plastiter != NULL)
          *plastiter = *plower <= upper && *pupper > upper - incr;
        if (*pupper > upper)
          *pupper = upper;
      } else {
        if (*pupper > *plower)
          *pupper = traits_t<T>::min_value;
        if (plastiter != NULL)
          *plastiter = *plower >= upper && *pupper < upper - incr;
        if (*pupper < upper)
          *pupper = upper;
      }
    }
  }
}
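// Worked example (illustrative): trip_count = 10, nteams = 4 under
// kmp_sch_static_balanced gives chunk = 2, extras = 2, so teams 0..3 get the
// iteration ranges [0,2], [3,5], [6,7], [8,9] (3, 3, 2, 2 iterations); under
// kmp_sch_static_greedy each team gets ceil(10/4) = 3 iterations and the last
// team's upper bound is clamped back to the original loop bound.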
void __kmpc_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
                            enum sched_type schedule, kmp_int32 lb,
                            kmp_int32 ub, kmp_int32 st, kmp_int32 chunk) {
  KMP_DEBUG_ASSERT(__kmp_init_serial);
#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
  __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, true);
}

void __kmpc_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
                             enum sched_type schedule, kmp_uint32 lb,
                             kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk) {
  KMP_DEBUG_ASSERT(__kmp_init_serial);
#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
  __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, true);
}

void __kmpc_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
                            enum sched_type schedule, kmp_int64 lb,
                            kmp_int64 ub, kmp_int64 st, kmp_int64 chunk) {
  KMP_DEBUG_ASSERT(__kmp_init_serial);
#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
  __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, true);
}

void __kmpc_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
                             enum sched_type schedule, kmp_uint64 lb,
                             kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk) {
  KMP_DEBUG_ASSERT(__kmp_init_serial);
#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
  __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, true);
}
void __kmpc_dist_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
                                 enum sched_type schedule, kmp_int32 *p_last,
                                 kmp_int32 lb, kmp_int32 ub, kmp_int32 st,
                                 kmp_int32 chunk) {
  KMP_DEBUG_ASSERT(__kmp_init_serial);
#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
  __kmp_dist_get_bounds<kmp_int32>(loc, gtid, p_last, &lb, &ub, st);
  __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, true);
}

void __kmpc_dist_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
                                  enum sched_type schedule, kmp_int32 *p_last,
                                  kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st,
                                  kmp_int32 chunk) {
  KMP_DEBUG_ASSERT(__kmp_init_serial);
#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
  __kmp_dist_get_bounds<kmp_uint32>(loc, gtid, p_last, &lb, &ub, st);
  __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, true);
}

void __kmpc_dist_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
                                 enum sched_type schedule, kmp_int32 *p_last,
                                 kmp_int64 lb, kmp_int64 ub, kmp_int64 st,
                                 kmp_int64 chunk) {
  KMP_DEBUG_ASSERT(__kmp_init_serial);
#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
  __kmp_dist_get_bounds<kmp_int64>(loc, gtid, p_last, &lb, &ub, st);
  __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, true);
}

void __kmpc_dist_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
                                  enum sched_type schedule, kmp_int32 *p_last,
                                  kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st,
                                  kmp_int64 chunk) {
  KMP_DEBUG_ASSERT(__kmp_init_serial);
#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
  __kmp_dist_get_bounds<kmp_uint64>(loc, gtid, p_last, &lb, &ub, st);
  __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, true);
}
int __kmpc_dispatch_next_4(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
                           kmp_int32 *p_lb, kmp_int32 *p_ub, kmp_int32 *p_st) {
#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
  return __kmp_dispatch_next<kmp_int32>(loc, gtid, p_last, p_lb, p_ub, p_st
#if OMPT_SUPPORT && OMPT_OPTIONAL
                                        ,
                                        OMPT_LOAD_RETURN_ADDRESS(gtid)
#endif
  );
}

int __kmpc_dispatch_next_4u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
                            kmp_uint32 *p_lb, kmp_uint32 *p_ub,
                            kmp_int32 *p_st) {
#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
  return __kmp_dispatch_next<kmp_uint32>(loc, gtid, p_last, p_lb, p_ub, p_st
#if OMPT_SUPPORT && OMPT_OPTIONAL
                                         ,
                                         OMPT_LOAD_RETURN_ADDRESS(gtid)
#endif
  );
}

int __kmpc_dispatch_next_8(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
                           kmp_int64 *p_lb, kmp_int64 *p_ub, kmp_int64 *p_st) {
#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
  return __kmp_dispatch_next<kmp_int64>(loc, gtid, p_last, p_lb, p_ub, p_st
#if OMPT_SUPPORT && OMPT_OPTIONAL
                                        ,
                                        OMPT_LOAD_RETURN_ADDRESS(gtid)
#endif
  );
}

int __kmpc_dispatch_next_8u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
                            kmp_uint64 *p_lb, kmp_uint64 *p_ub,
                            kmp_int64 *p_st) {
#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
  return __kmp_dispatch_next<kmp_uint64>(loc, gtid, p_last, p_lb, p_ub, p_st
#if OMPT_SUPPORT && OMPT_OPTIONAL
                                         ,
                                         OMPT_LOAD_RETURN_ADDRESS(gtid)
#endif
  );
}
void __kmpc_dispatch_fini_4(ident_t *loc, kmp_int32 gtid) {
  __kmp_dispatch_finish<kmp_uint32>(gtid, loc);
}

void __kmpc_dispatch_fini_8(ident_t *loc, kmp_int32 gtid) {
  __kmp_dispatch_finish<kmp_uint64>(gtid, loc);
}

void __kmpc_dispatch_fini_4u(ident_t *loc, kmp_int32 gtid) {
  __kmp_dispatch_finish<kmp_uint32>(gtid, loc);
}

void __kmpc_dispatch_fini_8u(ident_t *loc, kmp_int32 gtid) {
  __kmp_dispatch_finish<kmp_uint64>(gtid, loc);
}
kmp_uint32 __kmp_eq_4(kmp_uint32 value, kmp_uint32 checker) {
  return value == checker;
}

kmp_uint32 __kmp_neq_4(kmp_uint32 value, kmp_uint32 checker) {
  return value != checker;
}

kmp_uint32 __kmp_lt_4(kmp_uint32 value, kmp_uint32 checker) {
  return value < checker;
}

kmp_uint32 __kmp_ge_4(kmp_uint32 value, kmp_uint32 checker) {
  return value >= checker;
}

kmp_uint32 __kmp_le_4(kmp_uint32 value, kmp_uint32 checker) {
  return value <= checker;
}
kmp_uint32
__kmp_wait_4(volatile kmp_uint32 *spinner, kmp_uint32 checker,
             kmp_uint32 (*pred)(kmp_uint32, kmp_uint32),
             void *obj) {
  // note: we may not belong to a team at this point
  volatile kmp_uint32 *spin = spinner;
  kmp_uint32 check = checker;
  kmp_uint32 spins;
  kmp_uint32 (*f)(kmp_uint32, kmp_uint32) = pred;
  kmp_uint32 r;

  KMP_FSYNC_SPIN_INIT(obj, CCAST(kmp_uint32 *, spin));
  KMP_INIT_YIELD(spins);
  // main wait spin loop
  while (!f(r = TCR_4(*spin), check)) {
    KMP_FSYNC_SPIN_PREPARE(obj);
    /* if we have waited a bit, or are oversubscribed, yield */
    KMP_YIELD_OVERSUB_ELSE_SPIN(spins);
  }
  KMP_FSYNC_SPIN_ACQUIRED(obj);
  return r;
}
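// Usage sketch (illustrative, not from this file): spin until a shared
// 32-bit flag becomes non-zero:
//
//   volatile kmp_uint32 flag = 0;
//   // ... another thread eventually stores a non-zero value to flag ...
//   kmp_uint32 seen = __kmp_wait_4(&flag, 0, __kmp_neq_4, NULL);
//
// The predicate is invoked as pred(current_value, checker) on every poll,
// and the function returns the value that satisfied it.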
void __kmp_wait_4_ptr(void *spinner, kmp_uint32 checker,
                      kmp_uint32 (*pred)(void *, kmp_uint32), void *obj) {
  // note: we may not belong to a team at this point
  void *spin = spinner;
  kmp_uint32 check = checker;
  kmp_uint32 spins;
  kmp_uint32 (*f)(void *, kmp_uint32) = pred;

  KMP_FSYNC_SPIN_INIT(obj, spin);
  KMP_INIT_YIELD(spins);
  // main wait spin loop
  while (!f(spin, check)) {
    KMP_FSYNC_SPIN_PREPARE(obj);
    /* if we have waited a bit, or are oversubscribed, yield */
    KMP_YIELD_OVERSUB_ELSE_SPIN(spins);
  }
  KMP_FSYNC_SPIN_ACQUIRED(obj);
}
#ifdef KMP_GOMP_COMPAT

void __kmp_aux_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
                               enum sched_type schedule, kmp_int32 lb,
                               kmp_int32 ub, kmp_int32 st, kmp_int32 chunk,
                               int push_ws) {
  __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk,
                                 push_ws);
}

void __kmp_aux_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
                                enum sched_type schedule, kmp_uint32 lb,
                                kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk,
                                int push_ws) {
  __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk,
                                  push_ws);
}

void __kmp_aux_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
                               enum sched_type schedule, kmp_int64 lb,
                               kmp_int64 ub, kmp_int64 st, kmp_int64 chunk,
                               int push_ws) {
  __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk,
                                 push_ws);
}

void __kmp_aux_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
                                enum sched_type schedule, kmp_uint64 lb,
                                kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk,
                                int push_ws) {
  __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk,
                                  push_ws);
}

void __kmp_aux_dispatch_fini_chunk_4(ident_t *loc, kmp_int32 gtid) {
  __kmp_dispatch_finish_chunk<kmp_uint32>(gtid, loc);
}

void __kmp_aux_dispatch_fini_chunk_8(ident_t *loc, kmp_int32 gtid) {
  __kmp_dispatch_finish_chunk<kmp_uint64>(gtid, loc);
}

void __kmp_aux_dispatch_fini_chunk_4u(ident_t *loc, kmp_int32 gtid) {
  __kmp_dispatch_finish_chunk<kmp_uint32>(gtid, loc);
}

void __kmp_aux_dispatch_fini_chunk_8u(ident_t *loc, kmp_int32 gtid) {
  __kmp_dispatch_finish_chunk<kmp_uint64>(gtid, loc);
}

#endif /* KMP_GOMP_COMPAT */