LLVM OpenMP* Runtime Library
kmp_runtime.cpp
1 /*
2  * kmp_runtime.cpp -- KPTS runtime support library
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8 // See https://llvm.org/LICENSE.txt for license information.
9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "kmp.h"
14 #include "kmp_affinity.h"
15 #include "kmp_atomic.h"
16 #include "kmp_environment.h"
17 #include "kmp_error.h"
18 #include "kmp_i18n.h"
19 #include "kmp_io.h"
20 #include "kmp_itt.h"
21 #include "kmp_settings.h"
22 #include "kmp_stats.h"
23 #include "kmp_str.h"
24 #include "kmp_wait_release.h"
25 #include "kmp_wrapper_getpid.h"
26 #include "kmp_dispatch.h"
27 #if KMP_USE_HIER_SCHED
28 #include "kmp_dispatch_hier.h"
29 #endif
30 
31 #if OMPT_SUPPORT
32 #include "ompt-specific.h"
33 #endif
34 
35 #if OMP_PROFILING_SUPPORT
36 #include "llvm/Support/TimeProfiler.h"
37 static char *ProfileTraceFile = nullptr;
38 #endif
39 
40 /* these are temporary issues to be dealt with */
41 #define KMP_USE_PRCTL 0
42 
43 #if KMP_OS_WINDOWS
44 #include <process.h>
45 #endif
46 
47 #include "tsan_annotations.h"
48 
49 #if KMP_OS_WINDOWS
50 // windows does not need include files as it doesn't use shared memory
51 #else
52 #include <sys/mman.h>
53 #include <sys/stat.h>
54 #include <fcntl.h>
55 #define SHM_SIZE 1024
56 #endif
57 
58 #if defined(KMP_GOMP_COMPAT)
59 char const __kmp_version_alt_comp[] =
60  KMP_VERSION_PREFIX "alternative compiler support: yes";
61 #endif /* defined(KMP_GOMP_COMPAT) */
62 
63 char const __kmp_version_omp_api[] =
64  KMP_VERSION_PREFIX "API version: 5.0 (201611)";
65 
66 #ifdef KMP_DEBUG
67 char const __kmp_version_lock[] =
68  KMP_VERSION_PREFIX "lock type: run time selectable";
69 #endif /* KMP_DEBUG */
70 
71 #define KMP_MIN(x, y) ((x) < (y) ? (x) : (y))
72 
73 /* ------------------------------------------------------------------------ */
74 
75 #if KMP_USE_MONITOR
76 kmp_info_t __kmp_monitor;
77 #endif
78 
79 /* Forward declarations */
80 
81 void __kmp_cleanup(void);
82 
83 static void __kmp_initialize_info(kmp_info_t *, kmp_team_t *, int tid,
84  int gtid);
85 static void __kmp_initialize_team(kmp_team_t *team, int new_nproc,
86  kmp_internal_control_t *new_icvs,
87  ident_t *loc);
88 #if KMP_AFFINITY_SUPPORTED
89 static void __kmp_partition_places(kmp_team_t *team,
90  int update_master_only = 0);
91 #endif
92 static void __kmp_do_serial_initialize(void);
93 void __kmp_fork_barrier(int gtid, int tid);
94 void __kmp_join_barrier(int gtid);
95 void __kmp_setup_icv_copy(kmp_team_t *team, int new_nproc,
96  kmp_internal_control_t *new_icvs, ident_t *loc);
97 
98 #ifdef USE_LOAD_BALANCE
99 static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc);
100 #endif
101 
102 static int __kmp_expand_threads(int nNeed);
103 #if KMP_OS_WINDOWS
104 static int __kmp_unregister_root_other_thread(int gtid);
105 #endif
106 static void __kmp_reap_thread(kmp_info_t *thread, int is_root);
107 kmp_info_t *__kmp_thread_pool_insert_pt = NULL;
108 
109 /* Calculate the identifier of the current thread */
110 /* fast (and somewhat portable) way to get unique identifier of executing
111  thread. Returns KMP_GTID_DNE if we haven't been assigned a gtid. */
112 int __kmp_get_global_thread_id() {
113  int i;
114  kmp_info_t **other_threads;
115  size_t stack_data;
116  char *stack_addr;
117  size_t stack_size;
118  char *stack_base;
119 
120  KA_TRACE(
121  1000,
122  ("*** __kmp_get_global_thread_id: entering, nproc=%d all_nproc=%d\n",
123  __kmp_nth, __kmp_all_nth));
124 
125  /* JPH - to handle the case where __kmpc_end(0) is called immediately prior to
126  a parallel region, made it return KMP_GTID_DNE to force serial_initialize
127  by caller. Had to handle KMP_GTID_DNE at all call-sites, or else guarantee
128  __kmp_init_gtid for this to work. */
129 
130  if (!TCR_4(__kmp_init_gtid))
131  return KMP_GTID_DNE;
132 
133 #ifdef KMP_TDATA_GTID
134  if (TCR_4(__kmp_gtid_mode) >= 3) {
135  KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using TDATA\n"));
136  return __kmp_gtid;
137  }
138 #endif
139  if (TCR_4(__kmp_gtid_mode) >= 2) {
140  KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using keyed TLS\n"));
141  return __kmp_gtid_get_specific();
142  }
143  KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using internal alg.\n"));
144 
145  stack_addr = (char *)&stack_data;
146  other_threads = __kmp_threads;
147 
148  /* ATT: The code below is a source of potential bugs due to unsynchronized
149  access to __kmp_threads array. For example:
150  1. Current thread loads other_threads[i] to thr and checks it, it is
151  non-NULL.
152  2. Current thread is suspended by OS.
153  3. Another thread unregisters and finishes (debug versions of free()
154  may fill memory with something like 0xEF).
155  4. Current thread is resumed.
156  5. Current thread reads junk from *thr.
157  TODO: Fix it. --ln */
158 
159  for (i = 0; i < __kmp_threads_capacity; i++) {
160 
161  kmp_info_t *thr = (kmp_info_t *)TCR_SYNC_PTR(other_threads[i]);
162  if (!thr)
163  continue;
164 
165  stack_size = (size_t)TCR_PTR(thr->th.th_info.ds.ds_stacksize);
166  stack_base = (char *)TCR_PTR(thr->th.th_info.ds.ds_stackbase);
167 
168  /* stack grows down -- search through all of the active threads */
169 
170  if (stack_addr <= stack_base) {
171  size_t stack_diff = stack_base - stack_addr;
172 
173  if (stack_diff <= stack_size) {
174  /* The only way we can be closer than the allocated */
175  /* stack size is if we are running on this thread. */
176  KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == i);
177  return i;
178  }
179  }
180  }
181 
182  /* get specific to try and determine our gtid */
183  KA_TRACE(1000,
184  ("*** __kmp_get_global_thread_id: internal alg. failed to find "
185  "thread, using TLS\n"));
186  i = __kmp_gtid_get_specific();
187 
188  /*fprintf( stderr, "=== %d\n", i ); */ /* GROO */
189 
190  /* if we havn't been assigned a gtid, then return code */
191  if (i < 0)
192  return i;
193 
194  /* dynamically updated stack window for uber threads to avoid get_specific
195  call */
196  if (!TCR_4(other_threads[i]->th.th_info.ds.ds_stackgrow)) {
197  KMP_FATAL(StackOverflow, i);
198  }
199 
200  stack_base = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
201  if (stack_addr > stack_base) {
202  TCW_PTR(other_threads[i]->th.th_info.ds.ds_stackbase, stack_addr);
203  TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
204  other_threads[i]->th.th_info.ds.ds_stacksize + stack_addr -
205  stack_base);
206  } else {
207  TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
208  stack_base - stack_addr);
209  }
210 
211  /* Reprint stack bounds for ubermaster since they have been refined */
212  if (__kmp_storage_map) {
213  char *stack_end = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
214  char *stack_beg = stack_end - other_threads[i]->th.th_info.ds.ds_stacksize;
215  __kmp_print_storage_map_gtid(i, stack_beg, stack_end,
216  other_threads[i]->th.th_info.ds.ds_stacksize,
217  "th_%d stack (refinement)", i);
218  }
219  return i;
220 }
221 
222 int __kmp_get_global_thread_id_reg() {
223  int gtid;
224 
225  if (!__kmp_init_serial) {
226  gtid = KMP_GTID_DNE;
227  } else
228 #ifdef KMP_TDATA_GTID
229  if (TCR_4(__kmp_gtid_mode) >= 3) {
230  KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using TDATA\n"));
231  gtid = __kmp_gtid;
232  } else
233 #endif
234  if (TCR_4(__kmp_gtid_mode) >= 2) {
235  KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using keyed TLS\n"));
236  gtid = __kmp_gtid_get_specific();
237  } else {
238  KA_TRACE(1000,
239  ("*** __kmp_get_global_thread_id_reg: using internal alg.\n"));
240  gtid = __kmp_get_global_thread_id();
241  }
242 
243  /* we must be a new uber master sibling thread */
244  if (gtid == KMP_GTID_DNE) {
245  KA_TRACE(10,
246  ("__kmp_get_global_thread_id_reg: Encountered new root thread. "
247  "Registering a new gtid.\n"));
248  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
249  if (!__kmp_init_serial) {
250  __kmp_do_serial_initialize();
251  gtid = __kmp_gtid_get_specific();
252  } else {
253  gtid = __kmp_register_root(FALSE);
254  }
255  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
256  /*__kmp_printf( "+++ %d\n", gtid ); */ /* GROO */
257  }
258 
259  KMP_DEBUG_ASSERT(gtid >= 0);
260 
261  return gtid;
262 }
263 
264 /* caller must hold forkjoin_lock */
265 void __kmp_check_stack_overlap(kmp_info_t *th) {
266  int f;
267  char *stack_beg = NULL;
268  char *stack_end = NULL;
269  int gtid;
270 
271  KA_TRACE(10, ("__kmp_check_stack_overlap: called\n"));
272  if (__kmp_storage_map) {
273  stack_end = (char *)th->th.th_info.ds.ds_stackbase;
274  stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
275 
276  gtid = __kmp_gtid_from_thread(th);
277 
278  if (gtid == KMP_GTID_MONITOR) {
279  __kmp_print_storage_map_gtid(
280  gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
281  "th_%s stack (%s)", "mon",
282  (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual");
283  } else {
284  __kmp_print_storage_map_gtid(
285  gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
286  "th_%d stack (%s)", gtid,
287  (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual");
288  }
289  }
290 
291  /* No point in checking ubermaster threads since they use refinement and
292  * cannot overlap */
293  gtid = __kmp_gtid_from_thread(th);
294  if (__kmp_env_checks == TRUE && !KMP_UBER_GTID(gtid)) {
295  KA_TRACE(10,
296  ("__kmp_check_stack_overlap: performing extensive checking\n"));
297  if (stack_beg == NULL) {
298  stack_end = (char *)th->th.th_info.ds.ds_stackbase;
299  stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
300  }
301 
302  for (f = 0; f < __kmp_threads_capacity; f++) {
303  kmp_info_t *f_th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[f]);
304 
305  if (f_th && f_th != th) {
306  char *other_stack_end =
307  (char *)TCR_PTR(f_th->th.th_info.ds.ds_stackbase);
308  char *other_stack_beg =
309  other_stack_end - (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize);
310  if ((stack_beg > other_stack_beg && stack_beg < other_stack_end) ||
311  (stack_end > other_stack_beg && stack_end < other_stack_end)) {
312 
313  /* Print the other stack values before the abort */
314  if (__kmp_storage_map)
315  __kmp_print_storage_map_gtid(
316  -1, other_stack_beg, other_stack_end,
317  (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize),
318  "th_%d stack (overlapped)", __kmp_gtid_from_thread(f_th));
319 
320  __kmp_fatal(KMP_MSG(StackOverlap), KMP_HNT(ChangeStackLimit),
321  __kmp_msg_null);
322  }
323  }
324  }
325  }
326  KA_TRACE(10, ("__kmp_check_stack_overlap: returning\n"));
327 }
328 
329 /* ------------------------------------------------------------------------ */
330 
331 void __kmp_infinite_loop(void) {
332  static int done = FALSE;
333 
334  while (!done) {
335  KMP_YIELD(TRUE);
336  }
337 }
338 
339 #define MAX_MESSAGE 512
340 
341 void __kmp_print_storage_map_gtid(int gtid, void *p1, void *p2, size_t size,
342  char const *format, ...) {
343  char buffer[MAX_MESSAGE];
344  va_list ap;
345 
346  va_start(ap, format);
347  KMP_SNPRINTF(buffer, sizeof(buffer), "OMP storage map: %p %p%8lu %s\n", p1,
348  p2, (unsigned long)size, format);
349  __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock);
350  __kmp_vprintf(kmp_err, buffer, ap);
351 #if KMP_PRINT_DATA_PLACEMENT
352  int node;
353  if (gtid >= 0) {
354  if (p1 <= p2 && (char *)p2 - (char *)p1 == size) {
355  if (__kmp_storage_map_verbose) {
356  node = __kmp_get_host_node(p1);
357  if (node < 0) /* doesn't work, so don't try this next time */
358  __kmp_storage_map_verbose = FALSE;
359  else {
360  char *last;
361  int lastNode;
362  int localProc = __kmp_get_cpu_from_gtid(gtid);
363 
364  const int page_size = KMP_GET_PAGE_SIZE();
365 
366  p1 = (void *)((size_t)p1 & ~((size_t)page_size - 1));
367  p2 = (void *)(((size_t)p2 - 1) & ~((size_t)page_size - 1));
368  if (localProc >= 0)
369  __kmp_printf_no_lock(" GTID %d localNode %d\n", gtid,
370  localProc >> 1);
371  else
372  __kmp_printf_no_lock(" GTID %d\n", gtid);
373 #if KMP_USE_PRCTL
374  /* The more elaborate format is disabled for now because of the prctl
375  * hanging bug. */
376  do {
377  last = p1;
378  lastNode = node;
379  /* This loop collates adjacent pages with the same host node. */
380  do {
381  (char *)p1 += page_size;
382  } while (p1 <= p2 && (node = __kmp_get_host_node(p1)) == lastNode);
383  __kmp_printf_no_lock(" %p-%p memNode %d\n", last, (char *)p1 - 1,
384  lastNode);
385  } while (p1 <= p2);
386 #else
387  __kmp_printf_no_lock(" %p-%p memNode %d\n", p1,
388  (char *)p1 + (page_size - 1),
389  __kmp_get_host_node(p1));
390  if (p1 < p2) {
391  __kmp_printf_no_lock(" %p-%p memNode %d\n", p2,
392  (char *)p2 + (page_size - 1),
393  __kmp_get_host_node(p2));
394  }
395 #endif
396  }
397  }
398  } else
399  __kmp_printf_no_lock(" %s\n", KMP_I18N_STR(StorageMapWarning));
400  }
401 #endif /* KMP_PRINT_DATA_PLACEMENT */
402  __kmp_release_bootstrap_lock(&__kmp_stdio_lock);
403 }
404 
405 void __kmp_warn(char const *format, ...) {
406  char buffer[MAX_MESSAGE];
407  va_list ap;
408 
409  if (__kmp_generate_warnings == kmp_warnings_off) {
410  return;
411  }
412 
413  va_start(ap, format);
414 
415  KMP_SNPRINTF(buffer, sizeof(buffer), "OMP warning: %s\n", format);
416  __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock);
417  __kmp_vprintf(kmp_err, buffer, ap);
418  __kmp_release_bootstrap_lock(&__kmp_stdio_lock);
419 
420  va_end(ap);
421 }
422 
423 void __kmp_abort_process() {
424  // Later threads may stall here, but that's ok because abort() will kill them.
425  __kmp_acquire_bootstrap_lock(&__kmp_exit_lock);
426 
427  if (__kmp_debug_buf) {
428  __kmp_dump_debug_buffer();
429  }
430 
431  if (KMP_OS_WINDOWS) {
432  // Let other threads know of abnormal termination and prevent deadlock
433  // if abort happened during library initialization or shutdown
434  __kmp_global.g.g_abort = SIGABRT;
435 
436  /* On Windows* OS by default abort() causes pop-up error box, which stalls
437  nightly testing. Unfortunately, we cannot reliably suppress pop-up error
438  boxes. _set_abort_behavior() works well, but this function is not
439  available in VS7 (this is not problem for DLL, but it is a problem for
440  static OpenMP RTL). SetErrorMode (and so, timelimit utility) does not
441  help, at least in some versions of MS C RTL.
442 
443  It seems following sequence is the only way to simulate abort() and
444  avoid pop-up error box. */
445  raise(SIGABRT);
446  _exit(3); // Just in case, if signal ignored, exit anyway.
447  } else {
448  __kmp_unregister_library();
449  abort();
450  }
451 
452  __kmp_infinite_loop();
453  __kmp_release_bootstrap_lock(&__kmp_exit_lock);
454 
455 } // __kmp_abort_process
456 
457 void __kmp_abort_thread(void) {
458  // TODO: Eliminate g_abort global variable and this function.
459  // In case of abort just call abort(), it will kill all the threads.
460  __kmp_infinite_loop();
461 } // __kmp_abort_thread
462 
463 /* Print out the storage map for the major kmp_info_t thread data structures
464  that are allocated together. */
465 
466 static void __kmp_print_thread_storage_map(kmp_info_t *thr, int gtid) {
467  __kmp_print_storage_map_gtid(gtid, thr, thr + 1, sizeof(kmp_info_t), "th_%d",
468  gtid);
469 
470  __kmp_print_storage_map_gtid(gtid, &thr->th.th_info, &thr->th.th_team,
471  sizeof(kmp_desc_t), "th_%d.th_info", gtid);
472 
473  __kmp_print_storage_map_gtid(gtid, &thr->th.th_local, &thr->th.th_pri_head,
474  sizeof(kmp_local_t), "th_%d.th_local", gtid);
475 
476  __kmp_print_storage_map_gtid(
477  gtid, &thr->th.th_bar[0], &thr->th.th_bar[bs_last_barrier],
478  sizeof(kmp_balign_t) * bs_last_barrier, "th_%d.th_bar", gtid);
479 
480  __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_plain_barrier],
481  &thr->th.th_bar[bs_plain_barrier + 1],
482  sizeof(kmp_balign_t), "th_%d.th_bar[plain]",
483  gtid);
484 
485  __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_forkjoin_barrier],
486  &thr->th.th_bar[bs_forkjoin_barrier + 1],
487  sizeof(kmp_balign_t), "th_%d.th_bar[forkjoin]",
488  gtid);
489 
490 #if KMP_FAST_REDUCTION_BARRIER
491  __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_reduction_barrier],
492  &thr->th.th_bar[bs_reduction_barrier + 1],
493  sizeof(kmp_balign_t), "th_%d.th_bar[reduction]",
494  gtid);
495 #endif // KMP_FAST_REDUCTION_BARRIER
496 }
497 
498 /* Print out the storage map for the major kmp_team_t team data structures
499  that are allocated together. */
500 
501 static void __kmp_print_team_storage_map(const char *header, kmp_team_t *team,
502  int team_id, int num_thr) {
503  int num_disp_buff = team->t.t_max_nproc > 1 ? __kmp_dispatch_num_buffers : 2;
504  __kmp_print_storage_map_gtid(-1, team, team + 1, sizeof(kmp_team_t), "%s_%d",
505  header, team_id);
506 
507  __kmp_print_storage_map_gtid(-1, &team->t.t_bar[0],
508  &team->t.t_bar[bs_last_barrier],
509  sizeof(kmp_balign_team_t) * bs_last_barrier,
510  "%s_%d.t_bar", header, team_id);
511 
512  __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_plain_barrier],
513  &team->t.t_bar[bs_plain_barrier + 1],
514  sizeof(kmp_balign_team_t), "%s_%d.t_bar[plain]",
515  header, team_id);
516 
517  __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_forkjoin_barrier],
518  &team->t.t_bar[bs_forkjoin_barrier + 1],
519  sizeof(kmp_balign_team_t),
520  "%s_%d.t_bar[forkjoin]", header, team_id);
521 
522 #if KMP_FAST_REDUCTION_BARRIER
523  __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_reduction_barrier],
524  &team->t.t_bar[bs_reduction_barrier + 1],
525  sizeof(kmp_balign_team_t),
526  "%s_%d.t_bar[reduction]", header, team_id);
527 #endif // KMP_FAST_REDUCTION_BARRIER
528 
529  __kmp_print_storage_map_gtid(
530  -1, &team->t.t_dispatch[0], &team->t.t_dispatch[num_thr],
531  sizeof(kmp_disp_t) * num_thr, "%s_%d.t_dispatch", header, team_id);
532 
533  __kmp_print_storage_map_gtid(
534  -1, &team->t.t_threads[0], &team->t.t_threads[num_thr],
535  sizeof(kmp_info_t *) * num_thr, "%s_%d.t_threads", header, team_id);
536 
537  __kmp_print_storage_map_gtid(-1, &team->t.t_disp_buffer[0],
538  &team->t.t_disp_buffer[num_disp_buff],
539  sizeof(dispatch_shared_info_t) * num_disp_buff,
540  "%s_%d.t_disp_buffer", header, team_id);
541 }
542 
543 static void __kmp_init_allocator() { __kmp_init_memkind(); }
544 static void __kmp_fini_allocator() { __kmp_fini_memkind(); }
545 
546 /* ------------------------------------------------------------------------ */
547 
548 #if KMP_DYNAMIC_LIB
549 #if KMP_OS_WINDOWS
550 
551 static void __kmp_reset_lock(kmp_bootstrap_lock_t *lck) {
552  // TODO: Change to __kmp_break_bootstrap_lock().
553  __kmp_init_bootstrap_lock(lck); // make the lock released
554 }
555 
556 static void __kmp_reset_locks_on_process_detach(int gtid_req) {
557  int i;
558  int thread_count;
559 
560  // PROCESS_DETACH is expected to be called by a thread that executes
561  // ProcessExit() or FreeLibrary(). OS terminates other threads (except the one
562  // calling ProcessExit or FreeLibrary). So, it might be safe to access the
563  // __kmp_threads[] without taking the forkjoin_lock. However, in fact, some
564  // threads can be still alive here, although being about to be terminated. The
565  // threads in the array with ds_thread==0 are most suspicious. Actually, it
566  // can be not safe to access the __kmp_threads[].
567 
568  // TODO: does it make sense to check __kmp_roots[] ?
569 
570  // Let's check that there are no other alive threads registered with the OMP
571  // lib.
572  while (1) {
573  thread_count = 0;
574  for (i = 0; i < __kmp_threads_capacity; ++i) {
575  if (!__kmp_threads)
576  continue;
577  kmp_info_t *th = __kmp_threads[i];
578  if (th == NULL)
579  continue;
580  int gtid = th->th.th_info.ds.ds_gtid;
581  if (gtid == gtid_req)
582  continue;
583  if (gtid < 0)
584  continue;
585  DWORD exit_val;
586  int alive = __kmp_is_thread_alive(th, &exit_val);
587  if (alive) {
588  ++thread_count;
589  }
590  }
591  if (thread_count == 0)
592  break; // success
593  }
594 
595  // Assume that I'm alone. Now it might be safe to check and reset locks.
596  // __kmp_forkjoin_lock and __kmp_stdio_lock are expected to be reset.
597  __kmp_reset_lock(&__kmp_forkjoin_lock);
598 #ifdef KMP_DEBUG
599  __kmp_reset_lock(&__kmp_stdio_lock);
600 #endif // KMP_DEBUG
601 }
602 
603 BOOL WINAPI DllMain(HINSTANCE hInstDLL, DWORD fdwReason, LPVOID lpReserved) {
604  //__kmp_acquire_bootstrap_lock( &__kmp_initz_lock );
605 
606  switch (fdwReason) {
607 
608  case DLL_PROCESS_ATTACH:
609  KA_TRACE(10, ("DllMain: PROCESS_ATTACH\n"));
610 
611  return TRUE;
612 
613  case DLL_PROCESS_DETACH:
614  KA_TRACE(10, ("DllMain: PROCESS_DETACH T#%d\n", __kmp_gtid_get_specific()));
615 
616  if (lpReserved != NULL) {
617  // lpReserved is used for telling the difference:
618  // lpReserved == NULL when FreeLibrary() was called,
619  // lpReserved != NULL when the process terminates.
620  // When FreeLibrary() is called, worker threads remain alive. So they will
621  // release the forkjoin lock by themselves. When the process terminates,
622  // worker threads disappear triggering the problem of unreleased forkjoin
623  // lock as described below.
624 
625  // A worker thread can take the forkjoin lock. The problem comes up if
626  // that worker thread becomes dead before it releases the forkjoin lock.
627  // The forkjoin lock remains taken, while the thread executing
628  // DllMain()->PROCESS_DETACH->__kmp_internal_end_library() below will try
629  // to take the forkjoin lock and will always fail, so that the application
630  // will never finish [normally]. This scenario is possible if
631  // __kmpc_end() has not been executed. It looks like it's not a corner
632  // case, but common cases:
633  // - the main function was compiled by an alternative compiler;
634  // - the main function was compiled by icl but without /Qopenmp
635  // (application with plugins);
636  // - application terminates by calling C exit(), Fortran CALL EXIT() or
637  // Fortran STOP.
638  // - alive foreign thread prevented __kmpc_end from doing cleanup.
639  //
640  // This is a hack to work around the problem.
641  // TODO: !!! figure out something better.
642  __kmp_reset_locks_on_process_detach(__kmp_gtid_get_specific());
643  }
644 
645  __kmp_internal_end_library(__kmp_gtid_get_specific());
646 
647  return TRUE;
648 
649  case DLL_THREAD_ATTACH:
650  KA_TRACE(10, ("DllMain: THREAD_ATTACH\n"));
651 
652  /* if we want to register new siblings all the time here call
653  * __kmp_get_gtid(); */
654  return TRUE;
655 
656  case DLL_THREAD_DETACH:
657  KA_TRACE(10, ("DllMain: THREAD_DETACH T#%d\n", __kmp_gtid_get_specific()));
658 
659  __kmp_internal_end_thread(__kmp_gtid_get_specific());
660  return TRUE;
661  }
662 
663  return TRUE;
664 }
665 
666 #endif /* KMP_OS_WINDOWS */
667 #endif /* KMP_DYNAMIC_LIB */
668 
669 /* __kmp_parallel_deo -- Wait until it's our turn. */
670 void __kmp_parallel_deo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
671  int gtid = *gtid_ref;
672 #ifdef BUILD_PARALLEL_ORDERED
673  kmp_team_t *team = __kmp_team_from_gtid(gtid);
674 #endif /* BUILD_PARALLEL_ORDERED */
675 
676  if (__kmp_env_consistency_check) {
677  if (__kmp_threads[gtid]->th.th_root->r.r_active)
678 #if KMP_USE_DYNAMIC_LOCK
679  __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL, 0);
680 #else
681  __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL);
682 #endif
683  }
684 #ifdef BUILD_PARALLEL_ORDERED
685  if (!team->t.t_serialized) {
686  KMP_MB();
687  KMP_WAIT(&team->t.t_ordered.dt.t_value, __kmp_tid_from_gtid(gtid), KMP_EQ,
688  NULL);
689  KMP_MB();
690  }
691 #endif /* BUILD_PARALLEL_ORDERED */
692 }
693 
694 /* __kmp_parallel_dxo -- Signal the next task. */
695 void __kmp_parallel_dxo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
696  int gtid = *gtid_ref;
697 #ifdef BUILD_PARALLEL_ORDERED
698  int tid = __kmp_tid_from_gtid(gtid);
699  kmp_team_t *team = __kmp_team_from_gtid(gtid);
700 #endif /* BUILD_PARALLEL_ORDERED */
701 
702  if (__kmp_env_consistency_check) {
703  if (__kmp_threads[gtid]->th.th_root->r.r_active)
704  __kmp_pop_sync(gtid, ct_ordered_in_parallel, loc_ref);
705  }
706 #ifdef BUILD_PARALLEL_ORDERED
707  if (!team->t.t_serialized) {
708  KMP_MB(); /* Flush all pending memory write invalidates. */
709 
710  /* use the tid of the next thread in this team */
711  /* TODO replace with general release procedure */
712  team->t.t_ordered.dt.t_value = ((tid + 1) % team->t.t_nproc);
713 
714  KMP_MB(); /* Flush all pending memory write invalidates. */
715  }
716 #endif /* BUILD_PARALLEL_ORDERED */
717 }
718 
719 /* ------------------------------------------------------------------------ */
720 /* The BARRIER for a SINGLE process section is always explicit */
721 
722 int __kmp_enter_single(int gtid, ident_t *id_ref, int push_ws) {
723  int status;
724  kmp_info_t *th;
725  kmp_team_t *team;
726 
727  if (!TCR_4(__kmp_init_parallel))
728  __kmp_parallel_initialize();
729  __kmp_resume_if_soft_paused();
730 
731  th = __kmp_threads[gtid];
732  team = th->th.th_team;
733  status = 0;
734 
735  th->th.th_ident = id_ref;
736 
737  if (team->t.t_serialized) {
738  status = 1;
739  } else {
740  kmp_int32 old_this = th->th.th_local.this_construct;
741 
742  ++th->th.th_local.this_construct;
743  /* try to set team count to thread count--success means thread got the
744  single block */
745  /* TODO: Should this be acquire or release? */
746  if (team->t.t_construct == old_this) {
747  status = __kmp_atomic_compare_store_acq(&team->t.t_construct, old_this,
748  th->th.th_local.this_construct);
749  }
750 #if USE_ITT_BUILD
751  if (__itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
752  KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL &&
753  team->t.t_active_level ==
754  1) { // Only report metadata by master of active team at level 1
755  __kmp_itt_metadata_single(id_ref);
756  }
757 #endif /* USE_ITT_BUILD */
758  }
759 
760  if (__kmp_env_consistency_check) {
761  if (status && push_ws) {
762  __kmp_push_workshare(gtid, ct_psingle, id_ref);
763  } else {
764  __kmp_check_workshare(gtid, ct_psingle, id_ref);
765  }
766  }
767 #if USE_ITT_BUILD
768  if (status) {
769  __kmp_itt_single_start(gtid);
770  }
771 #endif /* USE_ITT_BUILD */
772  return status;
773 }
774 
775 void __kmp_exit_single(int gtid) {
776 #if USE_ITT_BUILD
777  __kmp_itt_single_end(gtid);
778 #endif /* USE_ITT_BUILD */
779  if (__kmp_env_consistency_check)
780  __kmp_pop_workshare(gtid, ct_psingle, NULL);
781 }
782 
783 /* determine if we can go parallel or must use a serialized parallel region and
784  * how many threads we can use
785  * set_nproc is the number of threads requested for the team
786  * returns 0 if we should serialize or only use one thread,
787  * otherwise the number of threads to use
788  * The forkjoin lock is held by the caller. */
789 static int __kmp_reserve_threads(kmp_root_t *root, kmp_team_t *parent_team,
790  int master_tid, int set_nthreads,
791  int enter_teams) {
792  int capacity;
793  int new_nthreads;
794  KMP_DEBUG_ASSERT(__kmp_init_serial);
795  KMP_DEBUG_ASSERT(root && parent_team);
796  kmp_info_t *this_thr = parent_team->t.t_threads[master_tid];
797 
798  // If dyn-var is set, dynamically adjust the number of desired threads,
799  // according to the method specified by dynamic_mode.
800  new_nthreads = set_nthreads;
801  if (!get__dynamic_2(parent_team, master_tid)) {
802  ;
803  }
804 #ifdef USE_LOAD_BALANCE
805  else if (__kmp_global.g.g_dynamic_mode == dynamic_load_balance) {
806  new_nthreads = __kmp_load_balance_nproc(root, set_nthreads);
807  if (new_nthreads == 1) {
808  KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
809  "reservation to 1 thread\n",
810  master_tid));
811  return 1;
812  }
813  if (new_nthreads < set_nthreads) {
814  KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
815  "reservation to %d threads\n",
816  master_tid, new_nthreads));
817  }
818  }
819 #endif /* USE_LOAD_BALANCE */
820  else if (__kmp_global.g.g_dynamic_mode == dynamic_thread_limit) {
821  new_nthreads = __kmp_avail_proc - __kmp_nth +
822  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
823  if (new_nthreads <= 1) {
824  KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
825  "reservation to 1 thread\n",
826  master_tid));
827  return 1;
828  }
829  if (new_nthreads < set_nthreads) {
830  KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
831  "reservation to %d threads\n",
832  master_tid, new_nthreads));
833  } else {
834  new_nthreads = set_nthreads;
835  }
836  } else if (__kmp_global.g.g_dynamic_mode == dynamic_random) {
837  if (set_nthreads > 2) {
838  new_nthreads = __kmp_get_random(parent_team->t.t_threads[master_tid]);
839  new_nthreads = (new_nthreads % set_nthreads) + 1;
840  if (new_nthreads == 1) {
841  KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
842  "reservation to 1 thread\n",
843  master_tid));
844  return 1;
845  }
846  if (new_nthreads < set_nthreads) {
847  KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
848  "reservation to %d threads\n",
849  master_tid, new_nthreads));
850  }
851  }
852  } else {
853  KMP_ASSERT(0);
854  }
855 
856  // Respect KMP_ALL_THREADS/KMP_DEVICE_THREAD_LIMIT.
857  if (__kmp_nth + new_nthreads -
858  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
859  __kmp_max_nth) {
860  int tl_nthreads = __kmp_max_nth - __kmp_nth +
861  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
862  if (tl_nthreads <= 0) {
863  tl_nthreads = 1;
864  }
865 
866  // If dyn-var is false, emit a 1-time warning.
867  if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
868  __kmp_reserve_warn = 1;
869  __kmp_msg(kmp_ms_warning,
870  KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads),
871  KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
872  }
873  if (tl_nthreads == 1) {
874  KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT "
875  "reduced reservation to 1 thread\n",
876  master_tid));
877  return 1;
878  }
879  KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT reduced "
880  "reservation to %d threads\n",
881  master_tid, tl_nthreads));
882  new_nthreads = tl_nthreads;
883  }
884 
885  // Respect OMP_THREAD_LIMIT
886  int cg_nthreads = this_thr->th.th_cg_roots->cg_nthreads;
887  int max_cg_threads = this_thr->th.th_cg_roots->cg_thread_limit;
888  if (cg_nthreads + new_nthreads -
889  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
890  max_cg_threads) {
891  int tl_nthreads = max_cg_threads - cg_nthreads +
892  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
893  if (tl_nthreads <= 0) {
894  tl_nthreads = 1;
895  }
896 
897  // If dyn-var is false, emit a 1-time warning.
898  if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
899  __kmp_reserve_warn = 1;
900  __kmp_msg(kmp_ms_warning,
901  KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads),
902  KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
903  }
904  if (tl_nthreads == 1) {
905  KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT "
906  "reduced reservation to 1 thread\n",
907  master_tid));
908  return 1;
909  }
910  KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT reduced "
911  "reservation to %d threads\n",
912  master_tid, tl_nthreads));
913  new_nthreads = tl_nthreads;
914  }
915 
916  // Check if the threads array is large enough, or needs expanding.
917  // See comment in __kmp_register_root() about the adjustment if
918  // __kmp_threads[0] == NULL.
919  capacity = __kmp_threads_capacity;
920  if (TCR_PTR(__kmp_threads[0]) == NULL) {
921  --capacity;
922  }
923  // If it is not for initializing the hidden helper team, we need to take
924  // __kmp_hidden_helper_threads_num out of the capacity because it is included
925  // in __kmp_threads_capacity.
926  if (__kmp_enable_hidden_helper && !TCR_4(__kmp_init_hidden_helper_threads)) {
927  capacity -= __kmp_hidden_helper_threads_num;
928  }
929  if (__kmp_nth + new_nthreads -
930  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
931  capacity) {
932  // Expand the threads array.
933  int slotsRequired = __kmp_nth + new_nthreads -
934  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) -
935  capacity;
936  int slotsAdded = __kmp_expand_threads(slotsRequired);
937  if (slotsAdded < slotsRequired) {
938  // The threads array was not expanded enough.
939  new_nthreads -= (slotsRequired - slotsAdded);
940  KMP_ASSERT(new_nthreads >= 1);
941 
942  // If dyn-var is false, emit a 1-time warning.
943  if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
944  __kmp_reserve_warn = 1;
945  if (__kmp_tp_cached) {
946  __kmp_msg(kmp_ms_warning,
947  KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
948  KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity),
949  KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null);
950  } else {
951  __kmp_msg(kmp_ms_warning,
952  KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
953  KMP_HNT(SystemLimitOnThreads), __kmp_msg_null);
954  }
955  }
956  }
957  }
958 
959 #ifdef KMP_DEBUG
960  if (new_nthreads == 1) {
961  KC_TRACE(10,
962  ("__kmp_reserve_threads: T#%d serializing team after reclaiming "
963  "dead roots and rechecking; requested %d threads\n",
964  __kmp_get_gtid(), set_nthreads));
965  } else {
966  KC_TRACE(10, ("__kmp_reserve_threads: T#%d allocating %d threads; requested"
967  " %d threads\n",
968  __kmp_get_gtid(), new_nthreads, set_nthreads));
969  }
970 #endif // KMP_DEBUG
971  return new_nthreads;
972 }
973 
974 /* Allocate threads from the thread pool and assign them to the new team. We are
975  assured that there are enough threads available, because we checked on that
976  earlier within critical section forkjoin */
977 static void __kmp_fork_team_threads(kmp_root_t *root, kmp_team_t *team,
978  kmp_info_t *master_th, int master_gtid) {
979  int i;
980  int use_hot_team;
981 
982  KA_TRACE(10, ("__kmp_fork_team_threads: new_nprocs = %d\n", team->t.t_nproc));
983  KMP_DEBUG_ASSERT(master_gtid == __kmp_get_gtid());
984  KMP_MB();
985 
986  /* first, let's setup the master thread */
987  master_th->th.th_info.ds.ds_tid = 0;
988  master_th->th.th_team = team;
989  master_th->th.th_team_nproc = team->t.t_nproc;
990  master_th->th.th_team_master = master_th;
991  master_th->th.th_team_serialized = FALSE;
992  master_th->th.th_dispatch = &team->t.t_dispatch[0];
993 
994 /* make sure we are not the optimized hot team */
995 #if KMP_NESTED_HOT_TEAMS
996  use_hot_team = 0;
997  kmp_hot_team_ptr_t *hot_teams = master_th->th.th_hot_teams;
998  if (hot_teams) { // hot teams array is not allocated if
999  // KMP_HOT_TEAMS_MAX_LEVEL=0
1000  int level = team->t.t_active_level - 1; // index in array of hot teams
1001  if (master_th->th.th_teams_microtask) { // are we inside the teams?
1002  if (master_th->th.th_teams_size.nteams > 1) {
1003  ++level; // level was not increased in teams construct for
1004  // team_of_masters
1005  }
1006  if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
1007  master_th->th.th_teams_level == team->t.t_level) {
1008  ++level; // level was not increased in teams construct for
1009  // team_of_workers before the parallel
1010  } // team->t.t_level will be increased inside parallel
1011  }
1012  if (level < __kmp_hot_teams_max_level) {
1013  if (hot_teams[level].hot_team) {
1014  // hot team has already been allocated for given level
1015  KMP_DEBUG_ASSERT(hot_teams[level].hot_team == team);
1016  use_hot_team = 1; // the team is ready to use
1017  } else {
1018  use_hot_team = 0; // AC: threads are not allocated yet
1019  hot_teams[level].hot_team = team; // remember new hot team
1020  hot_teams[level].hot_team_nth = team->t.t_nproc;
1021  }
1022  } else {
1023  use_hot_team = 0;
1024  }
1025  }
1026 #else
1027  use_hot_team = team == root->r.r_hot_team;
1028 #endif
1029  if (!use_hot_team) {
1030 
1031  /* install the master thread */
1032  team->t.t_threads[0] = master_th;
1033  __kmp_initialize_info(master_th, team, 0, master_gtid);
1034 
1035  /* now, install the worker threads */
1036  for (i = 1; i < team->t.t_nproc; i++) {
1037 
1038  /* fork or reallocate a new thread and install it in team */
1039  kmp_info_t *thr = __kmp_allocate_thread(root, team, i);
1040  team->t.t_threads[i] = thr;
1041  KMP_DEBUG_ASSERT(thr);
1042  KMP_DEBUG_ASSERT(thr->th.th_team == team);
1043  /* align team and thread arrived states */
1044  KA_TRACE(20, ("__kmp_fork_team_threads: T#%d(%d:%d) init arrived "
1045  "T#%d(%d:%d) join =%llu, plain=%llu\n",
1046  __kmp_gtid_from_tid(0, team), team->t.t_id, 0,
1047  __kmp_gtid_from_tid(i, team), team->t.t_id, i,
1048  team->t.t_bar[bs_forkjoin_barrier].b_arrived,
1049  team->t.t_bar[bs_plain_barrier].b_arrived));
1050  thr->th.th_teams_microtask = master_th->th.th_teams_microtask;
1051  thr->th.th_teams_level = master_th->th.th_teams_level;
1052  thr->th.th_teams_size = master_th->th.th_teams_size;
1053  { // Initialize threads' barrier data.
1054  int b;
1055  kmp_balign_t *balign = team->t.t_threads[i]->th.th_bar;
1056  for (b = 0; b < bs_last_barrier; ++b) {
1057  balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
1058  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
1059 #if USE_DEBUGGER
1060  balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
1061 #endif
1062  }
1063  }
1064  }
1065 
1066 #if KMP_AFFINITY_SUPPORTED
1067  __kmp_partition_places(team);
1068 #endif
1069  }
1070 
1071  if (__kmp_display_affinity && team->t.t_display_affinity != 1) {
1072  for (i = 0; i < team->t.t_nproc; i++) {
1073  kmp_info_t *thr = team->t.t_threads[i];
1074  if (thr->th.th_prev_num_threads != team->t.t_nproc ||
1075  thr->th.th_prev_level != team->t.t_level) {
1076  team->t.t_display_affinity = 1;
1077  break;
1078  }
1079  }
1080  }
1081 
1082  KMP_MB();
1083 }
1084 
1085 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
1086 // Propagate any changes to the floating point control registers out to the team
1087 // We try to avoid unnecessary writes to the relevant cache line in the team
1088 // structure, so we don't make changes unless they are needed.
1089 inline static void propagateFPControl(kmp_team_t *team) {
1090  if (__kmp_inherit_fp_control) {
1091  kmp_int16 x87_fpu_control_word;
1092  kmp_uint32 mxcsr;
1093 
1094  // Get master values of FPU control flags (both X87 and vector)
1095  __kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
1096  __kmp_store_mxcsr(&mxcsr);
1097  mxcsr &= KMP_X86_MXCSR_MASK;
1098 
1099  // There is no point looking at t_fp_control_saved here.
1100  // If it is TRUE, we still have to update the values if they are different
1101  // from those we now have. If it is FALSE we didn't save anything yet, but
1102  // our objective is the same. We have to ensure that the values in the team
1103  // are the same as those we have.
1104  // So, this code achieves what we need whether or not t_fp_control_saved is
1105  // true. By checking whether the value needs updating we avoid unnecessary
1106  // writes that would put the cache-line into a written state, causing all
1107  // threads in the team to have to read it again.
1108  KMP_CHECK_UPDATE(team->t.t_x87_fpu_control_word, x87_fpu_control_word);
1109  KMP_CHECK_UPDATE(team->t.t_mxcsr, mxcsr);
1110  // Although we don't use this value, other code in the runtime wants to know
1111  // whether it should restore them. So we must ensure it is correct.
1112  KMP_CHECK_UPDATE(team->t.t_fp_control_saved, TRUE);
1113  } else {
1114  // Similarly here. Don't write to this cache-line in the team structure
1115  // unless we have to.
1116  KMP_CHECK_UPDATE(team->t.t_fp_control_saved, FALSE);
1117  }
1118 }
1119 
1120 // Do the opposite, setting the hardware registers to the updated values from
1121 // the team.
1122 inline static void updateHWFPControl(kmp_team_t *team) {
1123  if (__kmp_inherit_fp_control && team->t.t_fp_control_saved) {
1124  // Only reset the fp control regs if they have been changed in the team.
1125  // the parallel region that we are exiting.
1126  kmp_int16 x87_fpu_control_word;
1127  kmp_uint32 mxcsr;
1128  __kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
1129  __kmp_store_mxcsr(&mxcsr);
1130  mxcsr &= KMP_X86_MXCSR_MASK;
1131 
1132  if (team->t.t_x87_fpu_control_word != x87_fpu_control_word) {
1133  __kmp_clear_x87_fpu_status_word();
1134  __kmp_load_x87_fpu_control_word(&team->t.t_x87_fpu_control_word);
1135  }
1136 
1137  if (team->t.t_mxcsr != mxcsr) {
1138  __kmp_load_mxcsr(&team->t.t_mxcsr);
1139  }
1140  }
1141 }
1142 #else
1143 #define propagateFPControl(x) ((void)0)
1144 #define updateHWFPControl(x) ((void)0)
1145 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
1146 
1147 static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team,
1148  int realloc); // forward declaration
1149 
1150 /* Run a parallel region that has been serialized, so runs only in a team of the
1151  single master thread. */
1152 void __kmp_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
1153  kmp_info_t *this_thr;
1154  kmp_team_t *serial_team;
1155 
1156  KC_TRACE(10, ("__kmpc_serialized_parallel: called by T#%d\n", global_tid));
1157 
1158  /* Skip all this code for autopar serialized loops since it results in
1159  unacceptable overhead */
1160  if (loc != NULL && (loc->flags & KMP_IDENT_AUTOPAR))
1161  return;
1162 
1163  if (!TCR_4(__kmp_init_parallel))
1164  __kmp_parallel_initialize();
1165  __kmp_resume_if_soft_paused();
1166 
1167  this_thr = __kmp_threads[global_tid];
1168  serial_team = this_thr->th.th_serial_team;
1169 
1170  /* utilize the serialized team held by this thread */
1171  KMP_DEBUG_ASSERT(serial_team);
1172  KMP_MB();
1173 
1174  if (__kmp_tasking_mode != tskm_immediate_exec) {
1175  KMP_DEBUG_ASSERT(
1176  this_thr->th.th_task_team ==
1177  this_thr->th.th_team->t.t_task_team[this_thr->th.th_task_state]);
1178  KMP_DEBUG_ASSERT(serial_team->t.t_task_team[this_thr->th.th_task_state] ==
1179  NULL);
1180  KA_TRACE(20, ("__kmpc_serialized_parallel: T#%d pushing task_team %p / "
1181  "team %p, new task_team = NULL\n",
1182  global_tid, this_thr->th.th_task_team, this_thr->th.th_team));
1183  this_thr->th.th_task_team = NULL;
1184  }
1185 
1186  kmp_proc_bind_t proc_bind = this_thr->th.th_set_proc_bind;
1187  if (this_thr->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
1188  proc_bind = proc_bind_false;
1189  } else if (proc_bind == proc_bind_default) {
1190  // No proc_bind clause was specified, so use the current value
1191  // of proc-bind-var for this parallel region.
1192  proc_bind = this_thr->th.th_current_task->td_icvs.proc_bind;
1193  }
1194  // Reset for next parallel region
1195  this_thr->th.th_set_proc_bind = proc_bind_default;
1196 
1197 #if OMPT_SUPPORT
1198  ompt_data_t ompt_parallel_data = ompt_data_none;
1199  ompt_data_t *implicit_task_data;
1200  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(global_tid);
1201  if (ompt_enabled.enabled &&
1202  this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
1203 
1204  ompt_task_info_t *parent_task_info;
1205  parent_task_info = OMPT_CUR_TASK_INFO(this_thr);
1206 
1207  parent_task_info->frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1208  if (ompt_enabled.ompt_callback_parallel_begin) {
1209  int team_size = 1;
1210 
1211  ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)(
1212  &(parent_task_info->task_data), &(parent_task_info->frame),
1213  &ompt_parallel_data, team_size,
1214  ompt_parallel_invoker_program | ompt_parallel_team, codeptr);
1215  }
1216  }
1217 #endif // OMPT_SUPPORT
1218 
1219  if (this_thr->th.th_team != serial_team) {
1220  // Nested level will be an index in the nested nthreads array
1221  int level = this_thr->th.th_team->t.t_level;
1222 
1223  if (serial_team->t.t_serialized) {
1224  /* this serial team was already used
1225  TODO increase performance by making this locks more specific */
1226  kmp_team_t *new_team;
1227 
1228  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
1229 
1230  new_team =
1231  __kmp_allocate_team(this_thr->th.th_root, 1, 1,
1232 #if OMPT_SUPPORT
1233  ompt_parallel_data,
1234 #endif
1235  proc_bind, &this_thr->th.th_current_task->td_icvs,
1236  0 USE_NESTED_HOT_ARG(NULL));
1237  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
1238  KMP_ASSERT(new_team);
1239 
1240  /* setup new serialized team and install it */
1241  new_team->t.t_threads[0] = this_thr;
1242  new_team->t.t_parent = this_thr->th.th_team;
1243  serial_team = new_team;
1244  this_thr->th.th_serial_team = serial_team;
1245 
1246  KF_TRACE(
1247  10,
1248  ("__kmpc_serialized_parallel: T#%d allocated new serial team %p\n",
1249  global_tid, serial_team));
1250 
1251  /* TODO the above breaks the requirement that if we run out of resources,
1252  then we can still guarantee that serialized teams are ok, since we may
1253  need to allocate a new one */
1254  } else {
1255  KF_TRACE(
1256  10,
1257  ("__kmpc_serialized_parallel: T#%d reusing cached serial team %p\n",
1258  global_tid, serial_team));
1259  }
1260 
1261  /* we have to initialize this serial team */
1262  KMP_DEBUG_ASSERT(serial_team->t.t_threads);
1263  KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
1264  KMP_DEBUG_ASSERT(this_thr->th.th_team != serial_team);
1265  serial_team->t.t_ident = loc;
1266  serial_team->t.t_serialized = 1;
1267  serial_team->t.t_nproc = 1;
1268  serial_team->t.t_parent = this_thr->th.th_team;
1269  serial_team->t.t_sched.sched = this_thr->th.th_team->t.t_sched.sched;
1270  this_thr->th.th_team = serial_team;
1271  serial_team->t.t_master_tid = this_thr->th.th_info.ds.ds_tid;
1272 
1273  KF_TRACE(10, ("__kmpc_serialized_parallel: T#d curtask=%p\n", global_tid,
1274  this_thr->th.th_current_task));
1275  KMP_ASSERT(this_thr->th.th_current_task->td_flags.executing == 1);
1276  this_thr->th.th_current_task->td_flags.executing = 0;
1277 
1278  __kmp_push_current_task_to_thread(this_thr, serial_team, 0);
1279 
1280  /* TODO: GEH: do ICVs work for nested serialized teams? Don't we need an
1281  implicit task for each serialized task represented by
1282  team->t.t_serialized? */
1283  copy_icvs(&this_thr->th.th_current_task->td_icvs,
1284  &this_thr->th.th_current_task->td_parent->td_icvs);
1285 
1286  // Thread value exists in the nested nthreads array for the next nested
1287  // level
1288  if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) {
1289  this_thr->th.th_current_task->td_icvs.nproc =
1290  __kmp_nested_nth.nth[level + 1];
1291  }
1292 
1293  if (__kmp_nested_proc_bind.used &&
1294  (level + 1 < __kmp_nested_proc_bind.used)) {
1295  this_thr->th.th_current_task->td_icvs.proc_bind =
1296  __kmp_nested_proc_bind.bind_types[level + 1];
1297  }
1298 
1299 #if USE_DEBUGGER
1300  serial_team->t.t_pkfn = (microtask_t)(~0); // For the debugger.
1301 #endif
1302  this_thr->th.th_info.ds.ds_tid = 0;
1303 
1304  /* set thread cache values */
1305  this_thr->th.th_team_nproc = 1;
1306  this_thr->th.th_team_master = this_thr;
1307  this_thr->th.th_team_serialized = 1;
1308 
1309  serial_team->t.t_level = serial_team->t.t_parent->t.t_level + 1;
1310  serial_team->t.t_active_level = serial_team->t.t_parent->t.t_active_level;
1311  serial_team->t.t_def_allocator = this_thr->th.th_def_allocator; // save
1312 
1313  propagateFPControl(serial_team);
1314 
1315  /* check if we need to allocate dispatch buffers stack */
1316  KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
1317  if (!serial_team->t.t_dispatch->th_disp_buffer) {
1318  serial_team->t.t_dispatch->th_disp_buffer =
1319  (dispatch_private_info_t *)__kmp_allocate(
1320  sizeof(dispatch_private_info_t));
1321  }
1322  this_thr->th.th_dispatch = serial_team->t.t_dispatch;
1323 
1324  KMP_MB();
1325 
1326  } else {
1327  /* this serialized team is already being used,
1328  * that's fine, just add another nested level */
1329  KMP_DEBUG_ASSERT(this_thr->th.th_team == serial_team);
1330  KMP_DEBUG_ASSERT(serial_team->t.t_threads);
1331  KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
1332  ++serial_team->t.t_serialized;
1333  this_thr->th.th_team_serialized = serial_team->t.t_serialized;
1334 
1335  // Nested level will be an index in the nested nthreads array
1336  int level = this_thr->th.th_team->t.t_level;
1337  // Thread value exists in the nested nthreads array for the next nested
1338  // level
1339  if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) {
1340  this_thr->th.th_current_task->td_icvs.nproc =
1341  __kmp_nested_nth.nth[level + 1];
1342  }
1343  serial_team->t.t_level++;
1344  KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d increasing nesting level "
1345  "of serial team %p to %d\n",
1346  global_tid, serial_team, serial_team->t.t_level));
1347 
1348  /* allocate/push dispatch buffers stack */
1349  KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
1350  {
1351  dispatch_private_info_t *disp_buffer =
1352  (dispatch_private_info_t *)__kmp_allocate(
1353  sizeof(dispatch_private_info_t));
1354  disp_buffer->next = serial_team->t.t_dispatch->th_disp_buffer;
1355  serial_team->t.t_dispatch->th_disp_buffer = disp_buffer;
1356  }
1357  this_thr->th.th_dispatch = serial_team->t.t_dispatch;
1358 
1359  KMP_MB();
1360  }
1361  KMP_CHECK_UPDATE(serial_team->t.t_cancel_request, cancel_noreq);
1362 
1363  // Perform the display affinity functionality for
1364  // serialized parallel regions
1365  if (__kmp_display_affinity) {
1366  if (this_thr->th.th_prev_level != serial_team->t.t_level ||
1367  this_thr->th.th_prev_num_threads != 1) {
1368  // NULL means use the affinity-format-var ICV
1369  __kmp_aux_display_affinity(global_tid, NULL);
1370  this_thr->th.th_prev_level = serial_team->t.t_level;
1371  this_thr->th.th_prev_num_threads = 1;
1372  }
1373  }
1374 
1375  if (__kmp_env_consistency_check)
1376  __kmp_push_parallel(global_tid, NULL);
1377 #if OMPT_SUPPORT
1378  serial_team->t.ompt_team_info.master_return_address = codeptr;
1379  if (ompt_enabled.enabled &&
1380  this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
1381  OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1382 
1383  ompt_lw_taskteam_t lw_taskteam;
1384  __ompt_lw_taskteam_init(&lw_taskteam, this_thr, global_tid,
1385  &ompt_parallel_data, codeptr);
1386 
1387  __ompt_lw_taskteam_link(&lw_taskteam, this_thr, 1);
1388  // don't use lw_taskteam after linking. content was swaped
1389 
1390  /* OMPT implicit task begin */
1391  implicit_task_data = OMPT_CUR_TASK_DATA(this_thr);
1392  if (ompt_enabled.ompt_callback_implicit_task) {
1393  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1394  ompt_scope_begin, OMPT_CUR_TEAM_DATA(this_thr),
1395  OMPT_CUR_TASK_DATA(this_thr), 1, __kmp_tid_from_gtid(global_tid), ompt_task_implicit); // TODO: Can this be ompt_task_initial?
1396  OMPT_CUR_TASK_INFO(this_thr)
1397  ->thread_num = __kmp_tid_from_gtid(global_tid);
1398  }
1399 
1400  /* OMPT state */
1401  this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
1402  OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1403  }
1404 #endif
1405 }
1406 
1407 /* most of the work for a fork */
1408 /* return true if we really went parallel, false if serialized */
1409 int __kmp_fork_call(ident_t *loc, int gtid,
1410  enum fork_context_e call_context, // Intel, GNU, ...
1411  kmp_int32 argc, microtask_t microtask, launch_t invoker,
1412  kmp_va_list ap) {
1413  void **argv;
1414  int i;
1415  int master_tid;
1416  int master_this_cons;
1417  kmp_team_t *team;
1418  kmp_team_t *parent_team;
1419  kmp_info_t *master_th;
1420  kmp_root_t *root;
1421  int nthreads;
1422  int master_active;
1423  int master_set_numthreads;
1424  int level;
1425  int active_level;
1426  int teams_level;
1427 #if KMP_NESTED_HOT_TEAMS
1428  kmp_hot_team_ptr_t **p_hot_teams;
1429 #endif
1430  { // KMP_TIME_BLOCK
1431  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_fork_call);
1432  KMP_COUNT_VALUE(OMP_PARALLEL_args, argc);
1433 
1434  KA_TRACE(20, ("__kmp_fork_call: enter T#%d\n", gtid));
1435  if (__kmp_stkpadding > 0 && __kmp_root[gtid] != NULL) {
1436  /* Some systems prefer the stack for the root thread(s) to start with */
1437  /* some gap from the parent stack to prevent false sharing. */
1438  void *dummy = KMP_ALLOCA(__kmp_stkpadding);
1439  /* These 2 lines below are so this does not get optimized out */
1440  if (__kmp_stkpadding > KMP_MAX_STKPADDING)
1441  __kmp_stkpadding += (short)((kmp_int64)dummy);
1442  }
1443 
1444  /* initialize if needed */
1445  KMP_DEBUG_ASSERT(
1446  __kmp_init_serial); // AC: potentially unsafe, not in sync with shutdown
1447  if (!TCR_4(__kmp_init_parallel))
1448  __kmp_parallel_initialize();
1449  __kmp_resume_if_soft_paused();
1450 
1451  /* setup current data */
1452  master_th = __kmp_threads[gtid]; // AC: potentially unsafe, not in sync with
1453  // shutdown
1454  parent_team = master_th->th.th_team;
1455  master_tid = master_th->th.th_info.ds.ds_tid;
1456  master_this_cons = master_th->th.th_local.this_construct;
1457  root = master_th->th.th_root;
1458  master_active = root->r.r_active;
1459  master_set_numthreads = master_th->th.th_set_nproc;
1460 
1461 #if OMPT_SUPPORT
1462  ompt_data_t ompt_parallel_data = ompt_data_none;
1463  ompt_data_t *parent_task_data;
1464  ompt_frame_t *ompt_frame;
1465  ompt_data_t *implicit_task_data;
1466  void *return_address = NULL;
1467 
1468  if (ompt_enabled.enabled) {
1469  __ompt_get_task_info_internal(0, NULL, &parent_task_data, &ompt_frame,
1470  NULL, NULL);
1471  return_address = OMPT_LOAD_RETURN_ADDRESS(gtid);
1472  }
1473 #endif
1474 
1475  // Nested level will be an index in the nested nthreads array
1476  level = parent_team->t.t_level;
1477  // used to launch non-serial teams even if nested is not allowed
1478  active_level = parent_team->t.t_active_level;
1479  // needed to check nesting inside the teams
1480  teams_level = master_th->th.th_teams_level;
1481 #if KMP_NESTED_HOT_TEAMS
1482  p_hot_teams = &master_th->th.th_hot_teams;
1483  if (*p_hot_teams == NULL && __kmp_hot_teams_max_level > 0) {
1484  *p_hot_teams = (kmp_hot_team_ptr_t *)__kmp_allocate(
1485  sizeof(kmp_hot_team_ptr_t) * __kmp_hot_teams_max_level);
1486  (*p_hot_teams)[0].hot_team = root->r.r_hot_team;
1487  // it is either actual or not needed (when active_level > 0)
1488  (*p_hot_teams)[0].hot_team_nth = 1;
1489  }
1490 #endif
1491 
1492 #if OMPT_SUPPORT
1493  if (ompt_enabled.enabled) {
1494  if (ompt_enabled.ompt_callback_parallel_begin) {
1495  int team_size = master_set_numthreads
1496  ? master_set_numthreads
1497  : get__nproc_2(parent_team, master_tid);
1498  int flags = OMPT_INVOKER(call_context) |
1499  ((microtask == (microtask_t)__kmp_teams_master)
1500  ? ompt_parallel_league
1501  : ompt_parallel_team);
1502  ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)(
1503  parent_task_data, ompt_frame, &ompt_parallel_data, team_size, flags,
1504  return_address);
1505  }
1506  master_th->th.ompt_thread_info.state = ompt_state_overhead;
1507  }
1508 #endif
1509 
1510  master_th->th.th_ident = loc;
1511 
1512  if (master_th->th.th_teams_microtask && ap &&
1513  microtask != (microtask_t)__kmp_teams_master && level == teams_level) {
1514  // AC: This is start of parallel that is nested inside teams construct.
1515  // The team is actual (hot), all workers are ready at the fork barrier.
1516  // No lock needed to initialize the team a bit, then free workers.
1517  parent_team->t.t_ident = loc;
1518  __kmp_alloc_argv_entries(argc, parent_team, TRUE);
1519  parent_team->t.t_argc = argc;
1520  argv = (void **)parent_team->t.t_argv;
1521  for (i = argc - 1; i >= 0; --i)
1522  *argv++ = va_arg(kmp_va_deref(ap), void *);
1523  // Increment our nested depth levels, but not increase the serialization
1524  if (parent_team == master_th->th.th_serial_team) {
1525  // AC: we are in serialized parallel
1526  __kmpc_serialized_parallel(loc, gtid);
1527  KMP_DEBUG_ASSERT(parent_team->t.t_serialized > 1);
1528 
1529  if (call_context == fork_context_gnu) {
1530  // AC: need to decrement t_serialized for enquiry functions to work
1531  // correctly, will restore at join time
1532  parent_team->t.t_serialized--;
1533  return TRUE;
1534  }
1535 
1536 #if OMPT_SUPPORT
1537  void *dummy;
1538  void **exit_frame_p;
1539 
1540  ompt_lw_taskteam_t lw_taskteam;
1541 
1542  if (ompt_enabled.enabled) {
1543  __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1544  &ompt_parallel_data, return_address);
1545  exit_frame_p = &(lw_taskteam.ompt_task_info.frame.exit_frame.ptr);
1546 
1547  __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1548  // don't use lw_taskteam after linking. content was swaped
1549 
1550  /* OMPT implicit task begin */
1551  implicit_task_data = OMPT_CUR_TASK_DATA(master_th);
1552  if (ompt_enabled.ompt_callback_implicit_task) {
1553  OMPT_CUR_TASK_INFO(master_th)
1554  ->thread_num = __kmp_tid_from_gtid(gtid);
1555  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1556  ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1557  implicit_task_data, 1,
1558  OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1559  }
1560 
1561  /* OMPT state */
1562  master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1563  } else {
1564  exit_frame_p = &dummy;
1565  }
1566 #endif
1567  // AC: need to decrement t_serialized for enquiry functions to work
1568  // correctly, will restore at join time
1569  parent_team->t.t_serialized--;
1570 
1571  {
1572  KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1573  KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1574  __kmp_invoke_microtask(microtask, gtid, 0, argc, parent_team->t.t_argv
1575 #if OMPT_SUPPORT
1576  ,
1577  exit_frame_p
1578 #endif
1579  );
1580  }
1581 
1582 #if OMPT_SUPPORT
1583  if (ompt_enabled.enabled) {
1584  *exit_frame_p = NULL;
1585  OMPT_CUR_TASK_INFO(master_th)->frame.exit_frame = ompt_data_none;
1586  if (ompt_enabled.ompt_callback_implicit_task) {
1587  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1588  ompt_scope_end, NULL, implicit_task_data, 1,
1589  OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1590  }
1591  ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1592  __ompt_lw_taskteam_unlink(master_th);
1593  if (ompt_enabled.ompt_callback_parallel_end) {
1594  ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1595  &ompt_parallel_data, OMPT_CUR_TASK_DATA(master_th),
1596  OMPT_INVOKER(call_context) | ompt_parallel_team,
1597  return_address);
1598  }
1599  master_th->th.ompt_thread_info.state = ompt_state_overhead;
1600  }
1601 #endif
1602  return TRUE;
1603  }
1604 
1605  parent_team->t.t_pkfn = microtask;
1606  parent_team->t.t_invoke = invoker;
1607  KMP_ATOMIC_INC(&root->r.r_in_parallel);
1608  parent_team->t.t_active_level++;
1609  parent_team->t.t_level++;
1610  parent_team->t.t_def_allocator = master_th->th.th_def_allocator; // save
1611 
1612 #if OMPT_SUPPORT
1613  if (ompt_enabled.enabled) {
1614  ompt_lw_taskteam_t lw_taskteam;
1615  __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1616  &ompt_parallel_data, return_address);
1617  __ompt_lw_taskteam_link(&lw_taskteam, master_th, 1, true);
1618  }
1619 #endif
1620 
1621  /* Change number of threads in the team if requested */
1622  if (master_set_numthreads) { // The parallel has num_threads clause
1623  if (master_set_numthreads < master_th->th.th_teams_size.nth) {
1624  // AC: only can reduce number of threads dynamically, can't increase
1625  kmp_info_t **other_threads = parent_team->t.t_threads;
1626  parent_team->t.t_nproc = master_set_numthreads;
1627  for (i = 0; i < master_set_numthreads; ++i) {
1628  other_threads[i]->th.th_team_nproc = master_set_numthreads;
1629  }
1630  // Keep extra threads hot in the team for possible next parallels
1631  }
1632  master_th->th.th_set_nproc = 0;
1633  }
1634 
1635 #if USE_DEBUGGER
1636  if (__kmp_debugging) { // Let debugger override number of threads.
1637  int nth = __kmp_omp_num_threads(loc);
1638  if (nth > 0) { // 0 means debugger doesn't want to change num threads
1639  master_set_numthreads = nth;
1640  }
1641  }
1642 #endif
1643 
1644 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1645  if (((__itt_frame_submit_v3_ptr && __itt_get_timestamp_ptr) ||
1646  KMP_ITT_DEBUG) &&
1647  __kmp_forkjoin_frames_mode == 3 &&
1648  parent_team->t.t_active_level == 1 // only report frames at level 1
1649  && master_th->th.th_teams_size.nteams == 1) {
1650  kmp_uint64 tmp_time = __itt_get_timestamp();
1651  master_th->th.th_frame_time = tmp_time;
1652  parent_team->t.t_region_time = tmp_time;
1653  }
1654  if (__itt_stack_caller_create_ptr) {
1655  // create new stack stitching id before entering fork barrier
1656  parent_team->t.t_stack_id = __kmp_itt_stack_caller_create();
1657  }
1658 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
1659 
1660  KF_TRACE(10, ("__kmp_fork_call: before internal fork: root=%p, team=%p, "
1661  "master_th=%p, gtid=%d\n",
1662  root, parent_team, master_th, gtid));
1663  __kmp_internal_fork(loc, gtid, parent_team);
1664  KF_TRACE(10, ("__kmp_fork_call: after internal fork: root=%p, team=%p, "
1665  "master_th=%p, gtid=%d\n",
1666  root, parent_team, master_th, gtid));
1667 
1668  if (call_context == fork_context_gnu)
1669  return TRUE;
1670 
1671  /* Invoke microtask for MASTER thread */
1672  KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid,
1673  parent_team->t.t_id, parent_team->t.t_pkfn));
1674 
1675  if (!parent_team->t.t_invoke(gtid)) {
1676  KMP_ASSERT2(0, "cannot invoke microtask for MASTER thread");
1677  }
1678  KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid,
1679  parent_team->t.t_id, parent_team->t.t_pkfn));
1680  KMP_MB(); /* Flush all pending memory write invalidates. */
1681 
1682  KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
1683 
1684  return TRUE;
1685  } // Parallel closely nested in teams construct
1686 
1687 #if KMP_DEBUG
1688  if (__kmp_tasking_mode != tskm_immediate_exec) {
1689  KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
1690  parent_team->t.t_task_team[master_th->th.th_task_state]);
1691  }
1692 #endif
1693 
1694  if (parent_team->t.t_active_level >=
1695  master_th->th.th_current_task->td_icvs.max_active_levels) {
1696  nthreads = 1;
1697  } else {
1698  int enter_teams = ((ap == NULL && active_level == 0) ||
1699  (ap && teams_level > 0 && teams_level == level));
1700  nthreads =
1701  master_set_numthreads
1702  ? master_set_numthreads
1703  : get__nproc_2(
1704  parent_team,
1705  master_tid); // TODO: get nproc directly from current task
1706 
1707  // Check if we need to take forkjoin lock? (no need for serialized
1708  // parallel out of teams construct). This code moved here from
1709  // __kmp_reserve_threads() to speedup nested serialized parallels.
1710  if (nthreads > 1) {
1711  if ((get__max_active_levels(master_th) == 1 &&
1712  (root->r.r_in_parallel && !enter_teams)) ||
1713  (__kmp_library == library_serial)) {
1714  KC_TRACE(10, ("__kmp_fork_call: T#%d serializing team; requested %d"
1715  " threads\n",
1716  gtid, nthreads));
1717  nthreads = 1;
1718  }
1719  }
1720  if (nthreads > 1) {
1721  /* determine how many new threads we can use */
1722  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
1723  /* AC: If we execute teams from parallel region (on host), then teams
1724  should be created but each can only have 1 thread if nesting is
1725  disabled. If teams called from serial region, then teams and their
1726  threads should be created regardless of the nesting setting. */
1727  nthreads = __kmp_reserve_threads(root, parent_team, master_tid,
1728  nthreads, enter_teams);
1729  if (nthreads == 1) {
1730  // Free lock for single thread execution here; for multi-thread
1731  // execution it will be freed later after team of threads created
1732  // and initialized
1733  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
1734  }
1735  }
1736  }
1737  KMP_DEBUG_ASSERT(nthreads > 0);
1738 
1739  // If we temporarily changed the set number of threads then restore it now
1740  master_th->th.th_set_nproc = 0;
1741 
1742  /* create a serialized parallel region? */
1743  if (nthreads == 1) {
1744 /* josh todo: hypothetical question: what do we do for OS X*? */
1745 #if KMP_OS_LINUX && \
1746  (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
1747  void *args[argc];
1748 #else
1749  void **args = (void **)KMP_ALLOCA(argc * sizeof(void *));
1750 #endif /* KMP_OS_LINUX && ( KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || \
1751  KMP_ARCH_AARCH64) */
1752 
1753  KA_TRACE(20,
1754  ("__kmp_fork_call: T#%d serializing parallel region\n", gtid));
1755 
1756  __kmpc_serialized_parallel(loc, gtid);
1757 
1758  if (call_context == fork_context_intel) {
1759  /* TODO this sucks, use the compiler itself to pass args! :) */
1760  master_th->th.th_serial_team->t.t_ident = loc;
1761  if (!ap) {
1762  // revert change made in __kmpc_serialized_parallel()
1763  master_th->th.th_serial_team->t.t_level--;
1764 // Get args from parent team for teams construct
1765 
1766 #if OMPT_SUPPORT
1767  void *dummy;
1768  void **exit_frame_p;
1769  ompt_task_info_t *task_info;
1770 
1771  ompt_lw_taskteam_t lw_taskteam;
1772 
1773  if (ompt_enabled.enabled) {
1774  __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1775  &ompt_parallel_data, return_address);
1776 
1777  __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1778  // don't use lw_taskteam after linking. content was swaped
1779 
1780  task_info = OMPT_CUR_TASK_INFO(master_th);
1781  exit_frame_p = &(task_info->frame.exit_frame.ptr);
1782  if (ompt_enabled.ompt_callback_implicit_task) {
1783  OMPT_CUR_TASK_INFO(master_th)
1784  ->thread_num = __kmp_tid_from_gtid(gtid);
1785  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1786  ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1787  &(task_info->task_data), 1,
1788  OMPT_CUR_TASK_INFO(master_th)->thread_num,
1789  ompt_task_implicit);
1790  }
1791 
1792  /* OMPT state */
1793  master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1794  } else {
1795  exit_frame_p = &dummy;
1796  }
1797 #endif
1798 
1799  {
1800  KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1801  KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1802  __kmp_invoke_microtask(microtask, gtid, 0, argc,
1803  parent_team->t.t_argv
1804 #if OMPT_SUPPORT
1805  ,
1806  exit_frame_p
1807 #endif
1808  );
1809  }
1810 
1811 #if OMPT_SUPPORT
1812  if (ompt_enabled.enabled) {
1813  *exit_frame_p = NULL;
1814  if (ompt_enabled.ompt_callback_implicit_task) {
1815  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1816  ompt_scope_end, NULL, &(task_info->task_data), 1,
1817  OMPT_CUR_TASK_INFO(master_th)->thread_num,
1818  ompt_task_implicit);
1819  }
1820  ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1821  __ompt_lw_taskteam_unlink(master_th);
1822  if (ompt_enabled.ompt_callback_parallel_end) {
1823  ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1824  &ompt_parallel_data, parent_task_data,
1825  OMPT_INVOKER(call_context) | ompt_parallel_team,
1826  return_address);
1827  }
1828  master_th->th.ompt_thread_info.state = ompt_state_overhead;
1829  }
1830 #endif
1831  } else if (microtask == (microtask_t)__kmp_teams_master) {
1832  KMP_DEBUG_ASSERT(master_th->th.th_team ==
1833  master_th->th.th_serial_team);
1834  team = master_th->th.th_team;
1835  // team->t.t_pkfn = microtask;
1836  team->t.t_invoke = invoker;
1837  __kmp_alloc_argv_entries(argc, team, TRUE);
1838  team->t.t_argc = argc;
1839  argv = (void **)team->t.t_argv;
1840  if (ap) {
1841  for (i = argc - 1; i >= 0; --i)
1842  *argv++ = va_arg(kmp_va_deref(ap), void *);
1843  } else {
1844  for (i = 0; i < argc; ++i)
1845  // Get args from parent team for teams construct
1846  argv[i] = parent_team->t.t_argv[i];
1847  }
1848  // AC: revert change made in __kmpc_serialized_parallel()
1849  // because initial code in teams should have level=0
1850  team->t.t_level--;
1851  // AC: call special invoker for outer "parallel" of teams construct
1852  invoker(gtid);
1853 #if OMPT_SUPPORT
1854  if (ompt_enabled.enabled) {
1855  ompt_task_info_t *task_info = OMPT_CUR_TASK_INFO(master_th);
1856  if (ompt_enabled.ompt_callback_implicit_task) {
1857  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1858  ompt_scope_end, NULL, &(task_info->task_data), 0,
1859  OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_initial);
1860  }
1861  if (ompt_enabled.ompt_callback_parallel_end) {
1862  ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1863  &ompt_parallel_data, parent_task_data,
1864  OMPT_INVOKER(call_context) | ompt_parallel_league,
1865  return_address);
1866  }
1867  master_th->th.ompt_thread_info.state = ompt_state_overhead;
1868  }
1869 #endif
1870  } else {
1871  argv = args;
1872  for (i = argc - 1; i >= 0; --i)
1873  *argv++ = va_arg(kmp_va_deref(ap), void *);
1874  KMP_MB();
1875 
1876 #if OMPT_SUPPORT
1877  void *dummy;
1878  void **exit_frame_p;
1879  ompt_task_info_t *task_info;
1880 
1881  ompt_lw_taskteam_t lw_taskteam;
1882 
1883  if (ompt_enabled.enabled) {
1884  __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1885  &ompt_parallel_data, return_address);
1886  __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1887  // don't use lw_taskteam after linking. content was swaped
1888  task_info = OMPT_CUR_TASK_INFO(master_th);
1889  exit_frame_p = &(task_info->frame.exit_frame.ptr);
1890 
1891  /* OMPT implicit task begin */
1892  implicit_task_data = OMPT_CUR_TASK_DATA(master_th);
1893  if (ompt_enabled.ompt_callback_implicit_task) {
1894  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1895  ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1896  implicit_task_data, 1, __kmp_tid_from_gtid(gtid),
1897  ompt_task_implicit);
1898  OMPT_CUR_TASK_INFO(master_th)
1899  ->thread_num = __kmp_tid_from_gtid(gtid);
1900  }
1901 
1902  /* OMPT state */
1903  master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1904  } else {
1905  exit_frame_p = &dummy;
1906  }
1907 #endif
1908 
1909  {
1910  KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1911  KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1912  __kmp_invoke_microtask(microtask, gtid, 0, argc, args
1913 #if OMPT_SUPPORT
1914  ,
1915  exit_frame_p
1916 #endif
1917  );
1918  }
1919 
1920 #if OMPT_SUPPORT
1921  if (ompt_enabled.enabled) {
1922  *exit_frame_p = NULL;
1923  if (ompt_enabled.ompt_callback_implicit_task) {
1924  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1925  ompt_scope_end, NULL, &(task_info->task_data), 1,
1926  OMPT_CUR_TASK_INFO(master_th)->thread_num,
1927  ompt_task_implicit);
1928  }
1929 
1930  ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1931  __ompt_lw_taskteam_unlink(master_th);
1932  if (ompt_enabled.ompt_callback_parallel_end) {
1933  ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1934  &ompt_parallel_data, parent_task_data,
1935  OMPT_INVOKER(call_context) | ompt_parallel_team,
1936  return_address);
1937  }
1938  master_th->th.ompt_thread_info.state = ompt_state_overhead;
1939  }
1940 #endif
1941  }
1942  } else if (call_context == fork_context_gnu) {
1943 #if OMPT_SUPPORT
1944  ompt_lw_taskteam_t lwt;
1945  __ompt_lw_taskteam_init(&lwt, master_th, gtid, &ompt_parallel_data,
1946  return_address);
1947 
1948  lwt.ompt_task_info.frame.exit_frame = ompt_data_none;
1949  __ompt_lw_taskteam_link(&lwt, master_th, 1);
1950 // don't use lw_taskteam after linking. content was swaped
1951 #endif
1952 
1953  // we were called from GNU native code
1954  KA_TRACE(20, ("__kmp_fork_call: T#%d serial exit\n", gtid));
1955  return FALSE;
1956  } else {
1957  KMP_ASSERT2(call_context < fork_context_last,
1958  "__kmp_fork_call: unknown fork_context parameter");
1959  }
1960 
1961  KA_TRACE(20, ("__kmp_fork_call: T#%d serial exit\n", gtid));
1962  KMP_MB();
1963  return FALSE;
1964  } // if (nthreads == 1)
1965 
1966  // GEH: only modify the executing flag in the case when not serialized
1967  // serialized case is handled in kmpc_serialized_parallel
1968  KF_TRACE(10, ("__kmp_fork_call: parent_team_aclevel=%d, master_th=%p, "
1969  "curtask=%p, curtask_max_aclevel=%d\n",
1970  parent_team->t.t_active_level, master_th,
1971  master_th->th.th_current_task,
1972  master_th->th.th_current_task->td_icvs.max_active_levels));
1973  // TODO: GEH - cannot do this assertion because root thread not set up as
1974  // executing
1975  // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 1 );
1976  master_th->th.th_current_task->td_flags.executing = 0;
1977 
1978  if (!master_th->th.th_teams_microtask || level > teams_level) {
1979  /* Increment our nested depth level */
1980  KMP_ATOMIC_INC(&root->r.r_in_parallel);
1981  }
1982 
1983  // See if we need to make a copy of the ICVs.
1984  int nthreads_icv = master_th->th.th_current_task->td_icvs.nproc;
1985  if ((level + 1 < __kmp_nested_nth.used) &&
1986  (__kmp_nested_nth.nth[level + 1] != nthreads_icv)) {
1987  nthreads_icv = __kmp_nested_nth.nth[level + 1];
1988  } else {
1989  nthreads_icv = 0; // don't update
1990  }
1991 
1992  // Figure out the proc_bind_policy for the new team.
1993  kmp_proc_bind_t proc_bind = master_th->th.th_set_proc_bind;
1994  kmp_proc_bind_t proc_bind_icv =
1995  proc_bind_default; // proc_bind_default means don't update
1996  if (master_th->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
1997  proc_bind = proc_bind_false;
1998  } else {
1999  if (proc_bind == proc_bind_default) {
2000  // No proc_bind clause specified; use current proc-bind-var for this
2001  // parallel region
2002  proc_bind = master_th->th.th_current_task->td_icvs.proc_bind;
2003  }
2004  /* else: The proc_bind policy was specified explicitly on parallel clause.
2005  This overrides proc-bind-var for this parallel region, but does not
2006  change proc-bind-var. */
2007  // Figure the value of proc-bind-var for the child threads.
2008  if ((level + 1 < __kmp_nested_proc_bind.used) &&
2009  (__kmp_nested_proc_bind.bind_types[level + 1] !=
2010  master_th->th.th_current_task->td_icvs.proc_bind)) {
2011  proc_bind_icv = __kmp_nested_proc_bind.bind_types[level + 1];
2012  }
2013  }
2014 
2015  // Reset for next parallel region
2016  master_th->th.th_set_proc_bind = proc_bind_default;
2017 
2018  if ((nthreads_icv > 0) || (proc_bind_icv != proc_bind_default)) {
2019  kmp_internal_control_t new_icvs;
2020  copy_icvs(&new_icvs, &master_th->th.th_current_task->td_icvs);
2021  new_icvs.next = NULL;
2022  if (nthreads_icv > 0) {
2023  new_icvs.nproc = nthreads_icv;
2024  }
2025  if (proc_bind_icv != proc_bind_default) {
2026  new_icvs.proc_bind = proc_bind_icv;
2027  }
2028 
2029  /* allocate a new parallel team */
2030  KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n"));
2031  team = __kmp_allocate_team(root, nthreads, nthreads,
2032 #if OMPT_SUPPORT
2033  ompt_parallel_data,
2034 #endif
2035  proc_bind, &new_icvs,
2036  argc USE_NESTED_HOT_ARG(master_th));
2037  } else {
2038  /* allocate a new parallel team */
2039  KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n"));
2040  team = __kmp_allocate_team(root, nthreads, nthreads,
2041 #if OMPT_SUPPORT
2042  ompt_parallel_data,
2043 #endif
2044  proc_bind,
2045  &master_th->th.th_current_task->td_icvs,
2046  argc USE_NESTED_HOT_ARG(master_th));
2047  }
2048  KF_TRACE(
2049  10, ("__kmp_fork_call: after __kmp_allocate_team - team = %p\n", team));
2050 
2051  /* setup the new team */
2052  KMP_CHECK_UPDATE(team->t.t_master_tid, master_tid);
2053  KMP_CHECK_UPDATE(team->t.t_master_this_cons, master_this_cons);
2054  KMP_CHECK_UPDATE(team->t.t_ident, loc);
2055  KMP_CHECK_UPDATE(team->t.t_parent, parent_team);
2056  KMP_CHECK_UPDATE_SYNC(team->t.t_pkfn, microtask);
2057 #if OMPT_SUPPORT
2058  KMP_CHECK_UPDATE_SYNC(team->t.ompt_team_info.master_return_address,
2059  return_address);
2060 #endif
2061  KMP_CHECK_UPDATE(team->t.t_invoke, invoker); // TODO move to root, maybe
2062  // TODO: parent_team->t.t_level == INT_MAX ???
2063  if (!master_th->th.th_teams_microtask || level > teams_level) {
2064  int new_level = parent_team->t.t_level + 1;
2065  KMP_CHECK_UPDATE(team->t.t_level, new_level);
2066  new_level = parent_team->t.t_active_level + 1;
2067  KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
2068  } else {
2069  // AC: Do not increase parallel level at start of the teams construct
2070  int new_level = parent_team->t.t_level;
2071  KMP_CHECK_UPDATE(team->t.t_level, new_level);
2072  new_level = parent_team->t.t_active_level;
2073  KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
2074  }
2075  kmp_r_sched_t new_sched = get__sched_2(parent_team, master_tid);
2076  // set master's schedule as new run-time schedule
2077  KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched);
2078 
2079  KMP_CHECK_UPDATE(team->t.t_cancel_request, cancel_noreq);
2080  KMP_CHECK_UPDATE(team->t.t_def_allocator, master_th->th.th_def_allocator);
2081 
2082  // Update the floating point rounding in the team if required.
2083  propagateFPControl(team);
2084 
2085  if (__kmp_tasking_mode != tskm_immediate_exec) {
2086  // Set master's task team to team's task team. Unless this is hot team, it
2087  // should be NULL.
2088  KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
2089  parent_team->t.t_task_team[master_th->th.th_task_state]);
2090  KA_TRACE(20, ("__kmp_fork_call: Master T#%d pushing task_team %p / team "
2091  "%p, new task_team %p / team %p\n",
2092  __kmp_gtid_from_thread(master_th),
2093  master_th->th.th_task_team, parent_team,
2094  team->t.t_task_team[master_th->th.th_task_state], team));
2095 
2096  if (active_level || master_th->th.th_task_team) {
2097  // Take a memo of master's task_state
2098  KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack);
2099  if (master_th->th.th_task_state_top >=
2100  master_th->th.th_task_state_stack_sz) { // increase size
2101  kmp_uint32 new_size = 2 * master_th->th.th_task_state_stack_sz;
2102  kmp_uint8 *old_stack, *new_stack;
2103  kmp_uint32 i;
2104  new_stack = (kmp_uint8 *)__kmp_allocate(new_size);
2105  for (i = 0; i < master_th->th.th_task_state_stack_sz; ++i) {
2106  new_stack[i] = master_th->th.th_task_state_memo_stack[i];
2107  }
2108  for (i = master_th->th.th_task_state_stack_sz; i < new_size;
2109  ++i) { // zero-init rest of stack
2110  new_stack[i] = 0;
2111  }
2112  old_stack = master_th->th.th_task_state_memo_stack;
2113  master_th->th.th_task_state_memo_stack = new_stack;
2114  master_th->th.th_task_state_stack_sz = new_size;
2115  __kmp_free(old_stack);
2116  }
2117  // Store master's task_state on stack
2118  master_th->th
2119  .th_task_state_memo_stack[master_th->th.th_task_state_top] =
2120  master_th->th.th_task_state;
2121  master_th->th.th_task_state_top++;
2122 #if KMP_NESTED_HOT_TEAMS
2123  if (master_th->th.th_hot_teams &&
2124  active_level < __kmp_hot_teams_max_level &&
2125  team == master_th->th.th_hot_teams[active_level].hot_team) {
2126  // Restore master's nested state if nested hot team
2127  master_th->th.th_task_state =
2128  master_th->th
2129  .th_task_state_memo_stack[master_th->th.th_task_state_top];
2130  } else {
2131 #endif
2132  master_th->th.th_task_state = 0;
2133 #if KMP_NESTED_HOT_TEAMS
2134  }
2135 #endif
2136  }
2137 #if !KMP_NESTED_HOT_TEAMS
2138  KMP_DEBUG_ASSERT((master_th->th.th_task_team == NULL) ||
2139  (team == root->r.r_hot_team));
2140 #endif
2141  }
2142 
2143  KA_TRACE(
2144  20,
2145  ("__kmp_fork_call: T#%d(%d:%d)->(%d:0) created a team of %d threads\n",
2146  gtid, parent_team->t.t_id, team->t.t_master_tid, team->t.t_id,
2147  team->t.t_nproc));
2148  KMP_DEBUG_ASSERT(team != root->r.r_hot_team ||
2149  (team->t.t_master_tid == 0 &&
2150  (team->t.t_parent == root->r.r_root_team ||
2151  team->t.t_parent->t.t_serialized)));
2152  KMP_MB();
2153 
2154  /* now, setup the arguments */
2155  argv = (void **)team->t.t_argv;
2156  if (ap) {
2157  for (i = argc - 1; i >= 0; --i) {
2158  void *new_argv = va_arg(kmp_va_deref(ap), void *);
2159  KMP_CHECK_UPDATE(*argv, new_argv);
2160  argv++;
2161  }
2162  } else {
2163  for (i = 0; i < argc; ++i) {
2164  // Get args from parent team for teams construct
2165  KMP_CHECK_UPDATE(argv[i], team->t.t_parent->t.t_argv[i]);
2166  }
2167  }
2168 
2169  /* now actually fork the threads */
2170  KMP_CHECK_UPDATE(team->t.t_master_active, master_active);
2171  if (!root->r.r_active) // Only do assignment if it prevents cache ping-pong
2172  root->r.r_active = TRUE;
2173 
2174  __kmp_fork_team_threads(root, team, master_th, gtid);
2175  __kmp_setup_icv_copy(team, nthreads,
2176  &master_th->th.th_current_task->td_icvs, loc);
2177 
2178 #if OMPT_SUPPORT
2179  master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
2180 #endif
2181 
2182  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2183 
2184 #if USE_ITT_BUILD
2185  if (team->t.t_active_level == 1 // only report frames at level 1
2186  && !master_th->th.th_teams_microtask) { // not in teams construct
2187 #if USE_ITT_NOTIFY
2188  if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
2189  (__kmp_forkjoin_frames_mode == 3 ||
2190  __kmp_forkjoin_frames_mode == 1)) {
2191  kmp_uint64 tmp_time = 0;
2192  if (__itt_get_timestamp_ptr)
2193  tmp_time = __itt_get_timestamp();
2194  // Internal fork - report frame begin
2195  master_th->th.th_frame_time = tmp_time;
2196  if (__kmp_forkjoin_frames_mode == 3)
2197  team->t.t_region_time = tmp_time;
2198  } else
2199 // only one notification scheme (either "submit" or "forking/joined", not both)
2200 #endif /* USE_ITT_NOTIFY */
2201  if ((__itt_frame_begin_v3_ptr || KMP_ITT_DEBUG) &&
2202  __kmp_forkjoin_frames && !__kmp_forkjoin_frames_mode) {
2203  // Mark start of "parallel" region for Intel(R) VTune(TM) analyzer.
2204  __kmp_itt_region_forking(gtid, team->t.t_nproc, 0);
2205  }
2206  }
2207 #endif /* USE_ITT_BUILD */
2208 
2209  /* now go on and do the work */
2210  KMP_DEBUG_ASSERT(team == __kmp_threads[gtid]->th.th_team);
2211  KMP_MB();
2212  KF_TRACE(10,
2213  ("__kmp_internal_fork : root=%p, team=%p, master_th=%p, gtid=%d\n",
2214  root, team, master_th, gtid));
2215 
2216 #if USE_ITT_BUILD
2217  if (__itt_stack_caller_create_ptr) {
2218  team->t.t_stack_id =
2219  __kmp_itt_stack_caller_create(); // create new stack stitching id
2220  // before entering fork barrier
2221  }
2222 #endif /* USE_ITT_BUILD */
2223 
2224  // AC: skip __kmp_internal_fork at teams construct, let only master
2225  // threads execute
2226  if (ap) {
2227  __kmp_internal_fork(loc, gtid, team);
2228  KF_TRACE(10, ("__kmp_internal_fork : after : root=%p, team=%p, "
2229  "master_th=%p, gtid=%d\n",
2230  root, team, master_th, gtid));
2231  }
2232 
2233  if (call_context == fork_context_gnu) {
2234  KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
2235  return TRUE;
2236  }
2237 
2238  /* Invoke microtask for MASTER thread */
2239  KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid,
2240  team->t.t_id, team->t.t_pkfn));
2241  } // END of timer KMP_fork_call block
2242 
2243 #if KMP_STATS_ENABLED
2244  // If beginning a teams construct, then change thread state
2245  stats_state_e previous_state = KMP_GET_THREAD_STATE();
2246  if (!ap) {
2247  KMP_SET_THREAD_STATE(stats_state_e::TEAMS_REGION);
2248  }
2249 #endif
2250 
2251  if (!team->t.t_invoke(gtid)) {
2252  KMP_ASSERT2(0, "cannot invoke microtask for MASTER thread");
2253  }
2254 
2255 #if KMP_STATS_ENABLED
2256  // If was beginning of a teams construct, then reset thread state
2257  if (!ap) {
2258  KMP_SET_THREAD_STATE(previous_state);
2259  }
2260 #endif
2261 
2262  KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid,
2263  team->t.t_id, team->t.t_pkfn));
2264  KMP_MB(); /* Flush all pending memory write invalidates. */
2265 
2266  KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
2267 
2268 #if OMPT_SUPPORT
2269  if (ompt_enabled.enabled) {
2270  master_th->th.ompt_thread_info.state = ompt_state_overhead;
2271  }
2272 #endif
2273 
2274  return TRUE;
2275 }
2276 
2277 #if OMPT_SUPPORT
2278 static inline void __kmp_join_restore_state(kmp_info_t *thread,
2279  kmp_team_t *team) {
2280  // restore state outside the region
2281  thread->th.ompt_thread_info.state =
2282  ((team->t.t_serialized) ? ompt_state_work_serial
2283  : ompt_state_work_parallel);
2284 }
2285 
2286 static inline void __kmp_join_ompt(int gtid, kmp_info_t *thread,
2287  kmp_team_t *team, ompt_data_t *parallel_data,
2288  int flags, void *codeptr) {
2289  ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2290  if (ompt_enabled.ompt_callback_parallel_end) {
2291  ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
2292  parallel_data, &(task_info->task_data), flags, codeptr);
2293  }
2294 
2295  task_info->frame.enter_frame = ompt_data_none;
2296  __kmp_join_restore_state(thread, team);
2297 }
2298 #endif
2299 
2300 void __kmp_join_call(ident_t *loc, int gtid
2301 #if OMPT_SUPPORT
2302  ,
2303  enum fork_context_e fork_context
2304 #endif
2305  ,
2306  int exit_teams) {
2307  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_join_call);
2308  kmp_team_t *team;
2309  kmp_team_t *parent_team;
2310  kmp_info_t *master_th;
2311  kmp_root_t *root;
2312  int master_active;
2313 
2314  KA_TRACE(20, ("__kmp_join_call: enter T#%d\n", gtid));
2315 
2316  /* setup current data */
2317  master_th = __kmp_threads[gtid];
2318  root = master_th->th.th_root;
2319  team = master_th->th.th_team;
2320  parent_team = team->t.t_parent;
2321 
2322  master_th->th.th_ident = loc;
2323 
2324 #if OMPT_SUPPORT
2325  void *team_microtask = (void *)team->t.t_pkfn;
2326  // For GOMP interface with serialized parallel, need the
2327  // __kmpc_end_serialized_parallel to call hooks for OMPT end-implicit-task
2328  // and end-parallel events.
2329  if (ompt_enabled.enabled &&
2330  !(team->t.t_serialized && fork_context == fork_context_gnu)) {
2331  master_th->th.ompt_thread_info.state = ompt_state_overhead;
2332  }
2333 #endif
2334 
2335 #if KMP_DEBUG
2336  if (__kmp_tasking_mode != tskm_immediate_exec && !exit_teams) {
2337  KA_TRACE(20, ("__kmp_join_call: T#%d, old team = %p old task_team = %p, "
2338  "th_task_team = %p\n",
2339  __kmp_gtid_from_thread(master_th), team,
2340  team->t.t_task_team[master_th->th.th_task_state],
2341  master_th->th.th_task_team));
2342  KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
2343  team->t.t_task_team[master_th->th.th_task_state]);
2344  }
2345 #endif
2346 
2347  if (team->t.t_serialized) {
2348  if (master_th->th.th_teams_microtask) {
2349  // We are in teams construct
2350  int level = team->t.t_level;
2351  int tlevel = master_th->th.th_teams_level;
2352  if (level == tlevel) {
2353  // AC: we haven't incremented it earlier at start of teams construct,
2354  // so do it here - at the end of teams construct
2355  team->t.t_level++;
2356  } else if (level == tlevel + 1) {
2357  // AC: we are exiting parallel inside teams, need to increment
2358  // serialization in order to restore it in the next call to
2359  // __kmpc_end_serialized_parallel
2360  team->t.t_serialized++;
2361  }
2362  }
2363  __kmpc_end_serialized_parallel(loc, gtid);
2364 
2365 #if OMPT_SUPPORT
2366  if (ompt_enabled.enabled) {
2367  __kmp_join_restore_state(master_th, parent_team);
2368  }
2369 #endif
2370 
2371  return;
2372  }
2373 
2374  master_active = team->t.t_master_active;
2375 
2376  if (!exit_teams) {
2377  // AC: No barrier for internal teams at exit from teams construct.
2378  // But there is barrier for external team (league).
2379  __kmp_internal_join(loc, gtid, team);
2380  } else {
2381  master_th->th.th_task_state =
2382  0; // AC: no tasking in teams (out of any parallel)
2383  }
2384 
2385  KMP_MB();
2386 
2387 #if OMPT_SUPPORT
2388  ompt_data_t *parallel_data = &(team->t.ompt_team_info.parallel_data);
2389  void *codeptr = team->t.ompt_team_info.master_return_address;
2390 #endif
2391 
2392 #if USE_ITT_BUILD
2393  if (__itt_stack_caller_create_ptr) {
2394  // destroy the stack stitching id after join barrier
2395  __kmp_itt_stack_caller_destroy((__itt_caller)team->t.t_stack_id);
2396  }
2397  // Mark end of "parallel" region for Intel(R) VTune(TM) analyzer.
2398  if (team->t.t_active_level == 1 &&
2399  (!master_th->th.th_teams_microtask || /* not in teams construct */
2400  master_th->th.th_teams_size.nteams == 1)) {
2401  master_th->th.th_ident = loc;
2402  // only one notification scheme (either "submit" or "forking/joined", not
2403  // both)
2404  if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
2405  __kmp_forkjoin_frames_mode == 3)
2406  __kmp_itt_frame_submit(gtid, team->t.t_region_time,
2407  master_th->th.th_frame_time, 0, loc,
2408  master_th->th.th_team_nproc, 1);
2409  else if ((__itt_frame_end_v3_ptr || KMP_ITT_DEBUG) &&
2410  !__kmp_forkjoin_frames_mode && __kmp_forkjoin_frames)
2411  __kmp_itt_region_joined(gtid);
2412  } // active_level == 1
2413 #endif /* USE_ITT_BUILD */
2414 
2415  if (master_th->th.th_teams_microtask && !exit_teams &&
2416  team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
2417  team->t.t_level == master_th->th.th_teams_level + 1) {
2418 // AC: We need to leave the team structure intact at the end of parallel
2419 // inside the teams construct, so that at the next parallel same (hot) team
2420 // works, only adjust nesting levels
2421 #if OMPT_SUPPORT
2422  ompt_data_t ompt_parallel_data = ompt_data_none;
2423  if (ompt_enabled.enabled) {
2424  ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2425  if (ompt_enabled.ompt_callback_implicit_task) {
2426  int ompt_team_size = team->t.t_nproc;
2427  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
2428  ompt_scope_end, NULL, &(task_info->task_data), ompt_team_size,
2429  OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
2430  }
2431  task_info->frame.exit_frame = ompt_data_none;
2432  task_info->task_data = ompt_data_none;
2433  ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
2434  __ompt_lw_taskteam_unlink(master_th);
2435  }
2436 #endif
2437  /* Decrement our nested depth level */
2438  team->t.t_level--;
2439  team->t.t_active_level--;
2440  KMP_ATOMIC_DEC(&root->r.r_in_parallel);
2441 
2442  // Restore number of threads in the team if needed. This code relies on
2443  // the proper adjustment of th_teams_size.nth after the fork in
2444  // __kmp_teams_master on each teams master in the case that
2445  // __kmp_reserve_threads reduced it.
2446  if (master_th->th.th_team_nproc < master_th->th.th_teams_size.nth) {
2447  int old_num = master_th->th.th_team_nproc;
2448  int new_num = master_th->th.th_teams_size.nth;
2449  kmp_info_t **other_threads = team->t.t_threads;
2450  team->t.t_nproc = new_num;
2451  for (int i = 0; i < old_num; ++i) {
2452  other_threads[i]->th.th_team_nproc = new_num;
2453  }
2454  // Adjust states of non-used threads of the team
2455  for (int i = old_num; i < new_num; ++i) {
2456  // Re-initialize thread's barrier data.
2457  KMP_DEBUG_ASSERT(other_threads[i]);
2458  kmp_balign_t *balign = other_threads[i]->th.th_bar;
2459  for (int b = 0; b < bs_last_barrier; ++b) {
2460  balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
2461  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
2462 #if USE_DEBUGGER
2463  balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
2464 #endif
2465  }
2466  if (__kmp_tasking_mode != tskm_immediate_exec) {
2467  // Synchronize thread's task state
2468  other_threads[i]->th.th_task_state = master_th->th.th_task_state;
2469  }
2470  }
2471  }
2472 
2473 #if OMPT_SUPPORT
2474  if (ompt_enabled.enabled) {
2475  __kmp_join_ompt(gtid, master_th, parent_team, &ompt_parallel_data,
2476  OMPT_INVOKER(fork_context) | ompt_parallel_team, codeptr);
2477  }
2478 #endif
2479 
2480  return;
2481  }
2482 
2483  /* do cleanup and restore the parent team */
2484  master_th->th.th_info.ds.ds_tid = team->t.t_master_tid;
2485  master_th->th.th_local.this_construct = team->t.t_master_this_cons;
2486 
2487  master_th->th.th_dispatch = &parent_team->t.t_dispatch[team->t.t_master_tid];
2488 
2489  /* jc: The following lock has instructions with REL and ACQ semantics,
2490  separating the parallel user code called in this parallel region
2491  from the serial user code called after this function returns. */
2492  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
2493 
2494  if (!master_th->th.th_teams_microtask ||
2495  team->t.t_level > master_th->th.th_teams_level) {
2496  /* Decrement our nested depth level */
2497  KMP_ATOMIC_DEC(&root->r.r_in_parallel);
2498  }
2499  KMP_DEBUG_ASSERT(root->r.r_in_parallel >= 0);
2500 
2501 #if OMPT_SUPPORT
2502  if (ompt_enabled.enabled) {
2503  ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2504  if (ompt_enabled.ompt_callback_implicit_task) {
2505  int flags = (team_microtask == (void *)__kmp_teams_master)
2506  ? ompt_task_initial
2507  : ompt_task_implicit;
2508  int ompt_team_size = (flags == ompt_task_initial) ? 0 : team->t.t_nproc;
2509  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
2510  ompt_scope_end, NULL, &(task_info->task_data), ompt_team_size,
2511  OMPT_CUR_TASK_INFO(master_th)->thread_num, flags);
2512  }
2513  task_info->frame.exit_frame = ompt_data_none;
2514  task_info->task_data = ompt_data_none;
2515  }
2516 #endif
2517 
2518  KF_TRACE(10, ("__kmp_join_call1: T#%d, this_thread=%p team=%p\n", 0,
2519  master_th, team));
2520  __kmp_pop_current_task_from_thread(master_th);
2521 
2522 #if KMP_AFFINITY_SUPPORTED
2523  // Restore master thread's partition.
2524  master_th->th.th_first_place = team->t.t_first_place;
2525  master_th->th.th_last_place = team->t.t_last_place;
2526 #endif // KMP_AFFINITY_SUPPORTED
2527  master_th->th.th_def_allocator = team->t.t_def_allocator;
2528 
2529  updateHWFPControl(team);
2530 
2531  if (root->r.r_active != master_active)
2532  root->r.r_active = master_active;
2533 
2534  __kmp_free_team(root, team USE_NESTED_HOT_ARG(
2535  master_th)); // this will free worker threads
2536 
2537  /* this race was fun to find. make sure the following is in the critical
2538  region otherwise assertions may fail occasionally since the old team may be
2539  reallocated and the hierarchy appears inconsistent. it is actually safe to
2540  run and won't cause any bugs, but will cause those assertion failures. it's
2541  only one deref&assign so might as well put this in the critical region */
2542  master_th->th.th_team = parent_team;
2543  master_th->th.th_team_nproc = parent_team->t.t_nproc;
2544  master_th->th.th_team_master = parent_team->t.t_threads[0];
2545  master_th->th.th_team_serialized = parent_team->t.t_serialized;
2546 
2547  /* restore serialized team, if need be */
2548  if (parent_team->t.t_serialized &&
2549  parent_team != master_th->th.th_serial_team &&
2550  parent_team != root->r.r_root_team) {
2551  __kmp_free_team(root,
2552  master_th->th.th_serial_team USE_NESTED_HOT_ARG(NULL));
2553  master_th->th.th_serial_team = parent_team;
2554  }
2555 
2556  if (__kmp_tasking_mode != tskm_immediate_exec) {
2557  if (master_th->th.th_task_state_top >
2558  0) { // Restore task state from memo stack
2559  KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack);
2560  // Remember master's state if we re-use this nested hot team
2561  master_th->th.th_task_state_memo_stack[master_th->th.th_task_state_top] =
2562  master_th->th.th_task_state;
2563  --master_th->th.th_task_state_top; // pop
2564  // Now restore state at this level
2565  master_th->th.th_task_state =
2566  master_th->th
2567  .th_task_state_memo_stack[master_th->th.th_task_state_top];
2568  }
2569  // Copy the task team from the parent team to the master thread
2570  master_th->th.th_task_team =
2571  parent_team->t.t_task_team[master_th->th.th_task_state];
2572  KA_TRACE(20,
2573  ("__kmp_join_call: Master T#%d restoring task_team %p / team %p\n",
2574  __kmp_gtid_from_thread(master_th), master_th->th.th_task_team,
2575  parent_team));
2576  }
2577 
2578  // TODO: GEH - cannot do this assertion because root thread not set up as
2579  // executing
2580  // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 0 );
2581  master_th->th.th_current_task->td_flags.executing = 1;
2582 
2583  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2584 
2585 #if OMPT_SUPPORT
2586  int flags =
2587  OMPT_INVOKER(fork_context) |
2588  ((team_microtask == (void *)__kmp_teams_master) ? ompt_parallel_league
2589  : ompt_parallel_team);
2590  if (ompt_enabled.enabled) {
2591  __kmp_join_ompt(gtid, master_th, parent_team, parallel_data, flags,
2592  codeptr);
2593  }
2594 #endif
2595 
2596  KMP_MB();
2597  KA_TRACE(20, ("__kmp_join_call: exit T#%d\n", gtid));
2598 }
2599 
2600 /* Check whether we should push an internal control record onto the
2601  serial team stack. If so, do it. */
2602 void __kmp_save_internal_controls(kmp_info_t *thread) {
2603 
2604  if (thread->th.th_team != thread->th.th_serial_team) {
2605  return;
2606  }
2607  if (thread->th.th_team->t.t_serialized > 1) {
2608  int push = 0;
2609 
2610  if (thread->th.th_team->t.t_control_stack_top == NULL) {
2611  push = 1;
2612  } else {
2613  if (thread->th.th_team->t.t_control_stack_top->serial_nesting_level !=
2614  thread->th.th_team->t.t_serialized) {
2615  push = 1;
2616  }
2617  }
2618  if (push) { /* push a record on the serial team's stack */
2619  kmp_internal_control_t *control =
2620  (kmp_internal_control_t *)__kmp_allocate(
2621  sizeof(kmp_internal_control_t));
2622 
2623  copy_icvs(control, &thread->th.th_current_task->td_icvs);
2624 
2625  control->serial_nesting_level = thread->th.th_team->t.t_serialized;
2626 
2627  control->next = thread->th.th_team->t.t_control_stack_top;
2628  thread->th.th_team->t.t_control_stack_top = control;
2629  }
2630  }
2631 }
2632 
2633 /* Changes set_nproc */
2634 void __kmp_set_num_threads(int new_nth, int gtid) {
2635  kmp_info_t *thread;
2636  kmp_root_t *root;
2637 
2638  KF_TRACE(10, ("__kmp_set_num_threads: new __kmp_nth = %d\n", new_nth));
2639  KMP_DEBUG_ASSERT(__kmp_init_serial);
2640 
2641  if (new_nth < 1)
2642  new_nth = 1;
2643  else if (new_nth > __kmp_max_nth)
2644  new_nth = __kmp_max_nth;
2645 
2646  KMP_COUNT_VALUE(OMP_set_numthreads, new_nth);
2647  thread = __kmp_threads[gtid];
2648  if (thread->th.th_current_task->td_icvs.nproc == new_nth)
2649  return; // nothing to do
2650 
2651  __kmp_save_internal_controls(thread);
2652 
2653  set__nproc(thread, new_nth);
2654 
2655  // If this omp_set_num_threads() call will cause the hot team size to be
2656  // reduced (in the absence of a num_threads clause), then reduce it now,
2657  // rather than waiting for the next parallel region.
2658  root = thread->th.th_root;
2659  if (__kmp_init_parallel && (!root->r.r_active) &&
2660  (root->r.r_hot_team->t.t_nproc > new_nth)
2661 #if KMP_NESTED_HOT_TEAMS
2662  && __kmp_hot_teams_max_level && !__kmp_hot_teams_mode
2663 #endif
2664  ) {
2665  kmp_team_t *hot_team = root->r.r_hot_team;
2666  int f;
2667 
2668  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
2669 
2670  // Release the extra threads we don't need any more.
2671  for (f = new_nth; f < hot_team->t.t_nproc; f++) {
2672  KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL);
2673  if (__kmp_tasking_mode != tskm_immediate_exec) {
2674  // When decreasing team size, threads no longer in the team should unref
2675  // task team.
2676  hot_team->t.t_threads[f]->th.th_task_team = NULL;
2677  }
2678  __kmp_free_thread(hot_team->t.t_threads[f]);
2679  hot_team->t.t_threads[f] = NULL;
2680  }
2681  hot_team->t.t_nproc = new_nth;
2682 #if KMP_NESTED_HOT_TEAMS
2683  if (thread->th.th_hot_teams) {
2684  KMP_DEBUG_ASSERT(hot_team == thread->th.th_hot_teams[0].hot_team);
2685  thread->th.th_hot_teams[0].hot_team_nth = new_nth;
2686  }
2687 #endif
2688 
2689  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2690 
2691  // Update the t_nproc field in the threads that are still active.
2692  for (f = 0; f < new_nth; f++) {
2693  KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL);
2694  hot_team->t.t_threads[f]->th.th_team_nproc = new_nth;
2695  }
2696  // Special flag in case omp_set_num_threads() call
2697  hot_team->t.t_size_changed = -1;
2698  }
2699 }
2700 
2701 /* Changes max_active_levels */
2702 void __kmp_set_max_active_levels(int gtid, int max_active_levels) {
2703  kmp_info_t *thread;
2704 
2705  KF_TRACE(10, ("__kmp_set_max_active_levels: new max_active_levels for thread "
2706  "%d = (%d)\n",
2707  gtid, max_active_levels));
2708  KMP_DEBUG_ASSERT(__kmp_init_serial);
2709 
2710  // validate max_active_levels
2711  if (max_active_levels < 0) {
2712  KMP_WARNING(ActiveLevelsNegative, max_active_levels);
2713  // We ignore this call if the user has specified a negative value.
2714  // The current setting won't be changed. The last valid setting will be
2715  // used. A warning will be issued (if warnings are allowed as controlled by
2716  // the KMP_WARNINGS env var).
2717  KF_TRACE(10, ("__kmp_set_max_active_levels: the call is ignored: new "
2718  "max_active_levels for thread %d = (%d)\n",
2719  gtid, max_active_levels));
2720  return;
2721  }
2722  if (max_active_levels <= KMP_MAX_ACTIVE_LEVELS_LIMIT) {
2723  // it's OK, the max_active_levels is within the valid range: [ 0;
2724  // KMP_MAX_ACTIVE_LEVELS_LIMIT ]
2725  // We allow a zero value. (implementation defined behavior)
2726  } else {
2727  KMP_WARNING(ActiveLevelsExceedLimit, max_active_levels,
2728  KMP_MAX_ACTIVE_LEVELS_LIMIT);
2729  max_active_levels = KMP_MAX_ACTIVE_LEVELS_LIMIT;
2730  // Current upper limit is MAX_INT. (implementation defined behavior)
2731  // If the input exceeds the upper limit, we correct the input to be the
2732  // upper limit. (implementation defined behavior)
2733  // Actually, the flow should never get here until we use MAX_INT limit.
2734  }
2735  KF_TRACE(10, ("__kmp_set_max_active_levels: after validation: new "
2736  "max_active_levels for thread %d = (%d)\n",
2737  gtid, max_active_levels));
2738 
2739  thread = __kmp_threads[gtid];
2740 
2741  __kmp_save_internal_controls(thread);
2742 
2743  set__max_active_levels(thread, max_active_levels);
2744 }
2745 
2746 /* Gets max_active_levels */
2747 int __kmp_get_max_active_levels(int gtid) {
2748  kmp_info_t *thread;
2749 
2750  KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d\n", gtid));
2751  KMP_DEBUG_ASSERT(__kmp_init_serial);
2752 
2753  thread = __kmp_threads[gtid];
2754  KMP_DEBUG_ASSERT(thread->th.th_current_task);
2755  KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d, curtask=%p, "
2756  "curtask_maxaclevel=%d\n",
2757  gtid, thread->th.th_current_task,
2758  thread->th.th_current_task->td_icvs.max_active_levels));
2759  return thread->th.th_current_task->td_icvs.max_active_levels;
2760 }
2761 
2762 KMP_BUILD_ASSERT(sizeof(kmp_sched_t) == sizeof(int));
2763 KMP_BUILD_ASSERT(sizeof(enum sched_type) == sizeof(int));
2764 
2765 /* Changes def_sched_var ICV values (run-time schedule kind and chunk) */
2766 void __kmp_set_schedule(int gtid, kmp_sched_t kind, int chunk) {
2767  kmp_info_t *thread;
2768  kmp_sched_t orig_kind;
2769  // kmp_team_t *team;
2770 
2771  KF_TRACE(10, ("__kmp_set_schedule: new schedule for thread %d = (%d, %d)\n",
2772  gtid, (int)kind, chunk));
2773  KMP_DEBUG_ASSERT(__kmp_init_serial);
2774 
2775  // Check if the kind parameter is valid, correct if needed.
2776  // Valid parameters should fit in one of two intervals - standard or extended:
2777  // <lower>, <valid>, <upper_std>, <lower_ext>, <valid>, <upper>
2778  // 2008-01-25: 0, 1 - 4, 5, 100, 101 - 102, 103
2779  orig_kind = kind;
2780  kind = __kmp_sched_without_mods(kind);
2781 
2782  if (kind <= kmp_sched_lower || kind >= kmp_sched_upper ||
2783  (kind <= kmp_sched_lower_ext && kind >= kmp_sched_upper_std)) {
2784  // TODO: Hint needs attention in case we change the default schedule.
2785  __kmp_msg(kmp_ms_warning, KMP_MSG(ScheduleKindOutOfRange, kind),
2786  KMP_HNT(DefaultScheduleKindUsed, "static, no chunk"),
2787  __kmp_msg_null);
2788  kind = kmp_sched_default;
2789  chunk = 0; // ignore chunk value in case of bad kind
2790  }
2791 
2792  thread = __kmp_threads[gtid];
2793 
2794  __kmp_save_internal_controls(thread);
2795 
2796  if (kind < kmp_sched_upper_std) {
2797  if (kind == kmp_sched_static && chunk < KMP_DEFAULT_CHUNK) {
2798  // differ static chunked vs. unchunked: chunk should be invalid to
2799  // indicate unchunked schedule (which is the default)
2800  thread->th.th_current_task->td_icvs.sched.r_sched_type = kmp_sch_static;
2801  } else {
2802  thread->th.th_current_task->td_icvs.sched.r_sched_type =
2803  __kmp_sch_map[kind - kmp_sched_lower - 1];
2804  }
2805  } else {
2806  // __kmp_sch_map[ kind - kmp_sched_lower_ext + kmp_sched_upper_std -
2807  // kmp_sched_lower - 2 ];
2808  thread->th.th_current_task->td_icvs.sched.r_sched_type =
2809  __kmp_sch_map[kind - kmp_sched_lower_ext + kmp_sched_upper_std -
2810  kmp_sched_lower - 2];
2811  }
2812  __kmp_sched_apply_mods_intkind(
2813  orig_kind, &(thread->th.th_current_task->td_icvs.sched.r_sched_type));
2814  if (kind == kmp_sched_auto || chunk < 1) {
2815  // ignore parameter chunk for schedule auto
2816  thread->th.th_current_task->td_icvs.sched.chunk = KMP_DEFAULT_CHUNK;
2817  } else {
2818  thread->th.th_current_task->td_icvs.sched.chunk = chunk;
2819  }
2820 }
2821 
2822 /* Gets def_sched_var ICV values */
2823 void __kmp_get_schedule(int gtid, kmp_sched_t *kind, int *chunk) {
2824  kmp_info_t *thread;
2825  enum sched_type th_type;
2826 
2827  KF_TRACE(10, ("__kmp_get_schedule: thread %d\n", gtid));
2828  KMP_DEBUG_ASSERT(__kmp_init_serial);
2829 
2830  thread = __kmp_threads[gtid];
2831 
2832  th_type = thread->th.th_current_task->td_icvs.sched.r_sched_type;
2833  switch (SCHEDULE_WITHOUT_MODIFIERS(th_type)) {
2834  case kmp_sch_static:
2835  case kmp_sch_static_greedy:
2836  case kmp_sch_static_balanced:
2837  *kind = kmp_sched_static;
2838  __kmp_sched_apply_mods_stdkind(kind, th_type);
2839  *chunk = 0; // chunk was not set, try to show this fact via zero value
2840  return;
2841  case kmp_sch_static_chunked:
2842  *kind = kmp_sched_static;
2843  break;
2844  case kmp_sch_dynamic_chunked:
2845  *kind = kmp_sched_dynamic;
2846  break;
2848  case kmp_sch_guided_iterative_chunked:
2849  case kmp_sch_guided_analytical_chunked:
2850  *kind = kmp_sched_guided;
2851  break;
2852  case kmp_sch_auto:
2853  *kind = kmp_sched_auto;
2854  break;
2855  case kmp_sch_trapezoidal:
2856  *kind = kmp_sched_trapezoidal;
2857  break;
2858 #if KMP_STATIC_STEAL_ENABLED
2859  case kmp_sch_static_steal:
2860  *kind = kmp_sched_static_steal;
2861  break;
2862 #endif
2863  default:
2864  KMP_FATAL(UnknownSchedulingType, th_type);
2865  }
2866 
2867  __kmp_sched_apply_mods_stdkind(kind, th_type);
2868  *chunk = thread->th.th_current_task->td_icvs.sched.chunk;
2869 }
2870 
2871 int __kmp_get_ancestor_thread_num(int gtid, int level) {
2872 
2873  int ii, dd;
2874  kmp_team_t *team;
2875  kmp_info_t *thr;
2876 
2877  KF_TRACE(10, ("__kmp_get_ancestor_thread_num: thread %d %d\n", gtid, level));
2878  KMP_DEBUG_ASSERT(__kmp_init_serial);
2879 
2880  // validate level
2881  if (level == 0)
2882  return 0;
2883  if (level < 0)
2884  return -1;
2885  thr = __kmp_threads[gtid];
2886  team = thr->th.th_team;
2887  ii = team->t.t_level;
2888  if (level > ii)
2889  return -1;
2890 
2891  if (thr->th.th_teams_microtask) {
2892  // AC: we are in teams region where multiple nested teams have same level
2893  int tlevel = thr->th.th_teams_level; // the level of the teams construct
2894  if (level <=
2895  tlevel) { // otherwise usual algorithm works (will not touch the teams)
2896  KMP_DEBUG_ASSERT(ii >= tlevel);
2897  // AC: As we need to pass by the teams league, we need to artificially
2898  // increase ii
2899  if (ii == tlevel) {
2900  ii += 2; // three teams have same level
2901  } else {
2902  ii++; // two teams have same level
2903  }
2904  }
2905  }
2906 
2907  if (ii == level)
2908  return __kmp_tid_from_gtid(gtid);
2909 
2910  dd = team->t.t_serialized;
2911  level++;
2912  while (ii > level) {
2913  for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) {
2914  }
2915  if ((team->t.t_serialized) && (!dd)) {
2916  team = team->t.t_parent;
2917  continue;
2918  }
2919  if (ii > level) {
2920  team = team->t.t_parent;
2921  dd = team->t.t_serialized;
2922  ii--;
2923  }
2924  }
2925 
2926  return (dd > 1) ? (0) : (team->t.t_master_tid);
2927 }
2928 
2929 int __kmp_get_team_size(int gtid, int level) {
2930 
2931  int ii, dd;
2932  kmp_team_t *team;
2933  kmp_info_t *thr;
2934 
2935  KF_TRACE(10, ("__kmp_get_team_size: thread %d %d\n", gtid, level));
2936  KMP_DEBUG_ASSERT(__kmp_init_serial);
2937 
2938  // validate level
2939  if (level == 0)
2940  return 1;
2941  if (level < 0)
2942  return -1;
2943  thr = __kmp_threads[gtid];
2944  team = thr->th.th_team;
2945  ii = team->t.t_level;
2946  if (level > ii)
2947  return -1;
2948 
2949  if (thr->th.th_teams_microtask) {
2950  // AC: we are in teams region where multiple nested teams have same level
2951  int tlevel = thr->th.th_teams_level; // the level of the teams construct
2952  if (level <=
2953  tlevel) { // otherwise usual algorithm works (will not touch the teams)
2954  KMP_DEBUG_ASSERT(ii >= tlevel);
2955  // AC: As we need to pass by the teams league, we need to artificially
2956  // increase ii
2957  if (ii == tlevel) {
2958  ii += 2; // three teams have same level
2959  } else {
2960  ii++; // two teams have same level
2961  }
2962  }
2963  }
2964 
2965  while (ii > level) {
2966  for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) {
2967  }
2968  if (team->t.t_serialized && (!dd)) {
2969  team = team->t.t_parent;
2970  continue;
2971  }
2972  if (ii > level) {
2973  team = team->t.t_parent;
2974  ii--;
2975  }
2976  }
2977 
2978  return team->t.t_nproc;
2979 }
2980 
2981 kmp_r_sched_t __kmp_get_schedule_global() {
2982  // This routine created because pairs (__kmp_sched, __kmp_chunk) and
2983  // (__kmp_static, __kmp_guided) may be changed by kmp_set_defaults
2984  // independently. So one can get the updated schedule here.
2985 
2986  kmp_r_sched_t r_sched;
2987 
2988  // create schedule from 4 globals: __kmp_sched, __kmp_chunk, __kmp_static,
2989  // __kmp_guided. __kmp_sched should keep original value, so that user can set
2990  // KMP_SCHEDULE multiple times, and thus have different run-time schedules in
2991  // different roots (even in OMP 2.5)
2992  enum sched_type s = SCHEDULE_WITHOUT_MODIFIERS(__kmp_sched);
2993  enum sched_type sched_modifiers = SCHEDULE_GET_MODIFIERS(__kmp_sched);
2994  if (s == kmp_sch_static) {
2995  // replace STATIC with more detailed schedule (balanced or greedy)
2996  r_sched.r_sched_type = __kmp_static;
2997  } else if (s == kmp_sch_guided_chunked) {
2998  // replace GUIDED with more detailed schedule (iterative or analytical)
2999  r_sched.r_sched_type = __kmp_guided;
3000  } else { // (STATIC_CHUNKED), or (DYNAMIC_CHUNKED), or other
3001  r_sched.r_sched_type = __kmp_sched;
3002  }
3003  SCHEDULE_SET_MODIFIERS(r_sched.r_sched_type, sched_modifiers);
3004 
3005  if (__kmp_chunk < KMP_DEFAULT_CHUNK) {
3006  // __kmp_chunk may be wrong here (if it was not ever set)
3007  r_sched.chunk = KMP_DEFAULT_CHUNK;
3008  } else {
3009  r_sched.chunk = __kmp_chunk;
3010  }
3011 
3012  return r_sched;
3013 }
3014 
3015 /* Allocate (realloc == FALSE) * or reallocate (realloc == TRUE)
3016  at least argc number of *t_argv entries for the requested team. */
3017 static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team, int realloc) {
3018 
3019  KMP_DEBUG_ASSERT(team);
3020  if (!realloc || argc > team->t.t_max_argc) {
3021 
3022  KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: needed entries=%d, "
3023  "current entries=%d\n",
3024  team->t.t_id, argc, (realloc) ? team->t.t_max_argc : 0));
3025  /* if previously allocated heap space for args, free them */
3026  if (realloc && team->t.t_argv != &team->t.t_inline_argv[0])
3027  __kmp_free((void *)team->t.t_argv);
3028 
3029  if (argc <= KMP_INLINE_ARGV_ENTRIES) {
3030  /* use unused space in the cache line for arguments */
3031  team->t.t_max_argc = KMP_INLINE_ARGV_ENTRIES;
3032  KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: inline allocate %d "
3033  "argv entries\n",
3034  team->t.t_id, team->t.t_max_argc));
3035  team->t.t_argv = &team->t.t_inline_argv[0];
3036  if (__kmp_storage_map) {
3037  __kmp_print_storage_map_gtid(
3038  -1, &team->t.t_inline_argv[0],
3039  &team->t.t_inline_argv[KMP_INLINE_ARGV_ENTRIES],
3040  (sizeof(void *) * KMP_INLINE_ARGV_ENTRIES), "team_%d.t_inline_argv",
3041  team->t.t_id);
3042  }
3043  } else {
3044  /* allocate space for arguments in the heap */
3045  team->t.t_max_argc = (argc <= (KMP_MIN_MALLOC_ARGV_ENTRIES >> 1))
3046  ? KMP_MIN_MALLOC_ARGV_ENTRIES
3047  : 2 * argc;
3048  KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: dynamic allocate %d "
3049  "argv entries\n",
3050  team->t.t_id, team->t.t_max_argc));
3051  team->t.t_argv =
3052  (void **)__kmp_page_allocate(sizeof(void *) * team->t.t_max_argc);
3053  if (__kmp_storage_map) {
3054  __kmp_print_storage_map_gtid(-1, &team->t.t_argv[0],
3055  &team->t.t_argv[team->t.t_max_argc],
3056  sizeof(void *) * team->t.t_max_argc,
3057  "team_%d.t_argv", team->t.t_id);
3058  }
3059  }
3060  }
3061 }
3062 
3063 static void __kmp_allocate_team_arrays(kmp_team_t *team, int max_nth) {
3064  int i;
3065  int num_disp_buff = max_nth > 1 ? __kmp_dispatch_num_buffers : 2;
3066  team->t.t_threads =
3067  (kmp_info_t **)__kmp_allocate(sizeof(kmp_info_t *) * max_nth);
3068  team->t.t_disp_buffer = (dispatch_shared_info_t *)__kmp_allocate(
3069  sizeof(dispatch_shared_info_t) * num_disp_buff);
3070  team->t.t_dispatch =
3071  (kmp_disp_t *)__kmp_allocate(sizeof(kmp_disp_t) * max_nth);
3072  team->t.t_implicit_task_taskdata =
3073  (kmp_taskdata_t *)__kmp_allocate(sizeof(kmp_taskdata_t) * max_nth);
3074  team->t.t_max_nproc = max_nth;
3075 
3076  /* setup dispatch buffers */
3077  for (i = 0; i < num_disp_buff; ++i) {
3078  team->t.t_disp_buffer[i].buffer_index = i;
3079  team->t.t_disp_buffer[i].doacross_buf_idx = i;
3080  }
3081 }
3082 
3083 static void __kmp_free_team_arrays(kmp_team_t *team) {
3084  /* Note: this does not free the threads in t_threads (__kmp_free_threads) */
3085  int i;
3086  for (i = 0; i < team->t.t_max_nproc; ++i) {
3087  if (team->t.t_dispatch[i].th_disp_buffer != NULL) {
3088  __kmp_free(team->t.t_dispatch[i].th_disp_buffer);
3089  team->t.t_dispatch[i].th_disp_buffer = NULL;
3090  }
3091  }
3092 #if KMP_USE_HIER_SCHED
3093  __kmp_dispatch_free_hierarchies(team);
3094 #endif
3095  __kmp_free(team->t.t_threads);
3096  __kmp_free(team->t.t_disp_buffer);
3097  __kmp_free(team->t.t_dispatch);
3098  __kmp_free(team->t.t_implicit_task_taskdata);
3099  team->t.t_threads = NULL;
3100  team->t.t_disp_buffer = NULL;
3101  team->t.t_dispatch = NULL;
3102  team->t.t_implicit_task_taskdata = 0;
3103 }
3104 
3105 static void __kmp_reallocate_team_arrays(kmp_team_t *team, int max_nth) {
3106  kmp_info_t **oldThreads = team->t.t_threads;
3107 
3108  __kmp_free(team->t.t_disp_buffer);
3109  __kmp_free(team->t.t_dispatch);
3110  __kmp_free(team->t.t_implicit_task_taskdata);
3111  __kmp_allocate_team_arrays(team, max_nth);
3112 
3113  KMP_MEMCPY(team->t.t_threads, oldThreads,
3114  team->t.t_nproc * sizeof(kmp_info_t *));
3115 
3116  __kmp_free(oldThreads);
3117 }
3118 
3119 static kmp_internal_control_t __kmp_get_global_icvs(void) {
3120 
3121  kmp_r_sched_t r_sched =
3122  __kmp_get_schedule_global(); // get current state of scheduling globals
3123 
3124  KMP_DEBUG_ASSERT(__kmp_nested_proc_bind.used > 0);
3125 
3126  kmp_internal_control_t g_icvs = {
3127  0, // int serial_nesting_level; //corresponds to value of th_team_serialized
3128  (kmp_int8)__kmp_global.g.g_dynamic, // internal control for dynamic
3129  // adjustment of threads (per thread)
3130  (kmp_int8)__kmp_env_blocktime, // int bt_set; //internal control for
3131  // whether blocktime is explicitly set
3132  __kmp_dflt_blocktime, // int blocktime; //internal control for blocktime
3133 #if KMP_USE_MONITOR
3134  __kmp_bt_intervals, // int bt_intervals; //internal control for blocktime
3135 // intervals
3136 #endif
3137  __kmp_dflt_team_nth, // int nproc; //internal control for # of threads for
3138  // next parallel region (per thread)
3139  // (use a max ub on value if __kmp_parallel_initialize not called yet)
3140  __kmp_cg_max_nth, // int thread_limit;
3141  __kmp_dflt_max_active_levels, // int max_active_levels; //internal control
3142  // for max_active_levels
3143  r_sched, // kmp_r_sched_t sched; //internal control for runtime schedule
3144  // {sched,chunk} pair
3145  __kmp_nested_proc_bind.bind_types[0],
3146  __kmp_default_device,
3147  NULL // struct kmp_internal_control *next;
3148  };
3149 
3150  return g_icvs;
3151 }
3152 
3153 static kmp_internal_control_t __kmp_get_x_global_icvs(const kmp_team_t *team) {
3154 
3155  kmp_internal_control_t gx_icvs;
3156  gx_icvs.serial_nesting_level =
3157  0; // probably =team->t.t_serial like in save_inter_controls
3158  copy_icvs(&gx_icvs, &team->t.t_threads[0]->th.th_current_task->td_icvs);
3159  gx_icvs.next = NULL;
3160 
3161  return gx_icvs;
3162 }
3163 
3164 static void __kmp_initialize_root(kmp_root_t *root) {
3165  int f;
3166  kmp_team_t *root_team;
3167  kmp_team_t *hot_team;
3168  int hot_team_max_nth;
3169  kmp_r_sched_t r_sched =
3170  __kmp_get_schedule_global(); // get current state of scheduling globals
3171  kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
3172  KMP_DEBUG_ASSERT(root);
3173  KMP_ASSERT(!root->r.r_begin);
3174 
3175  /* setup the root state structure */
3176  __kmp_init_lock(&root->r.r_begin_lock);
3177  root->r.r_begin = FALSE;
3178  root->r.r_active = FALSE;
3179  root->r.r_in_parallel = 0;
3180  root->r.r_blocktime = __kmp_dflt_blocktime;
3181 
3182  /* setup the root team for this task */
3183  /* allocate the root team structure */
3184  KF_TRACE(10, ("__kmp_initialize_root: before root_team\n"));
3185 
3186  root_team =
3187  __kmp_allocate_team(root,
3188  1, // new_nproc
3189  1, // max_nproc
3190 #if OMPT_SUPPORT
3191  ompt_data_none, // root parallel id
3192 #endif
3193  __kmp_nested_proc_bind.bind_types[0], &r_icvs,
3194  0 // argc
3195  USE_NESTED_HOT_ARG(NULL) // master thread is unknown
3196  );
3197 #if USE_DEBUGGER
3198  // Non-NULL value should be assigned to make the debugger display the root
3199  // team.
3200  TCW_SYNC_PTR(root_team->t.t_pkfn, (microtask_t)(~0));
3201 #endif
3202 
3203  KF_TRACE(10, ("__kmp_initialize_root: after root_team = %p\n", root_team));
3204 
3205  root->r.r_root_team = root_team;
3206  root_team->t.t_control_stack_top = NULL;
3207 
3208  /* initialize root team */
3209  root_team->t.t_threads[0] = NULL;
3210  root_team->t.t_nproc = 1;
3211  root_team->t.t_serialized = 1;
3212  // TODO???: root_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
3213  root_team->t.t_sched.sched = r_sched.sched;
3214  KA_TRACE(
3215  20,
3216  ("__kmp_initialize_root: init root team %d arrived: join=%u, plain=%u\n",
3217  root_team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
3218 
3219  /* setup the hot team for this task */
3220  /* allocate the hot team structure */
3221  KF_TRACE(10, ("__kmp_initialize_root: before hot_team\n"));
3222 
3223  hot_team =
3224  __kmp_allocate_team(root,
3225  1, // new_nproc
3226  __kmp_dflt_team_nth_ub * 2, // max_nproc
3227 #if OMPT_SUPPORT
3228  ompt_data_none, // root parallel id
3229 #endif
3230  __kmp_nested_proc_bind.bind_types[0], &r_icvs,
3231  0 // argc
3232  USE_NESTED_HOT_ARG(NULL) // master thread is unknown
3233  );
3234  KF_TRACE(10, ("__kmp_initialize_root: after hot_team = %p\n", hot_team));
3235 
3236  root->r.r_hot_team = hot_team;
3237  root_team->t.t_control_stack_top = NULL;
3238 
3239  /* first-time initialization */
3240  hot_team->t.t_parent = root_team;
3241 
3242  /* initialize hot team */
3243  hot_team_max_nth = hot_team->t.t_max_nproc;
3244  for (f = 0; f < hot_team_max_nth; ++f) {
3245  hot_team->t.t_threads[f] = NULL;
3246  }
3247  hot_team->t.t_nproc = 1;
3248  // TODO???: hot_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
3249  hot_team->t.t_sched.sched = r_sched.sched;
3250  hot_team->t.t_size_changed = 0;
3251 }
3252 
3253 #ifdef KMP_DEBUG
3254 
3255 typedef struct kmp_team_list_item {
3256  kmp_team_p const *entry;
3257  struct kmp_team_list_item *next;
3258 } kmp_team_list_item_t;
3259 typedef kmp_team_list_item_t *kmp_team_list_t;
3260 
3261 static void __kmp_print_structure_team_accum( // Add team to list of teams.
3262  kmp_team_list_t list, // List of teams.
3263  kmp_team_p const *team // Team to add.
3264  ) {
3265 
3266  // List must terminate with item where both entry and next are NULL.
3267  // Team is added to the list only once.
3268  // List is sorted in ascending order by team id.
3269  // Team id is *not* a key.
3270 
3271  kmp_team_list_t l;
3272 
3273  KMP_DEBUG_ASSERT(list != NULL);
3274  if (team == NULL) {
3275  return;
3276  }
3277 
3278  __kmp_print_structure_team_accum(list, team->t.t_parent);
3279  __kmp_print_structure_team_accum(list, team->t.t_next_pool);
3280 
3281  // Search list for the team.
3282  l = list;
3283  while (l->next != NULL && l->entry != team) {
3284  l = l->next;
3285  }
3286  if (l->next != NULL) {
3287  return; // Team has been added before, exit.
3288  }
3289 
3290  // Team is not found. Search list again for insertion point.
3291  l = list;
3292  while (l->next != NULL && l->entry->t.t_id <= team->t.t_id) {
3293  l = l->next;
3294  }
3295 
3296  // Insert team.
3297  {
3298  kmp_team_list_item_t *item = (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(
3299  sizeof(kmp_team_list_item_t));
3300  *item = *l;
3301  l->entry = team;
3302  l->next = item;
3303  }
3304 }
3305 
3306 static void __kmp_print_structure_team(char const *title, kmp_team_p const *team
3307 
3308  ) {
3309  __kmp_printf("%s", title);
3310  if (team != NULL) {
3311  __kmp_printf("%2x %p\n", team->t.t_id, team);
3312  } else {
3313  __kmp_printf(" - (nil)\n");
3314  }
3315 }
3316 
3317 static void __kmp_print_structure_thread(char const *title,
3318  kmp_info_p const *thread) {
3319  __kmp_printf("%s", title);
3320  if (thread != NULL) {
3321  __kmp_printf("%2d %p\n", thread->th.th_info.ds.ds_gtid, thread);
3322  } else {
3323  __kmp_printf(" - (nil)\n");
3324  }
3325 }
3326 
3327 void __kmp_print_structure(void) {
3328 
3329  kmp_team_list_t list;
3330 
3331  // Initialize list of teams.
3332  list =
3333  (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(sizeof(kmp_team_list_item_t));
3334  list->entry = NULL;
3335  list->next = NULL;
3336 
3337  __kmp_printf("\n------------------------------\nGlobal Thread "
3338  "Table\n------------------------------\n");
3339  {
3340  int gtid;
3341  for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3342  __kmp_printf("%2d", gtid);
3343  if (__kmp_threads != NULL) {
3344  __kmp_printf(" %p", __kmp_threads[gtid]);
3345  }
3346  if (__kmp_root != NULL) {
3347  __kmp_printf(" %p", __kmp_root[gtid]);
3348  }
3349  __kmp_printf("\n");
3350  }
3351  }
3352 
3353  // Print out __kmp_threads array.
3354  __kmp_printf("\n------------------------------\nThreads\n--------------------"
3355  "----------\n");
3356  if (__kmp_threads != NULL) {
3357  int gtid;
3358  for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3359  kmp_info_t const *thread = __kmp_threads[gtid];
3360  if (thread != NULL) {
3361  __kmp_printf("GTID %2d %p:\n", gtid, thread);
3362  __kmp_printf(" Our Root: %p\n", thread->th.th_root);
3363  __kmp_print_structure_team(" Our Team: ", thread->th.th_team);
3364  __kmp_print_structure_team(" Serial Team: ",
3365  thread->th.th_serial_team);
3366  __kmp_printf(" Threads: %2d\n", thread->th.th_team_nproc);
3367  __kmp_print_structure_thread(" Master: ",
3368  thread->th.th_team_master);
3369  __kmp_printf(" Serialized?: %2d\n", thread->th.th_team_serialized);
3370  __kmp_printf(" Set NProc: %2d\n", thread->th.th_set_nproc);
3371  __kmp_printf(" Set Proc Bind: %2d\n", thread->th.th_set_proc_bind);
3372  __kmp_print_structure_thread(" Next in pool: ",
3373  thread->th.th_next_pool);
3374  __kmp_printf("\n");
3375  __kmp_print_structure_team_accum(list, thread->th.th_team);
3376  __kmp_print_structure_team_accum(list, thread->th.th_serial_team);
3377  }
3378  }
3379  } else {
3380  __kmp_printf("Threads array is not allocated.\n");
3381  }
3382 
3383  // Print out __kmp_root array.
3384  __kmp_printf("\n------------------------------\nUbers\n----------------------"
3385  "--------\n");
3386  if (__kmp_root != NULL) {
3387  int gtid;
3388  for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3389  kmp_root_t const *root = __kmp_root[gtid];
3390  if (root != NULL) {
3391  __kmp_printf("GTID %2d %p:\n", gtid, root);
3392  __kmp_print_structure_team(" Root Team: ", root->r.r_root_team);
3393  __kmp_print_structure_team(" Hot Team: ", root->r.r_hot_team);
3394  __kmp_print_structure_thread(" Uber Thread: ",
3395  root->r.r_uber_thread);
3396  __kmp_printf(" Active?: %2d\n", root->r.r_active);
3397  __kmp_printf(" In Parallel: %2d\n",
3398  KMP_ATOMIC_LD_RLX(&root->r.r_in_parallel));
3399  __kmp_printf("\n");
3400  __kmp_print_structure_team_accum(list, root->r.r_root_team);
3401  __kmp_print_structure_team_accum(list, root->r.r_hot_team);
3402  }
3403  }
3404  } else {
3405  __kmp_printf("Ubers array is not allocated.\n");
3406  }
3407 
3408  __kmp_printf("\n------------------------------\nTeams\n----------------------"
3409  "--------\n");
3410  while (list->next != NULL) {
3411  kmp_team_p const *team = list->entry;
3412  int i;
3413  __kmp_printf("Team %2x %p:\n", team->t.t_id, team);
3414  __kmp_print_structure_team(" Parent Team: ", team->t.t_parent);
3415  __kmp_printf(" Master TID: %2d\n", team->t.t_master_tid);
3416  __kmp_printf(" Max threads: %2d\n", team->t.t_max_nproc);
3417  __kmp_printf(" Levels of serial: %2d\n", team->t.t_serialized);
3418  __kmp_printf(" Number threads: %2d\n", team->t.t_nproc);
3419  for (i = 0; i < team->t.t_nproc; ++i) {
3420  __kmp_printf(" Thread %2d: ", i);
3421  __kmp_print_structure_thread("", team->t.t_threads[i]);
3422  }
3423  __kmp_print_structure_team(" Next in pool: ", team->t.t_next_pool);
3424  __kmp_printf("\n");
3425  list = list->next;
3426  }
3427 
3428  // Print out __kmp_thread_pool and __kmp_team_pool.
3429  __kmp_printf("\n------------------------------\nPools\n----------------------"
3430  "--------\n");
3431  __kmp_print_structure_thread("Thread pool: ",
3432  CCAST(kmp_info_t *, __kmp_thread_pool));
3433  __kmp_print_structure_team("Team pool: ",
3434  CCAST(kmp_team_t *, __kmp_team_pool));
3435  __kmp_printf("\n");
3436 
3437  // Free team list.
3438  while (list != NULL) {
3439  kmp_team_list_item_t *item = list;
3440  list = list->next;
3441  KMP_INTERNAL_FREE(item);
3442  }
3443 }
3444 
3445 #endif
3446 
3447 //---------------------------------------------------------------------------
3448 // Stuff for per-thread fast random number generator
3449 // Table of primes
3450 static const unsigned __kmp_primes[] = {
3451  0x9e3779b1, 0xffe6cc59, 0x2109f6dd, 0x43977ab5, 0xba5703f5, 0xb495a877,
3452  0xe1626741, 0x79695e6b, 0xbc98c09f, 0xd5bee2b3, 0x287488f9, 0x3af18231,
3453  0x9677cd4d, 0xbe3a6929, 0xadc6a877, 0xdcf0674b, 0xbe4d6fe9, 0x5f15e201,
3454  0x99afc3fd, 0xf3f16801, 0xe222cfff, 0x24ba5fdb, 0x0620452d, 0x79f149e3,
3455  0xc8b93f49, 0x972702cd, 0xb07dd827, 0x6c97d5ed, 0x085a3d61, 0x46eb5ea7,
3456  0x3d9910ed, 0x2e687b5b, 0x29609227, 0x6eb081f1, 0x0954c4e1, 0x9d114db9,
3457  0x542acfa9, 0xb3e6bd7b, 0x0742d917, 0xe9f3ffa7, 0x54581edb, 0xf2480f45,
3458  0x0bb9288f, 0xef1affc7, 0x85fa0ca7, 0x3ccc14db, 0xe6baf34b, 0x343377f7,
3459  0x5ca19031, 0xe6d9293b, 0xf0a9f391, 0x5d2e980b, 0xfc411073, 0xc3749363,
3460  0xb892d829, 0x3549366b, 0x629750ad, 0xb98294e5, 0x892d9483, 0xc235baf3,
3461  0x3d2402a3, 0x6bdef3c9, 0xbec333cd, 0x40c9520f};
3462 
3463 //---------------------------------------------------------------------------
3464 // __kmp_get_random: Get a random number using a linear congruential method.
3465 unsigned short __kmp_get_random(kmp_info_t *thread) {
3466  unsigned x = thread->th.th_x;
3467  unsigned short r = (unsigned short)(x >> 16);
3468 
3469  thread->th.th_x = x * thread->th.th_a + 1;
3470 
3471  KA_TRACE(30, ("__kmp_get_random: THREAD: %d, RETURN: %u\n",
3472  thread->th.th_info.ds.ds_tid, r));
3473 
3474  return r;
3475 }
3476 //--------------------------------------------------------
3477 // __kmp_init_random: Initialize a random number generator
3478 void __kmp_init_random(kmp_info_t *thread) {
3479  unsigned seed = thread->th.th_info.ds.ds_tid;
3480 
3481  thread->th.th_a =
3482  __kmp_primes[seed % (sizeof(__kmp_primes) / sizeof(__kmp_primes[0]))];
3483  thread->th.th_x = (seed + 1) * thread->th.th_a + 1;
3484  KA_TRACE(30,
3485  ("__kmp_init_random: THREAD: %u; A: %u\n", seed, thread->th.th_a));
3486 }
3487 
3488 #if KMP_OS_WINDOWS
3489 /* reclaim array entries for root threads that are already dead, returns number
3490  * reclaimed */
3491 static int __kmp_reclaim_dead_roots(void) {
3492  int i, r = 0;
3493 
3494  for (i = 0; i < __kmp_threads_capacity; ++i) {
3495  if (KMP_UBER_GTID(i) &&
3496  !__kmp_still_running((kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[i])) &&
3497  !__kmp_root[i]
3498  ->r.r_active) { // AC: reclaim only roots died in non-active state
3499  r += __kmp_unregister_root_other_thread(i);
3500  }
3501  }
3502  return r;
3503 }
3504 #endif
3505 
3506 /* This function attempts to create free entries in __kmp_threads and
3507  __kmp_root, and returns the number of free entries generated.
3508 
3509  For Windows* OS static library, the first mechanism used is to reclaim array
3510  entries for root threads that are already dead.
3511 
3512  On all platforms, expansion is attempted on the arrays __kmp_threads_ and
3513  __kmp_root, with appropriate update to __kmp_threads_capacity. Array
3514  capacity is increased by doubling with clipping to __kmp_tp_capacity, if
3515  threadprivate cache array has been created. Synchronization with
3516  __kmpc_threadprivate_cached is done using __kmp_tp_cached_lock.
3517 
3518  After any dead root reclamation, if the clipping value allows array expansion
3519  to result in the generation of a total of nNeed free slots, the function does
3520  that expansion. If not, nothing is done beyond the possible initial root
3521  thread reclamation.
3522 
3523  If any argument is negative, the behavior is undefined. */
3524 static int __kmp_expand_threads(int nNeed) {
3525  int added = 0;
3526  int minimumRequiredCapacity;
3527  int newCapacity;
3528  kmp_info_t **newThreads;
3529  kmp_root_t **newRoot;
3530 
3531 // All calls to __kmp_expand_threads should be under __kmp_forkjoin_lock, so
3532 // resizing __kmp_threads does not need additional protection if foreign
3533 // threads are present
3534 
3535 #if KMP_OS_WINDOWS && !KMP_DYNAMIC_LIB
3536  /* only for Windows static library */
3537  /* reclaim array entries for root threads that are already dead */
3538  added = __kmp_reclaim_dead_roots();
3539 
3540  if (nNeed) {
3541  nNeed -= added;
3542  if (nNeed < 0)
3543  nNeed = 0;
3544  }
3545 #endif
3546  if (nNeed <= 0)
3547  return added;
3548 
3549  // Note that __kmp_threads_capacity is not bounded by __kmp_max_nth. If
3550  // __kmp_max_nth is set to some value less than __kmp_sys_max_nth by the
3551  // user via KMP_DEVICE_THREAD_LIMIT, then __kmp_threads_capacity may become
3552  // > __kmp_max_nth in one of two ways:
3553  //
3554  // 1) The initialization thread (gtid = 0) exits. __kmp_threads[0]
3555  // may not be reused by another thread, so we may need to increase
3556  // __kmp_threads_capacity to __kmp_max_nth + 1.
3557  //
3558  // 2) New foreign root(s) are encountered. We always register new foreign
3559  // roots. This may cause a smaller # of threads to be allocated at
3560  // subsequent parallel regions, but the worker threads hang around (and
3561  // eventually go to sleep) and need slots in the __kmp_threads[] array.
3562  //
3563  // Anyway, that is the reason for moving the check to see if
3564  // __kmp_max_nth was exceeded into __kmp_reserve_threads()
3565  // instead of having it performed here. -BB
3566 
3567  KMP_DEBUG_ASSERT(__kmp_sys_max_nth >= __kmp_threads_capacity);
3568 
3569  /* compute expansion headroom to check if we can expand */
3570  if (__kmp_sys_max_nth - __kmp_threads_capacity < nNeed) {
3571  /* possible expansion too small -- give up */
3572  return added;
3573  }
3574  minimumRequiredCapacity = __kmp_threads_capacity + nNeed;
3575 
3576  newCapacity = __kmp_threads_capacity;
3577  do {
3578  newCapacity = newCapacity <= (__kmp_sys_max_nth >> 1) ? (newCapacity << 1)
3579  : __kmp_sys_max_nth;
3580  } while (newCapacity < minimumRequiredCapacity);
3581  newThreads = (kmp_info_t **)__kmp_allocate(
3582  (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * newCapacity + CACHE_LINE);
3583  newRoot =
3584  (kmp_root_t **)((char *)newThreads + sizeof(kmp_info_t *) * newCapacity);
3585  KMP_MEMCPY(newThreads, __kmp_threads,
3586  __kmp_threads_capacity * sizeof(kmp_info_t *));
3587  KMP_MEMCPY(newRoot, __kmp_root,
3588  __kmp_threads_capacity * sizeof(kmp_root_t *));
3589 
3590  kmp_info_t **temp_threads = __kmp_threads;
3591  *(kmp_info_t * *volatile *)&__kmp_threads = newThreads;
3592  *(kmp_root_t * *volatile *)&__kmp_root = newRoot;
3593  __kmp_free(temp_threads);
3594  added += newCapacity - __kmp_threads_capacity;
3595  *(volatile int *)&__kmp_threads_capacity = newCapacity;
3596 
3597  if (newCapacity > __kmp_tp_capacity) {
3598  __kmp_acquire_bootstrap_lock(&__kmp_tp_cached_lock);
3599  if (__kmp_tp_cached && newCapacity > __kmp_tp_capacity) {
3600  __kmp_threadprivate_resize_cache(newCapacity);
3601  } else { // increase __kmp_tp_capacity to correspond with kmp_threads size
3602  *(volatile int *)&__kmp_tp_capacity = newCapacity;
3603  }
3604  __kmp_release_bootstrap_lock(&__kmp_tp_cached_lock);
3605  }
3606 
3607  return added;
3608 }
3609 
3610 /* Register the current thread as a root thread and obtain our gtid. We must
3611  have the __kmp_initz_lock held at this point. Argument TRUE only if are the
3612  thread that calls from __kmp_do_serial_initialize() */
3613 int __kmp_register_root(int initial_thread) {
3614  kmp_info_t *root_thread;
3615  kmp_root_t *root;
3616  int gtid;
3617  int capacity;
3618  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
3619  KA_TRACE(20, ("__kmp_register_root: entered\n"));
3620  KMP_MB();
3621 
3622  /* 2007-03-02:
3623  If initial thread did not invoke OpenMP RTL yet, and this thread is not an
3624  initial one, "__kmp_all_nth >= __kmp_threads_capacity" condition does not
3625  work as expected -- it may return false (that means there is at least one
3626  empty slot in __kmp_threads array), but it is possible the only free slot
3627  is #0, which is reserved for initial thread and so cannot be used for this
3628  one. Following code workarounds this bug.
3629 
3630  However, right solution seems to be not reserving slot #0 for initial
3631  thread because:
3632  (1) there is no magic in slot #0,
3633  (2) we cannot detect initial thread reliably (the first thread which does
3634  serial initialization may be not a real initial thread).
3635  */
3636  capacity = __kmp_threads_capacity;
3637  if (!initial_thread && TCR_PTR(__kmp_threads[0]) == NULL) {
3638  --capacity;
3639  }
3640 
3641  // If it is not for initializing the hidden helper team, we need to take
3642  // __kmp_hidden_helper_threads_num out of the capacity because it is included
3643  // in __kmp_threads_capacity.
3644  if (__kmp_enable_hidden_helper && !TCR_4(__kmp_init_hidden_helper_threads)) {
3645  capacity -= __kmp_hidden_helper_threads_num;
3646  }
3647 
3648  /* see if there are too many threads */
3649  if (__kmp_all_nth >= capacity && !__kmp_expand_threads(1)) {
3650  if (__kmp_tp_cached) {
3651  __kmp_fatal(KMP_MSG(CantRegisterNewThread),
3652  KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity),
3653  KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null);
3654  } else {
3655  __kmp_fatal(KMP_MSG(CantRegisterNewThread), KMP_HNT(SystemLimitOnThreads),
3656  __kmp_msg_null);
3657  }
3658  }
3659 
3660  // When hidden helper task is enabled, __kmp_threads is organized as follows:
3661  // 0: initial thread, also a regular OpenMP thread.
3662  // [1, __kmp_hidden_helper_threads_num]: slots for hidden helper threads.
3663  // [__kmp_hidden_helper_threads_num + 1, __kmp_threads_capacity): slots for
3664  // regular OpenMP threads.
3665  if (TCR_4(__kmp_init_hidden_helper_threads)) {
3666  // Find an available thread slot for hidden helper thread. Slots for hidden
3667  // helper threads start from 1 to __kmp_hidden_helper_threads_num.
3668  for (gtid = 1; TCR_PTR(__kmp_threads[gtid]) != NULL &&
3669  gtid <= __kmp_hidden_helper_threads_num;
3670  gtid++)
3671  ;
3672  KMP_ASSERT(gtid <= __kmp_hidden_helper_threads_num);
3673  KA_TRACE(1, ("__kmp_register_root: found slot in threads array for "
3674  "hidden helper thread: T#%d\n",
3675  gtid));
3676  } else {
3677  /* find an available thread slot */
3678  // Don't reassign the zero slot since we need that to only be used by
3679  // initial thread. Slots for hidden helper threads should also be skipped.
3680  if (initial_thread && TCR_PTR(__kmp_threads[0]) == NULL) {
3681  gtid = 0;
3682  } else {
3683  for (gtid = __kmp_hidden_helper_threads_num + 1;
3684  TCR_PTR(__kmp_threads[gtid]) != NULL; gtid++)
3685  ;
3686  }
3687  KA_TRACE(
3688  1, ("__kmp_register_root: found slot in threads array: T#%d\n", gtid));
3689  KMP_ASSERT(gtid < __kmp_threads_capacity);
3690  }
3691 
3692  /* update global accounting */
3693  __kmp_all_nth++;
3694  TCW_4(__kmp_nth, __kmp_nth + 1);
3695 
3696  // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low
3697  // numbers of procs, and method #2 (keyed API call) for higher numbers.
3698  if (__kmp_adjust_gtid_mode) {
3699  if (__kmp_all_nth >= __kmp_tls_gtid_min) {
3700  if (TCR_4(__kmp_gtid_mode) != 2) {
3701  TCW_4(__kmp_gtid_mode, 2);
3702  }
3703  } else {
3704  if (TCR_4(__kmp_gtid_mode) != 1) {
3705  TCW_4(__kmp_gtid_mode, 1);
3706  }
3707  }
3708  }
3709 
3710 #ifdef KMP_ADJUST_BLOCKTIME
3711  /* Adjust blocktime to zero if necessary */
3712  /* Middle initialization might not have occurred yet */
3713  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
3714  if (__kmp_nth > __kmp_avail_proc) {
3715  __kmp_zero_bt = TRUE;
3716  }
3717  }
3718 #endif /* KMP_ADJUST_BLOCKTIME */
3719 
3720  /* setup this new hierarchy */
3721  if (!(root = __kmp_root[gtid])) {
3722  root = __kmp_root[gtid] = (kmp_root_t *)__kmp_allocate(sizeof(kmp_root_t));
3723  KMP_DEBUG_ASSERT(!root->r.r_root_team);
3724  }
3725 
3726 #if KMP_STATS_ENABLED
3727  // Initialize stats as soon as possible (right after gtid assignment).
3728  __kmp_stats_thread_ptr = __kmp_stats_list->push_back(gtid);
3729  __kmp_stats_thread_ptr->startLife();
3730  KMP_SET_THREAD_STATE(SERIAL_REGION);
3731  KMP_INIT_PARTITIONED_TIMERS(OMP_serial);
3732 #endif
3733  __kmp_initialize_root(root);
3734 
3735  /* setup new root thread structure */
3736  if (root->r.r_uber_thread) {
3737  root_thread = root->r.r_uber_thread;
3738  } else {
3739  root_thread = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t));
3740  if (__kmp_storage_map) {
3741  __kmp_print_thread_storage_map(root_thread, gtid);
3742  }
3743  root_thread->th.th_info.ds.ds_gtid = gtid;
3744 #if OMPT_SUPPORT
3745  root_thread->th.ompt_thread_info.thread_data = ompt_data_none;
3746 #endif
3747  root_thread->th.th_root = root;
3748  if (__kmp_env_consistency_check) {
3749  root_thread->th.th_cons = __kmp_allocate_cons_stack(gtid);
3750  }
3751 #if USE_FAST_MEMORY
3752  __kmp_initialize_fast_memory(root_thread);
3753 #endif /* USE_FAST_MEMORY */
3754 
3755 #if KMP_USE_BGET
3756  KMP_DEBUG_ASSERT(root_thread->th.th_local.bget_data == NULL);
3757  __kmp_initialize_bget(root_thread);
3758 #endif
3759  __kmp_init_random(root_thread); // Initialize random number generator
3760  }
3761 
3762  /* setup the serial team held in reserve by the root thread */
3763  if (!root_thread->th.th_serial_team) {
3764  kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
3765  KF_TRACE(10, ("__kmp_register_root: before serial_team\n"));
3766  root_thread->th.th_serial_team = __kmp_allocate_team(
3767  root, 1, 1,
3768 #if OMPT_SUPPORT
3769  ompt_data_none, // root parallel id
3770 #endif
3771  proc_bind_default, &r_icvs, 0 USE_NESTED_HOT_ARG(NULL));
3772  }
3773  KMP_ASSERT(root_thread->th.th_serial_team);
3774  KF_TRACE(10, ("__kmp_register_root: after serial_team = %p\n",
3775  root_thread->th.th_serial_team));
3776 
3777  /* drop root_thread into place */
3778  TCW_SYNC_PTR(__kmp_threads[gtid], root_thread);
3779 
3780  root->r.r_root_team->t.t_threads[0] = root_thread;
3781  root->r.r_hot_team->t.t_threads[0] = root_thread;
3782  root_thread->th.th_serial_team->t.t_threads[0] = root_thread;
3783  // AC: the team created in reserve, not for execution (it is unused for now).
3784  root_thread->th.th_serial_team->t.t_serialized = 0;
3785  root->r.r_uber_thread = root_thread;
3786 
3787  /* initialize the thread, get it ready to go */
3788  __kmp_initialize_info(root_thread, root->r.r_root_team, 0, gtid);
3789  TCW_4(__kmp_init_gtid, TRUE);
3790 
3791  /* prepare the master thread for get_gtid() */
3792  __kmp_gtid_set_specific(gtid);
3793 
3794 #if USE_ITT_BUILD
3795  __kmp_itt_thread_name(gtid);
3796 #endif /* USE_ITT_BUILD */
3797 
3798 #ifdef KMP_TDATA_GTID
3799  __kmp_gtid = gtid;
3800 #endif
3801  __kmp_create_worker(gtid, root_thread, __kmp_stksize);
3802  KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == gtid);
3803 
3804  KA_TRACE(20, ("__kmp_register_root: T#%d init T#%d(%d:%d) arrived: join=%u, "
3805  "plain=%u\n",
3806  gtid, __kmp_gtid_from_tid(0, root->r.r_hot_team),
3807  root->r.r_hot_team->t.t_id, 0, KMP_INIT_BARRIER_STATE,
3808  KMP_INIT_BARRIER_STATE));
3809  { // Initialize barrier data.
3810  int b;
3811  for (b = 0; b < bs_last_barrier; ++b) {
3812  root_thread->th.th_bar[b].bb.b_arrived = KMP_INIT_BARRIER_STATE;
3813 #if USE_DEBUGGER
3814  root_thread->th.th_bar[b].bb.b_worker_arrived = 0;
3815 #endif
3816  }
3817  }
3818  KMP_DEBUG_ASSERT(root->r.r_hot_team->t.t_bar[bs_forkjoin_barrier].b_arrived ==
3819  KMP_INIT_BARRIER_STATE);
3820 
3821 #if KMP_AFFINITY_SUPPORTED
3822  root_thread->th.th_current_place = KMP_PLACE_UNDEFINED;
3823  root_thread->th.th_new_place = KMP_PLACE_UNDEFINED;
3824  root_thread->th.th_first_place = KMP_PLACE_UNDEFINED;
3825  root_thread->th.th_last_place = KMP_PLACE_UNDEFINED;
3826  if (TCR_4(__kmp_init_middle)) {
3827  __kmp_affinity_set_init_mask(gtid, TRUE);
3828  }
3829 #endif /* KMP_AFFINITY_SUPPORTED */
3830  root_thread->th.th_def_allocator = __kmp_def_allocator;
3831  root_thread->th.th_prev_level = 0;
3832  root_thread->th.th_prev_num_threads = 1;
3833 
3834  kmp_cg_root_t *tmp = (kmp_cg_root_t *)__kmp_allocate(sizeof(kmp_cg_root_t));
3835  tmp->cg_root = root_thread;
3836  tmp->cg_thread_limit = __kmp_cg_max_nth;
3837  tmp->cg_nthreads = 1;
3838  KA_TRACE(100, ("__kmp_register_root: Thread %p created node %p with"
3839  " cg_nthreads init to 1\n",
3840  root_thread, tmp));
3841  tmp->up = NULL;
3842  root_thread->th.th_cg_roots = tmp;
3843 
3844  __kmp_root_counter++;
3845 
3846 #if OMPT_SUPPORT
3847  if (!initial_thread && ompt_enabled.enabled) {
3848 
3849  kmp_info_t *root_thread = ompt_get_thread();
3850 
3851  ompt_set_thread_state(root_thread, ompt_state_overhead);
3852 
3853  if (ompt_enabled.ompt_callback_thread_begin) {
3854  ompt_callbacks.ompt_callback(ompt_callback_thread_begin)(
3855  ompt_thread_initial, __ompt_get_thread_data_internal());
3856  }
3857  ompt_data_t *task_data;
3858  ompt_data_t *parallel_data;
3859  __ompt_get_task_info_internal(0, NULL, &task_data, NULL, &parallel_data, NULL);
3860  if (ompt_enabled.ompt_callback_implicit_task) {
3861  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
3862  ompt_scope_begin, parallel_data, task_data, 1, 1, ompt_task_initial);
3863  }
3864 
3865  ompt_set_thread_state(root_thread, ompt_state_work_serial);
3866  }
3867 #endif
3868 
3869  KMP_MB();
3870  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
3871 
3872  return gtid;
3873 }
3874 
3875 #if KMP_NESTED_HOT_TEAMS
3876 static int __kmp_free_hot_teams(kmp_root_t *root, kmp_info_t *thr, int level,
3877  const int max_level) {
3878  int i, n, nth;
3879  kmp_hot_team_ptr_t *hot_teams = thr->th.th_hot_teams;
3880  if (!hot_teams || !hot_teams[level].hot_team) {
3881  return 0;
3882  }
3883  KMP_DEBUG_ASSERT(level < max_level);
3884  kmp_team_t *team = hot_teams[level].hot_team;
3885  nth = hot_teams[level].hot_team_nth;
3886  n = nth - 1; // master is not freed
3887  if (level < max_level - 1) {
3888  for (i = 0; i < nth; ++i) {
3889  kmp_info_t *th = team->t.t_threads[i];
3890  n += __kmp_free_hot_teams(root, th, level + 1, max_level);
3891  if (i > 0 && th->th.th_hot_teams) {
3892  __kmp_free(th->th.th_hot_teams);
3893  th->th.th_hot_teams = NULL;
3894  }
3895  }
3896  }
3897  __kmp_free_team(root, team, NULL);
3898  return n;
3899 }
3900 #endif
3901 
3902 // Resets a root thread and clear its root and hot teams.
3903 // Returns the number of __kmp_threads entries directly and indirectly freed.
3904 static int __kmp_reset_root(int gtid, kmp_root_t *root) {
3905  kmp_team_t *root_team = root->r.r_root_team;
3906  kmp_team_t *hot_team = root->r.r_hot_team;
3907  int n = hot_team->t.t_nproc;
3908  int i;
3909 
3910  KMP_DEBUG_ASSERT(!root->r.r_active);
3911 
3912  root->r.r_root_team = NULL;
3913  root->r.r_hot_team = NULL;
3914  // __kmp_free_team() does not free hot teams, so we have to clear r_hot_team
3915  // before call to __kmp_free_team().
3916  __kmp_free_team(root, root_team USE_NESTED_HOT_ARG(NULL));
3917 #if KMP_NESTED_HOT_TEAMS
3918  if (__kmp_hot_teams_max_level >
3919  0) { // need to free nested hot teams and their threads if any
3920  for (i = 0; i < hot_team->t.t_nproc; ++i) {
3921  kmp_info_t *th = hot_team->t.t_threads[i];
3922  if (__kmp_hot_teams_max_level > 1) {
3923  n += __kmp_free_hot_teams(root, th, 1, __kmp_hot_teams_max_level);
3924  }
3925  if (th->th.th_hot_teams) {
3926  __kmp_free(th->th.th_hot_teams);
3927  th->th.th_hot_teams = NULL;
3928  }
3929  }
3930  }
3931 #endif
3932  __kmp_free_team(root, hot_team USE_NESTED_HOT_ARG(NULL));
3933 
3934  // Before we can reap the thread, we need to make certain that all other
3935  // threads in the teams that had this root as ancestor have stopped trying to
3936  // steal tasks.
3937  if (__kmp_tasking_mode != tskm_immediate_exec) {
3938  __kmp_wait_to_unref_task_teams();
3939  }
3940 
3941 #if KMP_OS_WINDOWS
3942  /* Close Handle of root duplicated in __kmp_create_worker (tr #62919) */
3943  KA_TRACE(
3944  10, ("__kmp_reset_root: free handle, th = %p, handle = %" KMP_UINTPTR_SPEC
3945  "\n",
3946  (LPVOID) & (root->r.r_uber_thread->th),
3947  root->r.r_uber_thread->th.th_info.ds.ds_thread));
3948  __kmp_free_handle(root->r.r_uber_thread->th.th_info.ds.ds_thread);
3949 #endif /* KMP_OS_WINDOWS */
3950 
3951 #if OMPT_SUPPORT
3952  ompt_data_t *task_data;
3953  ompt_data_t *parallel_data;
3954  __ompt_get_task_info_internal(0, NULL, &task_data, NULL, &parallel_data, NULL);
3955  if (ompt_enabled.ompt_callback_implicit_task) {
3956  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
3957  ompt_scope_end, parallel_data, task_data, 0, 1, ompt_task_initial);
3958  }
3959  if (ompt_enabled.ompt_callback_thread_end) {
3960  ompt_callbacks.ompt_callback(ompt_callback_thread_end)(
3961  &(root->r.r_uber_thread->th.ompt_thread_info.thread_data));
3962  }
3963 #endif
3964 
3965  TCW_4(__kmp_nth,
3966  __kmp_nth - 1); // __kmp_reap_thread will decrement __kmp_all_nth.
3967  i = root->r.r_uber_thread->th.th_cg_roots->cg_nthreads--;
3968  KA_TRACE(100, ("__kmp_reset_root: Thread %p decrement cg_nthreads on node %p"
3969  " to %d\n",
3970  root->r.r_uber_thread, root->r.r_uber_thread->th.th_cg_roots,
3971  root->r.r_uber_thread->th.th_cg_roots->cg_nthreads));
3972  if (i == 1) {
3973  // need to free contention group structure
3974  KMP_DEBUG_ASSERT(root->r.r_uber_thread ==
3975  root->r.r_uber_thread->th.th_cg_roots->cg_root);
3976  KMP_DEBUG_ASSERT(root->r.r_uber_thread->th.th_cg_roots->up == NULL);
3977  __kmp_free(root->r.r_uber_thread->th.th_cg_roots);
3978  root->r.r_uber_thread->th.th_cg_roots = NULL;
3979  }
3980  __kmp_reap_thread(root->r.r_uber_thread, 1);
3981 
3982  // We canot put root thread to __kmp_thread_pool, so we have to reap it
3983  // instead of freeing.
3984  root->r.r_uber_thread = NULL;
3985  /* mark root as no longer in use */
3986  root->r.r_begin = FALSE;
3987 
3988  return n;
3989 }
3990 
3991 void __kmp_unregister_root_current_thread(int gtid) {
3992  KA_TRACE(1, ("__kmp_unregister_root_current_thread: enter T#%d\n", gtid));
3993  /* this lock should be ok, since unregister_root_current_thread is never
3994  called during an abort, only during a normal close. furthermore, if you
3995  have the forkjoin lock, you should never try to get the initz lock */
3996  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
3997  if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
3998  KC_TRACE(10, ("__kmp_unregister_root_current_thread: already finished, "
3999  "exiting T#%d\n",
4000  gtid));
4001  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
4002  return;
4003  }
4004  kmp_root_t *root = __kmp_root[gtid];
4005 
4006  KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
4007  KMP_ASSERT(KMP_UBER_GTID(gtid));
4008  KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root);
4009  KMP_ASSERT(root->r.r_active == FALSE);
4010 
4011  KMP_MB();
4012 
4013  kmp_info_t *thread = __kmp_threads[gtid];
4014  kmp_team_t *team = thread->th.th_team;
4015  kmp_task_team_t *task_team = thread->th.th_task_team;
4016 
4017  // we need to wait for the proxy tasks before finishing the thread
4018  if (task_team != NULL && task_team->tt.tt_found_proxy_tasks) {
4019 #if OMPT_SUPPORT
4020  // the runtime is shutting down so we won't report any events
4021  thread->th.ompt_thread_info.state = ompt_state_undefined;
4022 #endif
4023  __kmp_task_team_wait(thread, team USE_ITT_BUILD_ARG(NULL));
4024  }
4025 
4026  __kmp_reset_root(gtid, root);
4027 
4028  KMP_MB();
4029  KC_TRACE(10,
4030  ("__kmp_unregister_root_current_thread: T#%d unregistered\n", gtid));
4031 
4032  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
4033 }
4034 
4035 #if KMP_OS_WINDOWS
4036 /* __kmp_forkjoin_lock must be already held
4037  Unregisters a root thread that is not the current thread. Returns the number
4038  of __kmp_threads entries freed as a result. */
4039 static int __kmp_unregister_root_other_thread(int gtid) {
4040  kmp_root_t *root = __kmp_root[gtid];
4041  int r;
4042 
4043  KA_TRACE(1, ("__kmp_unregister_root_other_thread: enter T#%d\n", gtid));
4044  KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
4045  KMP_ASSERT(KMP_UBER_GTID(gtid));
4046  KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root);
4047  KMP_ASSERT(root->r.r_active == FALSE);
4048 
4049  r = __kmp_reset_root(gtid, root);
4050  KC_TRACE(10,
4051  ("__kmp_unregister_root_other_thread: T#%d unregistered\n", gtid));
4052  return r;
4053 }
4054 #endif
4055 
4056 #if KMP_DEBUG
4057 void __kmp_task_info() {
4058 
4059  kmp_int32 gtid = __kmp_entry_gtid();
4060  kmp_int32 tid = __kmp_tid_from_gtid(gtid);
4061  kmp_info_t *this_thr = __kmp_threads[gtid];
4062  kmp_team_t *steam = this_thr->th.th_serial_team;
4063  kmp_team_t *team = this_thr->th.th_team;
4064 
4065  __kmp_printf(
4066  "__kmp_task_info: gtid=%d tid=%d t_thread=%p team=%p steam=%p curtask=%p "
4067  "ptask=%p\n",
4068  gtid, tid, this_thr, team, steam, this_thr->th.th_current_task,
4069  team->t.t_implicit_task_taskdata[tid].td_parent);
4070 }
4071 #endif // KMP_DEBUG
4072 
4073 /* TODO optimize with one big memclr, take out what isn't needed, split
4074  responsibility to workers as much as possible, and delay initialization of
4075  features as much as possible */
4076 static void __kmp_initialize_info(kmp_info_t *this_thr, kmp_team_t *team,
4077  int tid, int gtid) {
4078  /* this_thr->th.th_info.ds.ds_gtid is setup in
4079  kmp_allocate_thread/create_worker.
4080  this_thr->th.th_serial_team is setup in __kmp_allocate_thread */
4081  kmp_info_t *master = team->t.t_threads[0];
4082  KMP_DEBUG_ASSERT(this_thr != NULL);
4083  KMP_DEBUG_ASSERT(this_thr->th.th_serial_team);
4084  KMP_DEBUG_ASSERT(team);
4085  KMP_DEBUG_ASSERT(team->t.t_threads);
4086  KMP_DEBUG_ASSERT(team->t.t_dispatch);
4087  KMP_DEBUG_ASSERT(master);
4088  KMP_DEBUG_ASSERT(master->th.th_root);
4089 
4090  KMP_MB();
4091 
4092  TCW_SYNC_PTR(this_thr->th.th_team, team);
4093 
4094  this_thr->th.th_info.ds.ds_tid = tid;
4095  this_thr->th.th_set_nproc = 0;
4096  if (__kmp_tasking_mode != tskm_immediate_exec)
4097  // When tasking is possible, threads are not safe to reap until they are
4098  // done tasking; this will be set when tasking code is exited in wait
4099  this_thr->th.th_reap_state = KMP_NOT_SAFE_TO_REAP;
4100  else // no tasking --> always safe to reap
4101  this_thr->th.th_reap_state = KMP_SAFE_TO_REAP;
4102  this_thr->th.th_set_proc_bind = proc_bind_default;
4103 #if KMP_AFFINITY_SUPPORTED
4104  this_thr->th.th_new_place = this_thr->th.th_current_place;
4105 #endif
4106  this_thr->th.th_root = master->th.th_root;
4107 
4108  /* setup the thread's cache of the team structure */
4109  this_thr->th.th_team_nproc = team->t.t_nproc;
4110  this_thr->th.th_team_master = master;
4111  this_thr->th.th_team_serialized = team->t.t_serialized;
4112  TCW_PTR(this_thr->th.th_sleep_loc, NULL);
4113 
4114  KMP_DEBUG_ASSERT(team->t.t_implicit_task_taskdata);
4115 
4116  KF_TRACE(10, ("__kmp_initialize_info1: T#%d:%d this_thread=%p curtask=%p\n",
4117  tid, gtid, this_thr, this_thr->th.th_current_task));
4118 
4119  __kmp_init_implicit_task(this_thr->th.th_team_master->th.th_ident, this_thr,
4120  team, tid, TRUE);
4121 
4122  KF_TRACE(10, ("__kmp_initialize_info2: T#%d:%d this_thread=%p curtask=%p\n",
4123  tid, gtid, this_thr, this_thr->th.th_current_task));
4124  // TODO: Initialize ICVs from parent; GEH - isn't that already done in
4125  // __kmp_initialize_team()?
4126 
4127  /* TODO no worksharing in speculative threads */
4128  this_thr->th.th_dispatch = &team->t.t_dispatch[tid];
4129 
4130  this_thr->th.th_local.this_construct = 0;
4131 
4132  if (!this_thr->th.th_pri_common) {
4133  this_thr->th.th_pri_common =
4134  (struct common_table *)__kmp_allocate(sizeof(struct common_table));
4135  if (__kmp_storage_map) {
4136  __kmp_print_storage_map_gtid(
4137  gtid, this_thr->th.th_pri_common, this_thr->th.th_pri_common + 1,
4138  sizeof(struct common_table), "th_%d.th_pri_common\n", gtid);
4139  }
4140  this_thr->th.th_pri_head = NULL;
4141  }
4142 
4143  if (this_thr != master && // Master's CG root is initialized elsewhere
4144  this_thr->th.th_cg_roots != master->th.th_cg_roots) { // CG root not set
4145  // Make new thread's CG root same as master's
4146  KMP_DEBUG_ASSERT(master->th.th_cg_roots);
4147  kmp_cg_root_t *tmp = this_thr->th.th_cg_roots;
4148  if (tmp) {
4149  // worker changes CG, need to check if old CG should be freed
4150  int i = tmp->cg_nthreads--;
4151  KA_TRACE(100, ("__kmp_initialize_info: Thread %p decrement cg_nthreads"
4152  " on node %p of thread %p to %d\n",
4153  this_thr, tmp, tmp->cg_root, tmp->cg_nthreads));
4154  if (i == 1) {
4155  __kmp_free(tmp); // last thread left CG --> free it
4156  }
4157  }
4158  this_thr->th.th_cg_roots = master->th.th_cg_roots;
4159  // Increment new thread's CG root's counter to add the new thread
4160  this_thr->th.th_cg_roots->cg_nthreads++;
4161  KA_TRACE(100, ("__kmp_initialize_info: Thread %p increment cg_nthreads on"
4162  " node %p of thread %p to %d\n",
4163  this_thr, this_thr->th.th_cg_roots,
4164  this_thr->th.th_cg_roots->cg_root,
4165  this_thr->th.th_cg_roots->cg_nthreads));
4166  this_thr->th.th_current_task->td_icvs.thread_limit =
4167  this_thr->th.th_cg_roots->cg_thread_limit;
4168  }
4169 
4170  /* Initialize dynamic dispatch */
4171  {
4172  volatile kmp_disp_t *dispatch = this_thr->th.th_dispatch;
4173  // Use team max_nproc since this will never change for the team.
4174  size_t disp_size =
4175  sizeof(dispatch_private_info_t) *
4176  (team->t.t_max_nproc == 1 ? 1 : __kmp_dispatch_num_buffers);
4177  KD_TRACE(10, ("__kmp_initialize_info: T#%d max_nproc: %d\n", gtid,
4178  team->t.t_max_nproc));
4179  KMP_ASSERT(dispatch);
4180  KMP_DEBUG_ASSERT(team->t.t_dispatch);
4181  KMP_DEBUG_ASSERT(dispatch == &team->t.t_dispatch[tid]);
4182 
4183  dispatch->th_disp_index = 0;
4184  dispatch->th_doacross_buf_idx = 0;
4185  if (!dispatch->th_disp_buffer) {
4186  dispatch->th_disp_buffer =
4187  (dispatch_private_info_t *)__kmp_allocate(disp_size);
4188 
4189  if (__kmp_storage_map) {
4190  __kmp_print_storage_map_gtid(
4191  gtid, &dispatch->th_disp_buffer[0],
4192  &dispatch->th_disp_buffer[team->t.t_max_nproc == 1
4193  ? 1
4194  : __kmp_dispatch_num_buffers],
4195  disp_size, "th_%d.th_dispatch.th_disp_buffer "
4196  "(team_%d.t_dispatch[%d].th_disp_buffer)",
4197  gtid, team->t.t_id, gtid);
4198  }
4199  } else {
4200  memset(&dispatch->th_disp_buffer[0], '\0', disp_size);
4201  }
4202 
4203  dispatch->th_dispatch_pr_current = 0;
4204  dispatch->th_dispatch_sh_current = 0;
4205 
4206  dispatch->th_deo_fcn = 0; /* ORDERED */
4207  dispatch->th_dxo_fcn = 0; /* END ORDERED */
4208  }
4209 
4210  this_thr->th.th_next_pool = NULL;
4211 
4212  if (!this_thr->th.th_task_state_memo_stack) {
4213  size_t i;
4214  this_thr->th.th_task_state_memo_stack =
4215  (kmp_uint8 *)__kmp_allocate(4 * sizeof(kmp_uint8));
4216  this_thr->th.th_task_state_top = 0;
4217  this_thr->th.th_task_state_stack_sz = 4;
4218  for (i = 0; i < this_thr->th.th_task_state_stack_sz;
4219  ++i) // zero init the stack
4220  this_thr->th.th_task_state_memo_stack[i] = 0;
4221  }
4222 
4223  KMP_DEBUG_ASSERT(!this_thr->th.th_spin_here);
4224  KMP_DEBUG_ASSERT(this_thr->th.th_next_waiting == 0);
4225 
4226  KMP_MB();
4227 }
4228 
4229 /* allocate a new thread for the requesting team. this is only called from
4230  within a forkjoin critical section. we will first try to get an available
4231  thread from the thread pool. if none is available, we will fork a new one
4232  assuming we are able to create a new one. this should be assured, as the
4233  caller should check on this first. */
4234 kmp_info_t *__kmp_allocate_thread(kmp_root_t *root, kmp_team_t *team,
4235  int new_tid) {
4236  kmp_team_t *serial_team;
4237  kmp_info_t *new_thr;
4238  int new_gtid;
4239 
4240  KA_TRACE(20, ("__kmp_allocate_thread: T#%d\n", __kmp_get_gtid()));
4241  KMP_DEBUG_ASSERT(root && team);
4242 #if !KMP_NESTED_HOT_TEAMS
4243  KMP_DEBUG_ASSERT(KMP_MASTER_GTID(__kmp_get_gtid()));
4244 #endif
4245  KMP_MB();
4246 
4247  /* first, try to get one from the thread pool */
4248  if (__kmp_thread_pool) {
4249  new_thr = CCAST(kmp_info_t *, __kmp_thread_pool);
4250  __kmp_thread_pool = (volatile kmp_info_t *)new_thr->th.th_next_pool;
4251  if (new_thr == __kmp_thread_pool_insert_pt) {
4252  __kmp_thread_pool_insert_pt = NULL;
4253  }
4254  TCW_4(new_thr->th.th_in_pool, FALSE);
4255  __kmp_suspend_initialize_thread(new_thr);
4256  __kmp_lock_suspend_mx(new_thr);
4257  if (new_thr->th.th_active_in_pool == TRUE) {
4258  KMP_DEBUG_ASSERT(new_thr->th.th_active == TRUE);
4259  KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth);
4260  new_thr->th.th_active_in_pool = FALSE;
4261  }
4262  __kmp_unlock_suspend_mx(new_thr);
4263 
4264  KA_TRACE(20, ("__kmp_allocate_thread: T#%d using thread T#%d\n",
4265  __kmp_get_gtid(), new_thr->th.th_info.ds.ds_gtid));
4266  KMP_ASSERT(!new_thr->th.th_team);
4267  KMP_DEBUG_ASSERT(__kmp_nth < __kmp_threads_capacity);
4268 
4269  /* setup the thread structure */
4270  __kmp_initialize_info(new_thr, team, new_tid,
4271  new_thr->th.th_info.ds.ds_gtid);
4272  KMP_DEBUG_ASSERT(new_thr->th.th_serial_team);
4273 
4274  TCW_4(__kmp_nth, __kmp_nth + 1);
4275 
4276  new_thr->th.th_task_state = 0;
4277  new_thr->th.th_task_state_top = 0;
4278  new_thr->th.th_task_state_stack_sz = 4;
4279 
4280 #ifdef KMP_ADJUST_BLOCKTIME
4281  /* Adjust blocktime back to zero if necessary */
4282  /* Middle initialization might not have occurred yet */
4283  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
4284  if (__kmp_nth > __kmp_avail_proc) {
4285  __kmp_zero_bt = TRUE;
4286  }
4287  }
4288 #endif /* KMP_ADJUST_BLOCKTIME */
4289 
4290 #if KMP_DEBUG
4291  // If thread entered pool via __kmp_free_thread, wait_flag should !=
4292  // KMP_BARRIER_PARENT_FLAG.
4293  int b;
4294  kmp_balign_t *balign = new_thr->th.th_bar;
4295  for (b = 0; b < bs_last_barrier; ++b)
4296  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
4297 #endif
4298 
4299  KF_TRACE(10, ("__kmp_allocate_thread: T#%d using thread %p T#%d\n",
4300  __kmp_get_gtid(), new_thr, new_thr->th.th_info.ds.ds_gtid));
4301 
4302  KMP_MB();
4303  return new_thr;
4304  }
4305 
4306  /* no, well fork a new one */
4307  KMP_ASSERT(__kmp_nth == __kmp_all_nth);
4308  KMP_ASSERT(__kmp_all_nth < __kmp_threads_capacity);
4309 
4310 #if KMP_USE_MONITOR
4311  // If this is the first worker thread the RTL is creating, then also
4312  // launch the monitor thread. We try to do this as early as possible.
4313  if (!TCR_4(__kmp_init_monitor)) {
4314  __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
4315  if (!TCR_4(__kmp_init_monitor)) {
4316  KF_TRACE(10, ("before __kmp_create_monitor\n"));
4317  TCW_4(__kmp_init_monitor, 1);
4318  __kmp_create_monitor(&__kmp_monitor);
4319  KF_TRACE(10, ("after __kmp_create_monitor\n"));
4320 #if KMP_OS_WINDOWS
4321  // AC: wait until monitor has started. This is a fix for CQ232808.
4322  // The reason is that if the library is loaded/unloaded in a loop with
4323  // small (parallel) work in between, then there is high probability that
4324  // monitor thread started after the library shutdown. At shutdown it is
4325  // too late to cope with the problem, because when the master is in
4326  // DllMain (process detach) the monitor has no chances to start (it is
4327  // blocked), and master has no means to inform the monitor that the
4328  // library has gone, because all the memory which the monitor can access
4329  // is going to be released/reset.
4330  while (TCR_4(__kmp_init_monitor) < 2) {
4331  KMP_YIELD(TRUE);
4332  }
4333  KF_TRACE(10, ("after monitor thread has started\n"));
4334 #endif
4335  }
4336  __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
4337  }
4338 #endif
4339 
4340  KMP_MB();
4341 
4342  {
4343  int new_start_gtid = TCR_4(__kmp_init_hidden_helper_threads)
4344  ? 1
4345  : __kmp_hidden_helper_threads_num + 1;
4346 
4347  for (new_gtid = new_start_gtid; TCR_PTR(__kmp_threads[new_gtid]) != NULL;
4348  ++new_gtid) {
4349  KMP_DEBUG_ASSERT(new_gtid < __kmp_threads_capacity);
4350  }
4351 
4352  if (TCR_4(__kmp_init_hidden_helper_threads)) {
4353  KMP_DEBUG_ASSERT(new_gtid <= __kmp_hidden_helper_threads_num);
4354  }
4355  }
4356 
4357  /* allocate space for it. */
4358  new_thr = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t));
4359 
4360  TCW_SYNC_PTR(__kmp_threads[new_gtid], new_thr);
4361 
4362 #if USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG
4363  // suppress race conditions detection on synchronization flags in debug mode
4364  // this helps to analyze library internals eliminating false positives
4365  __itt_suppress_mark_range(
4366  __itt_suppress_range, __itt_suppress_threading_errors,
4367  &new_thr->th.th_sleep_loc, sizeof(new_thr->th.th_sleep_loc));
4368  __itt_suppress_mark_range(
4369  __itt_suppress_range, __itt_suppress_threading_errors,
4370  &new_thr->th.th_reap_state, sizeof(new_thr->th.th_reap_state));
4371 #if KMP_OS_WINDOWS
4372  __itt_suppress_mark_range(
4373  __itt_suppress_range, __itt_suppress_threading_errors,
4374  &new_thr->th.th_suspend_init, sizeof(new_thr->th.th_suspend_init));
4375 #else
4376  __itt_suppress_mark_range(__itt_suppress_range,
4377  __itt_suppress_threading_errors,
4378  &new_thr->th.th_suspend_init_count,
4379  sizeof(new_thr->th.th_suspend_init_count));
4380 #endif
4381  // TODO: check if we need to also suppress b_arrived flags
4382  __itt_suppress_mark_range(__itt_suppress_range,
4383  __itt_suppress_threading_errors,
4384  CCAST(kmp_uint64 *, &new_thr->th.th_bar[0].bb.b_go),
4385  sizeof(new_thr->th.th_bar[0].bb.b_go));
4386  __itt_suppress_mark_range(__itt_suppress_range,
4387  __itt_suppress_threading_errors,
4388  CCAST(kmp_uint64 *, &new_thr->th.th_bar[1].bb.b_go),
4389  sizeof(new_thr->th.th_bar[1].bb.b_go));
4390  __itt_suppress_mark_range(__itt_suppress_range,
4391  __itt_suppress_threading_errors,
4392  CCAST(kmp_uint64 *, &new_thr->th.th_bar[2].bb.b_go),
4393  sizeof(new_thr->th.th_bar[2].bb.b_go));
4394 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG */
4395  if (__kmp_storage_map) {
4396  __kmp_print_thread_storage_map(new_thr, new_gtid);
4397  }
4398 
4399  // add the reserve serialized team, initialized from the team's master thread
4400  {
4401  kmp_internal_control_t r_icvs = __kmp_get_x_global_icvs(team);
4402  KF_TRACE(10, ("__kmp_allocate_thread: before th_serial/serial_team\n"));
4403  new_thr->th.th_serial_team = serial_team =
4404  (kmp_team_t *)__kmp_allocate_team(root, 1, 1,
4405 #if OMPT_SUPPORT
4406  ompt_data_none, // root parallel id
4407 #endif
4408  proc_bind_default, &r_icvs,
4409  0 USE_NESTED_HOT_ARG(NULL));
4410  }
4411  KMP_ASSERT(serial_team);
4412  serial_team->t.t_serialized = 0; // AC: the team created in reserve, not for
4413  // execution (it is unused for now).
4414  serial_team->t.t_threads[0] = new_thr;
4415  KF_TRACE(10,
4416  ("__kmp_allocate_thread: after th_serial/serial_team : new_thr=%p\n",
4417  new_thr));
4418 
4419  /* setup the thread structures */
4420  __kmp_initialize_info(new_thr, team, new_tid, new_gtid);
4421 
4422 #if USE_FAST_MEMORY
4423  __kmp_initialize_fast_memory(new_thr);
4424 #endif /* USE_FAST_MEMORY */
4425 
4426 #if KMP_USE_BGET
4427  KMP_DEBUG_ASSERT(new_thr->th.th_local.bget_data == NULL);
4428  __kmp_initialize_bget(new_thr);
4429 #endif
4430 
4431  __kmp_init_random(new_thr); // Initialize random number generator
4432 
4433  /* Initialize these only once when thread is grabbed for a team allocation */
4434  KA_TRACE(20,
4435  ("__kmp_allocate_thread: T#%d init go fork=%u, plain=%u\n",
4436  __kmp_get_gtid(), KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
4437 
4438  int b;
4439  kmp_balign_t *balign = new_thr->th.th_bar;
4440  for (b = 0; b < bs_last_barrier; ++b) {
4441  balign[b].bb.b_go = KMP_INIT_BARRIER_STATE;
4442  balign[b].bb.team = NULL;
4443  balign[b].bb.wait_flag = KMP_BARRIER_NOT_WAITING;
4444  balign[b].bb.use_oncore_barrier = 0;
4445  }
4446 
4447  new_thr->th.th_spin_here = FALSE;
4448  new_thr->th.th_next_waiting = 0;
4449 #if KMP_OS_UNIX
4450  new_thr->th.th_blocking = false;
4451 #endif
4452 
4453 #if KMP_AFFINITY_SUPPORTED
4454  new_thr->th.th_current_place = KMP_PLACE_UNDEFINED;
4455  new_thr->th.th_new_place = KMP_PLACE_UNDEFINED;
4456  new_thr->th.th_first_place = KMP_PLACE_UNDEFINED;
4457  new_thr->th.th_last_place = KMP_PLACE_UNDEFINED;
4458 #endif
4459  new_thr->th.th_def_allocator = __kmp_def_allocator;
4460  new_thr->th.th_prev_level = 0;
4461  new_thr->th.th_prev_num_threads = 1;
4462 
4463  TCW_4(new_thr->th.th_in_pool, FALSE);
4464  new_thr->th.th_active_in_pool = FALSE;
4465  TCW_4(new_thr->th.th_active, TRUE);
4466 
4467  /* adjust the global counters */
4468  __kmp_all_nth++;
4469  __kmp_nth++;
4470 
4471  // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low
4472  // numbers of procs, and method #2 (keyed API call) for higher numbers.
4473  if (__kmp_adjust_gtid_mode) {
4474  if (__kmp_all_nth >= __kmp_tls_gtid_min) {
4475  if (TCR_4(__kmp_gtid_mode) != 2) {
4476  TCW_4(__kmp_gtid_mode, 2);
4477  }
4478  } else {
4479  if (TCR_4(__kmp_gtid_mode) != 1) {
4480  TCW_4(__kmp_gtid_mode, 1);
4481  }
4482  }
4483  }
4484 
4485 #ifdef KMP_ADJUST_BLOCKTIME
4486  /* Adjust blocktime back to zero if necessary */
4487  /* Middle initialization might not have occurred yet */
4488  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
4489  if (__kmp_nth > __kmp_avail_proc) {
4490  __kmp_zero_bt = TRUE;
4491  }
4492  }
4493 #endif /* KMP_ADJUST_BLOCKTIME */
4494 
4495  /* actually fork it and create the new worker thread */
4496  KF_TRACE(
4497  10, ("__kmp_allocate_thread: before __kmp_create_worker: %p\n", new_thr));
4498  __kmp_create_worker(new_gtid, new_thr, __kmp_stksize);
4499  KF_TRACE(10,
4500  ("__kmp_allocate_thread: after __kmp_create_worker: %p\n", new_thr));
4501 
4502  KA_TRACE(20, ("__kmp_allocate_thread: T#%d forked T#%d\n", __kmp_get_gtid(),
4503  new_gtid));
4504  KMP_MB();
4505  return new_thr;
4506 }
4507 
4508 /* Reinitialize team for reuse.
4509  The hot team code calls this case at every fork barrier, so EPCC barrier
4510  test are extremely sensitive to changes in it, esp. writes to the team
4511  struct, which cause a cache invalidation in all threads.
4512  IF YOU TOUCH THIS ROUTINE, RUN EPCC C SYNCBENCH ON A BIG-IRON MACHINE!!! */
4513 static void __kmp_reinitialize_team(kmp_team_t *team,
4514  kmp_internal_control_t *new_icvs,
4515  ident_t *loc) {
4516  KF_TRACE(10, ("__kmp_reinitialize_team: enter this_thread=%p team=%p\n",
4517  team->t.t_threads[0], team));
4518  KMP_DEBUG_ASSERT(team && new_icvs);
4519  KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc);
4520  KMP_CHECK_UPDATE(team->t.t_ident, loc);
4521 
4522  KMP_CHECK_UPDATE(team->t.t_id, KMP_GEN_TEAM_ID());
4523  // Copy ICVs to the master thread's implicit taskdata
4524  __kmp_init_implicit_task(loc, team->t.t_threads[0], team, 0, FALSE);
4525  copy_icvs(&team->t.t_implicit_task_taskdata[0].td_icvs, new_icvs);
4526 
4527  KF_TRACE(10, ("__kmp_reinitialize_team: exit this_thread=%p team=%p\n",
4528  team->t.t_threads[0], team));
4529 }
4530 
4531 /* Initialize the team data structure.
4532  This assumes the t_threads and t_max_nproc are already set.
4533  Also, we don't touch the arguments */
4534 static void __kmp_initialize_team(kmp_team_t *team, int new_nproc,
4535  kmp_internal_control_t *new_icvs,
4536  ident_t *loc) {
4537  KF_TRACE(10, ("__kmp_initialize_team: enter: team=%p\n", team));
4538 
4539  /* verify */
4540  KMP_DEBUG_ASSERT(team);
4541  KMP_DEBUG_ASSERT(new_nproc <= team->t.t_max_nproc);
4542  KMP_DEBUG_ASSERT(team->t.t_threads);
4543  KMP_MB();
4544 
4545  team->t.t_master_tid = 0; /* not needed */
4546  /* team->t.t_master_bar; not needed */
4547  team->t.t_serialized = new_nproc > 1 ? 0 : 1;
4548  team->t.t_nproc = new_nproc;
4549 
4550  /* team->t.t_parent = NULL; TODO not needed & would mess up hot team */
4551  team->t.t_next_pool = NULL;
4552  /* memset( team->t.t_threads, 0, sizeof(kmp_info_t*)*new_nproc ); would mess
4553  * up hot team */
4554 
4555  TCW_SYNC_PTR(team->t.t_pkfn, NULL); /* not needed */
4556  team->t.t_invoke = NULL; /* not needed */
4557 
4558  // TODO???: team->t.t_max_active_levels = new_max_active_levels;
4559  team->t.t_sched.sched = new_icvs->sched.sched;
4560 
4561 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
4562  team->t.t_fp_control_saved = FALSE; /* not needed */
4563  team->t.t_x87_fpu_control_word = 0; /* not needed */
4564  team->t.t_mxcsr = 0; /* not needed */
4565 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
4566 
4567  team->t.t_construct = 0;
4568 
4569  team->t.t_ordered.dt.t_value = 0;
4570  team->t.t_master_active = FALSE;
4571 
4572 #ifdef KMP_DEBUG
4573  team->t.t_copypriv_data = NULL; /* not necessary, but nice for debugging */
4574 #endif
4575 #if KMP_OS_WINDOWS
4576  team->t.t_copyin_counter = 0; /* for barrier-free copyin implementation */
4577 #endif
4578 
4579  team->t.t_control_stack_top = NULL;
4580 
4581  __kmp_reinitialize_team(team, new_icvs, loc);
4582 
4583  KMP_MB();
4584  KF_TRACE(10, ("__kmp_initialize_team: exit: team=%p\n", team));
4585 }
4586 
4587 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
4588 /* Sets full mask for thread and returns old mask, no changes to structures. */
4589 static void
4590 __kmp_set_thread_affinity_mask_full_tmp(kmp_affin_mask_t *old_mask) {
4591  if (KMP_AFFINITY_CAPABLE()) {
4592  int status;
4593  if (old_mask != NULL) {
4594  status = __kmp_get_system_affinity(old_mask, TRUE);
4595  int error = errno;
4596  if (status != 0) {
4597  __kmp_fatal(KMP_MSG(ChangeThreadAffMaskError), KMP_ERR(error),
4598  __kmp_msg_null);
4599  }
4600  }
4601  __kmp_set_system_affinity(__kmp_affin_fullMask, TRUE);
4602  }
4603 }
4604 #endif
4605 
4606 #if KMP_AFFINITY_SUPPORTED
4607 
4608 // __kmp_partition_places() is the heart of the OpenMP 4.0 affinity mechanism.
4609 // It calculates the worker + master thread's partition based upon the parent
4610 // thread's partition, and binds each worker to a thread in their partition.
4611 // The master thread's partition should already include its current binding.
4612 static void __kmp_partition_places(kmp_team_t *team, int update_master_only) {
4613  // Copy the master thread's place partition to the team struct
4614  kmp_info_t *master_th = team->t.t_threads[0];
4615  KMP_DEBUG_ASSERT(master_th != NULL);
4616  kmp_proc_bind_t proc_bind = team->t.t_proc_bind;
4617  int first_place = master_th->th.th_first_place;
4618  int last_place = master_th->th.th_last_place;
4619  int masters_place = master_th->th.th_current_place;
4620  team->t.t_first_place = first_place;
4621  team->t.t_last_place = last_place;
4622 
4623  KA_TRACE(20, ("__kmp_partition_places: enter: proc_bind = %d T#%d(%d:0) "
4624  "bound to place %d partition = [%d,%d]\n",
4625  proc_bind, __kmp_gtid_from_thread(team->t.t_threads[0]),
4626  team->t.t_id, masters_place, first_place, last_place));
4627 
4628  switch (proc_bind) {
4629 
4630  case proc_bind_default:
4631  // serial teams might have the proc_bind policy set to proc_bind_default. It
4632  // doesn't matter, as we don't rebind master thread for any proc_bind policy
4633  KMP_DEBUG_ASSERT(team->t.t_nproc == 1);
4634  break;
4635 
4636  case proc_bind_master: {
4637  int f;
4638  int n_th = team->t.t_nproc;
4639  for (f = 1; f < n_th; f++) {
4640  kmp_info_t *th = team->t.t_threads[f];
4641  KMP_DEBUG_ASSERT(th != NULL);
4642  th->th.th_first_place = first_place;
4643  th->th.th_last_place = last_place;
4644  th->th.th_new_place = masters_place;
4645  if (__kmp_display_affinity && masters_place != th->th.th_current_place &&
4646  team->t.t_display_affinity != 1) {
4647  team->t.t_display_affinity = 1;
4648  }
4649 
4650  KA_TRACE(100, ("__kmp_partition_places: master: T#%d(%d:%d) place %d "
4651  "partition = [%d,%d]\n",
4652  __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id,
4653  f, masters_place, first_place, last_place));
4654  }
4655  } break;
4656 
4657  case proc_bind_close: {
4658  int f;
4659  int n_th = team->t.t_nproc;
4660  int n_places;
4661  if (first_place <= last_place) {
4662  n_places = last_place - first_place + 1;
4663  } else {
4664  n_places = __kmp_affinity_num_masks - first_place + last_place + 1;
4665  }
4666  if (n_th <= n_places) {
4667  int place = masters_place;
4668  for (f = 1; f < n_th; f++) {
4669  kmp_info_t *th = team->t.t_threads[f];
4670  KMP_DEBUG_ASSERT(th != NULL);
4671 
4672  if (place == last_place) {
4673  place = first_place;
4674  } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4675  place = 0;
4676  } else {
4677  place++;
4678  }
4679  th->th.th_first_place = first_place;
4680  th->th.th_last_place = last_place;
4681  th->th.th_new_place = place;
4682  if (__kmp_display_affinity && place != th->th.th_current_place &&
4683  team->t.t_display_affinity != 1) {
4684  team->t.t_display_affinity = 1;
4685  }
4686 
4687  KA_TRACE(100, ("__kmp_partition_places: close: T#%d(%d:%d) place %d "
4688  "partition = [%d,%d]\n",
4689  __kmp_gtid_from_thread(team->t.t_threads[f]),
4690  team->t.t_id, f, place, first_place, last_place));
4691  }
4692  } else {
4693  int S, rem, gap, s_count;
4694  S = n_th / n_places;
4695  s_count = 0;
4696  rem = n_th - (S * n_places);
4697  gap = rem > 0 ? n_places / rem : n_places;
4698  int place = masters_place;
4699  int gap_ct = gap;
4700  for (f = 0; f < n_th; f++) {
4701  kmp_info_t *th = team->t.t_threads[f];
4702  KMP_DEBUG_ASSERT(th != NULL);
4703 
4704  th->th.th_first_place = first_place;
4705  th->th.th_last_place = last_place;
4706  th->th.th_new_place = place;
4707  if (__kmp_display_affinity && place != th->th.th_current_place &&
4708  team->t.t_display_affinity != 1) {
4709  team->t.t_display_affinity = 1;
4710  }
4711  s_count++;
4712 
4713  if ((s_count == S) && rem && (gap_ct == gap)) {
4714  // do nothing, add an extra thread to place on next iteration
4715  } else if ((s_count == S + 1) && rem && (gap_ct == gap)) {
4716  // we added an extra thread to this place; move to next place
4717  if (place == last_place) {
4718  place = first_place;
4719  } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4720  place = 0;
4721  } else {
4722  place++;
4723  }
4724  s_count = 0;
4725  gap_ct = 1;
4726  rem--;
4727  } else if (s_count == S) { // place full; don't add extra
4728  if (place == last_place) {
4729  place = first_place;
4730  } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4731  place = 0;
4732  } else {
4733  place++;
4734  }
4735  gap_ct++;
4736  s_count = 0;
4737  }
4738 
4739  KA_TRACE(100,
4740  ("__kmp_partition_places: close: T#%d(%d:%d) place %d "
4741  "partition = [%d,%d]\n",
4742  __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id, f,
4743  th->th.th_new_place, first_place, last_place));
4744  }
4745  KMP_DEBUG_ASSERT(place == masters_place);
4746  }
4747  } break;
4748 
4749  case proc_bind_spread: {
4750  int f;
4751  int n_th = team->t.t_nproc;
4752  int n_places;
4753  int thidx;
4754  if (first_place <= last_place) {
4755  n_places = last_place - first_place + 1;
4756  } else {
4757  n_places = __kmp_affinity_num_masks - first_place + last_place + 1;
4758  }
4759  if (n_th <= n_places) {
4760  int place = -1;
4761 
4762  if (n_places != static_cast<int>(__kmp_affinity_num_masks)) {
4763  int S = n_places / n_th;
4764  int s_count, rem, gap, gap_ct;
4765 
4766  place = masters_place;
4767  rem = n_places - n_th * S;
4768  gap = rem ? n_th / rem : 1;
4769  gap_ct = gap;
4770  thidx = n_th;
4771  if (update_master_only == 1)
4772  thidx = 1;
4773  for (f = 0; f < thidx; f++) {
4774  kmp_info_t *th = team->t.t_threads[f];
4775  KMP_DEBUG_ASSERT(th != NULL);
4776 
4777  th->th.th_first_place = place;
4778  th->th.th_new_place = place;
4779  if (__kmp_display_affinity && place != th->th.th_current_place &&
4780  team->t.t_display_affinity != 1) {
4781  team->t.t_display_affinity = 1;
4782  }
4783  s_count = 1;
4784  while (s_count < S) {
4785  if (place == last_place) {
4786  place = first_place;
4787  } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4788  place = 0;
4789  } else {
4790  place++;
4791  }
4792  s_count++;
4793  }
4794  if (rem && (gap_ct == gap)) {
4795  if (place == last_place) {
4796  place = first_place;
4797  } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4798  place = 0;
4799  } else {
4800  place++;
4801  }
4802  rem--;
4803  gap_ct = 0;
4804  }
4805  th->th.th_last_place = place;
4806  gap_ct++;
4807 
4808  if (place == last_place) {
4809  place = first_place;
4810  } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4811  place = 0;
4812  } else {
4813  place++;
4814  }
4815 
4816  KA_TRACE(100,
4817  ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
4818  "partition = [%d,%d], __kmp_affinity_num_masks: %u\n",
4819  __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id,
4820  f, th->th.th_new_place, th->th.th_first_place,
4821  th->th.th_last_place, __kmp_affinity_num_masks));
4822  }
4823  } else {
4824  /* Having uniform space of available computation places I can create
4825  T partitions of round(P/T) size and put threads into the first
4826  place of each partition. */
4827  double current = static_cast<double>(masters_place);
4828  double spacing =
4829  (static_cast<double>(n_places + 1) / static_cast<double>(n_th));
4830  int first, last;
4831  kmp_info_t *th;
4832 
4833  thidx = n_th + 1;
4834  if (update_master_only == 1)
4835  thidx = 1;
4836  for (f = 0; f < thidx; f++) {
4837  first = static_cast<int>(current);
4838  last = static_cast<int>(current + spacing) - 1;
4839  KMP_DEBUG_ASSERT(last >= first);
4840  if (first >= n_places) {
4841  if (masters_place) {
4842  first -= n_places;
4843  last -= n_places;
4844  if (first == (masters_place + 1)) {
4845  KMP_DEBUG_ASSERT(f == n_th);
4846  first--;
4847  }
4848  if (last == masters_place) {
4849  KMP_DEBUG_ASSERT(f == (n_th - 1));
4850  last--;
4851  }
4852  } else {
4853  KMP_DEBUG_ASSERT(f == n_th);
4854  first = 0;
4855  last = 0;
4856  }
4857  }
4858  if (last >= n_places) {
4859  last = (n_places - 1);
4860  }
4861  place = first;
4862  current += spacing;
4863  if (f < n_th) {
4864  KMP_DEBUG_ASSERT(0 <= first);
4865  KMP_DEBUG_ASSERT(n_places > first);
4866  KMP_DEBUG_ASSERT(0 <= last);
4867  KMP_DEBUG_ASSERT(n_places > last);
4868  KMP_DEBUG_ASSERT(last_place >= first_place);
4869  th = team->t.t_threads[f];
4870  KMP_DEBUG_ASSERT(th);
4871  th->th.th_first_place = first;
4872  th->th.th_new_place = place;
4873  th->th.th_last_place = last;
4874  if (__kmp_display_affinity && place != th->th.th_current_place &&
4875  team->t.t_display_affinity != 1) {
4876  team->t.t_display_affinity = 1;
4877  }
4878  KA_TRACE(100,
4879  ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
4880  "partition = [%d,%d], spacing = %.4f\n",
4881  __kmp_gtid_from_thread(team->t.t_threads[f]),
4882  team->t.t_id, f, th->th.th_new_place,
4883  th->th.th_first_place, th->th.th_last_place, spacing));
4884  }
4885  }
4886  }
4887  KMP_DEBUG_ASSERT(update_master_only || place == masters_place);
4888  } else {
4889  int S, rem, gap, s_count;
4890  S = n_th / n_places;
4891  s_count = 0;
4892  rem = n_th - (S * n_places);
4893  gap = rem > 0 ? n_places / rem : n_places;
4894  int place = masters_place;
4895  int gap_ct = gap;
4896  thidx = n_th;
4897  if (update_master_only == 1)
4898  thidx = 1;
4899  for (f = 0; f < thidx; f++) {
4900  kmp_info_t *th = team->t.t_threads[f];
4901  KMP_DEBUG_ASSERT(th != NULL);
4902 
4903  th->th.th_first_place = place;
4904  th->th.th_last_place = place;
4905  th->th.th_new_place = place;
4906  if (__kmp_display_affinity && place != th->th.th_current_place &&
4907  team->t.t_display_affinity != 1) {
4908  team->t.t_display_affinity = 1;
4909  }
4910  s_count++;
4911 
4912  if ((s_count == S) && rem && (gap_ct == gap)) {
4913  // do nothing, add an extra thread to place on next iteration
4914  } else if ((s_count == S + 1) && rem && (gap_ct == gap)) {
4915  // we added an extra thread to this place; move on to next place
4916  if (place == last_place) {
4917  place = first_place;
4918  } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4919  place = 0;
4920  } else {
4921  place++;
4922  }
4923  s_count = 0;
4924  gap_ct = 1;
4925  rem--;
4926  } else if (s_count == S) { // place is full; don't add extra thread
4927  if (place == last_place) {
4928  place = first_place;
4929  } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4930  place = 0;
4931  } else {
4932  place++;
4933  }
4934  gap_ct++;
4935  s_count = 0;
4936  }
4937 
4938  KA_TRACE(100, ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
4939  "partition = [%d,%d]\n",
4940  __kmp_gtid_from_thread(team->t.t_threads[f]),
4941  team->t.t_id, f, th->th.th_new_place,
4942  th->th.th_first_place, th->th.th_last_place));
4943  }
4944  KMP_DEBUG_ASSERT(update_master_only || place == masters_place);
4945  }
4946  } break;
4947 
4948  default:
4949  break;
4950  }
4951 
4952  KA_TRACE(20, ("__kmp_partition_places: exit T#%d\n", team->t.t_id));
4953 }
4954 
4955 #endif // KMP_AFFINITY_SUPPORTED
4956 
4957 /* allocate a new team data structure to use. take one off of the free pool if
4958  available */
4959 kmp_team_t *
4960 __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc,
4961 #if OMPT_SUPPORT
4962  ompt_data_t ompt_parallel_data,
4963 #endif
4964  kmp_proc_bind_t new_proc_bind,
4965  kmp_internal_control_t *new_icvs,
4966  int argc USE_NESTED_HOT_ARG(kmp_info_t *master)) {
4967  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_allocate_team);
4968  int f;
4969  kmp_team_t *team;
4970  int use_hot_team = !root->r.r_active;
4971  int level = 0;
4972 
4973  KA_TRACE(20, ("__kmp_allocate_team: called\n"));
4974  KMP_DEBUG_ASSERT(new_nproc >= 1 && argc >= 0);
4975  KMP_DEBUG_ASSERT(max_nproc >= new_nproc);
4976  KMP_MB();
4977 
4978 #if KMP_NESTED_HOT_TEAMS
4979  kmp_hot_team_ptr_t *hot_teams;
4980  if (master) {
4981  team = master->th.th_team;
4982  level = team->t.t_active_level;
4983  if (master->th.th_teams_microtask) { // in teams construct?
4984  if (master->th.th_teams_size.nteams > 1 &&
4985  ( // #teams > 1
4986  team->t.t_pkfn ==
4987  (microtask_t)__kmp_teams_master || // inner fork of the teams
4988  master->th.th_teams_level <
4989  team->t.t_level)) { // or nested parallel inside the teams
4990  ++level; // not increment if #teams==1, or for outer fork of the teams;
4991  // increment otherwise
4992  }
4993  }
4994  hot_teams = master->th.th_hot_teams;
4995  if (level < __kmp_hot_teams_max_level && hot_teams &&
4996  hot_teams[level].hot_team) {
4997  // hot team has already been allocated for given level
4998  use_hot_team = 1;
4999  } else {
5000  use_hot_team = 0;
5001  }
5002  } else {
5003  // check we won't access uninitialized hot_teams, just in case
5004  KMP_DEBUG_ASSERT(new_nproc == 1);
5005  }
5006 #endif
5007  // Optimization to use a "hot" team
5008  if (use_hot_team && new_nproc > 1) {
5009  KMP_DEBUG_ASSERT(new_nproc <= max_nproc);
5010 #if KMP_NESTED_HOT_TEAMS
5011  team = hot_teams[level].hot_team;
5012 #else
5013  team = root->r.r_hot_team;
5014 #endif
5015 #if KMP_DEBUG
5016  if (__kmp_tasking_mode != tskm_immediate_exec) {
5017  KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p "
5018  "task_team[1] = %p before reinit\n",
5019  team->t.t_task_team[0], team->t.t_task_team[1]));
5020  }
5021 #endif
5022 
5023  // Has the number of threads changed?
5024  /* Let's assume the most common case is that the number of threads is
5025  unchanged, and put that case first. */
5026  if (team->t.t_nproc == new_nproc) { // Check changes in number of threads
5027  KA_TRACE(20, ("__kmp_allocate_team: reusing hot team\n"));
5028  // This case can mean that omp_set_num_threads() was called and the hot
5029  // team size was already reduced, so we check the special flag
5030  if (team->t.t_size_changed == -1) {
5031  team->t.t_size_changed = 1;
5032  } else {
5033  KMP_CHECK_UPDATE(team->t.t_size_changed, 0);
5034  }
5035 
5036  // TODO???: team->t.t_max_active_levels = new_max_active_levels;
5037  kmp_r_sched_t new_sched = new_icvs->sched;
5038  // set master's schedule as new run-time schedule
5039  KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched);
5040 
5041  __kmp_reinitialize_team(team, new_icvs,
5042  root->r.r_uber_thread->th.th_ident);
5043 
5044  KF_TRACE(10, ("__kmp_allocate_team2: T#%d, this_thread=%p team=%p\n", 0,
5045  team->t.t_threads[0], team));
5046  __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0);
5047 
5048 #if KMP_AFFINITY_SUPPORTED
5049  if ((team->t.t_size_changed == 0) &&
5050  (team->t.t_proc_bind == new_proc_bind)) {
5051  if (new_proc_bind == proc_bind_spread) {
5052  __kmp_partition_places(
5053  team, 1); // add flag to update only master for spread
5054  }
5055  KA_TRACE(200, ("__kmp_allocate_team: reusing hot team #%d bindings: "
5056  "proc_bind = %d, partition = [%d,%d]\n",
5057  team->t.t_id, new_proc_bind, team->t.t_first_place,
5058  team->t.t_last_place));
5059  } else {
5060  KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5061  __kmp_partition_places(team);
5062  }
5063 #else
5064  KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5065 #endif /* KMP_AFFINITY_SUPPORTED */
5066  } else if (team->t.t_nproc > new_nproc) {
5067  KA_TRACE(20,
5068  ("__kmp_allocate_team: decreasing hot team thread count to %d\n",
5069  new_nproc));
5070 
5071  team->t.t_size_changed = 1;
5072 #if KMP_NESTED_HOT_TEAMS
5073  if (__kmp_hot_teams_mode == 0) {
5074  // AC: saved number of threads should correspond to team's value in this
5075  // mode, can be bigger in mode 1, when hot team has threads in reserve
5076  KMP_DEBUG_ASSERT(hot_teams[level].hot_team_nth == team->t.t_nproc);
5077  hot_teams[level].hot_team_nth = new_nproc;
5078 #endif // KMP_NESTED_HOT_TEAMS
5079  /* release the extra threads we don't need any more */
5080  for (f = new_nproc; f < team->t.t_nproc; f++) {
5081  KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5082  if (__kmp_tasking_mode != tskm_immediate_exec) {
5083  // When decreasing team size, threads no longer in the team should
5084  // unref task team.
5085  team->t.t_threads[f]->th.th_task_team = NULL;
5086  }
5087  __kmp_free_thread(team->t.t_threads[f]);
5088  team->t.t_threads[f] = NULL;
5089  }
5090 #if KMP_NESTED_HOT_TEAMS
5091  } // (__kmp_hot_teams_mode == 0)
5092  else {
5093  // When keeping extra threads in team, switch threads to wait on own
5094  // b_go flag
5095  for (f = new_nproc; f < team->t.t_nproc; ++f) {
5096  KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5097  kmp_balign_t *balign = team->t.t_threads[f]->th.th_bar;
5098  for (int b = 0; b < bs_last_barrier; ++b) {
5099  if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG) {
5100  balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
5101  }
5102  KMP_CHECK_UPDATE(balign[b].bb.leaf_kids, 0);
5103  }
5104  }
5105  }
5106 #endif // KMP_NESTED_HOT_TEAMS
5107  team->t.t_nproc = new_nproc;
5108  // TODO???: team->t.t_max_active_levels = new_max_active_levels;
5109  KMP_CHECK_UPDATE(team->t.t_sched.sched, new_icvs->sched.sched);
5110  __kmp_reinitialize_team(team, new_icvs,
5111  root->r.r_uber_thread->th.th_ident);
5112 
5113  // Update remaining threads
5114  for (f = 0; f < new_nproc; ++f) {
5115  team->t.t_threads[f]->th.th_team_nproc = new_nproc;
5116  }
5117 
5118  // restore the current task state of the master thread: should be the
5119  // implicit task
5120  KF_TRACE(10, ("__kmp_allocate_team: T#%d, this_thread=%p team=%p\n", 0,
5121  team->t.t_threads[0], team));
5122 
5123  __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0);
5124 
5125 #ifdef KMP_DEBUG
5126  for (f = 0; f < team->t.t_nproc; f++) {
5127  KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
5128  team->t.t_threads[f]->th.th_team_nproc ==
5129  team->t.t_nproc);
5130  }
5131 #endif
5132 
5133  KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5134 #if KMP_AFFINITY_SUPPORTED
5135  __kmp_partition_places(team);
5136 #endif
5137  } else { // team->t.t_nproc < new_nproc
5138 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
5139  kmp_affin_mask_t *old_mask;
5140  if (KMP_AFFINITY_CAPABLE()) {
5141  KMP_CPU_ALLOC(old_mask);
5142  }
5143 #endif
5144 
5145  KA_TRACE(20,
5146  ("__kmp_allocate_team: increasing hot team thread count to %d\n",
5147  new_nproc));
5148 
5149  team->t.t_size_changed = 1;
5150 
5151 #if KMP_NESTED_HOT_TEAMS
5152  int avail_threads = hot_teams[level].hot_team_nth;
5153  if (new_nproc < avail_threads)
5154  avail_threads = new_nproc;
5155  kmp_info_t **other_threads = team->t.t_threads;
5156  for (f = team->t.t_nproc; f < avail_threads; ++f) {
5157  // Adjust barrier data of reserved threads (if any) of the team
5158  // Other data will be set in __kmp_initialize_info() below.
5159  int b;
5160  kmp_balign_t *balign = other_threads[f]->th.th_bar;
5161  for (b = 0; b < bs_last_barrier; ++b) {
5162  balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5163  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
5164 #if USE_DEBUGGER
5165  balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5166 #endif
5167  }
5168  }
5169  if (hot_teams[level].hot_team_nth >= new_nproc) {
5170  // we have all needed threads in reserve, no need to allocate any
5171  // this only possible in mode 1, cannot have reserved threads in mode 0
5172  KMP_DEBUG_ASSERT(__kmp_hot_teams_mode == 1);
5173  team->t.t_nproc = new_nproc; // just get reserved threads involved
5174  } else {
5175  // we may have some threads in reserve, but not enough
5176  team->t.t_nproc =
5177  hot_teams[level]
5178  .hot_team_nth; // get reserved threads involved if any
5179  hot_teams[level].hot_team_nth = new_nproc; // adjust hot team max size
5180 #endif // KMP_NESTED_HOT_TEAMS
5181  if (team->t.t_max_nproc < new_nproc) {
5182  /* reallocate larger arrays */
5183  __kmp_reallocate_team_arrays(team, new_nproc);
5184  __kmp_reinitialize_team(team, new_icvs, NULL);
5185  }
5186 
5187 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
5188  /* Temporarily set full mask for master thread before creation of
5189  workers. The reason is that workers inherit the affinity from master,
5190  so if a lot of workers are created on the single core quickly, they
5191  don't get a chance to set their own affinity for a long time. */
5192  __kmp_set_thread_affinity_mask_full_tmp(old_mask);
5193 #endif
5194 
5195  /* allocate new threads for the hot team */
5196  for (f = team->t.t_nproc; f < new_nproc; f++) {
5197  kmp_info_t *new_worker = __kmp_allocate_thread(root, team, f);
5198  KMP_DEBUG_ASSERT(new_worker);
5199  team->t.t_threads[f] = new_worker;
5200 
5201  KA_TRACE(20,
5202  ("__kmp_allocate_team: team %d init T#%d arrived: "
5203  "join=%llu, plain=%llu\n",
5204  team->t.t_id, __kmp_gtid_from_tid(f, team), team->t.t_id, f,
5205  team->t.t_bar[bs_forkjoin_barrier].b_arrived,
5206  team->t.t_bar[bs_plain_barrier].b_arrived));
5207 
5208  { // Initialize barrier data for new threads.
5209  int b;
5210  kmp_balign_t *balign = new_worker->th.th_bar;
5211  for (b = 0; b < bs_last_barrier; ++b) {
5212  balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5213  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag !=
5214  KMP_BARRIER_PARENT_FLAG);
5215 #if USE_DEBUGGER
5216  balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5217 #endif
5218  }
5219  }
5220  }
5221 
5222 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
5223  if (KMP_AFFINITY_CAPABLE()) {
5224  /* Restore initial master thread's affinity mask */
5225  __kmp_set_system_affinity(old_mask, TRUE);
5226  KMP_CPU_FREE(old_mask);
5227  }
5228 #endif
5229 #if KMP_NESTED_HOT_TEAMS
5230  } // end of check of t_nproc vs. new_nproc vs. hot_team_nth
5231 #endif // KMP_NESTED_HOT_TEAMS
5232  /* make sure everyone is syncronized */
5233  int old_nproc = team->t.t_nproc; // save old value and use to update only
5234  // new threads below
5235  __kmp_initialize_team(team, new_nproc, new_icvs,
5236  root->r.r_uber_thread->th.th_ident);
5237 
5238  /* reinitialize the threads */
5239  KMP_DEBUG_ASSERT(team->t.t_nproc == new_nproc);
5240  for (f = 0; f < team->t.t_nproc; ++f)
5241  __kmp_initialize_info(team->t.t_threads[f], team, f,
5242  __kmp_gtid_from_tid(f, team));
5243 
5244  if (level) { // set th_task_state for new threads in nested hot team
5245  // __kmp_initialize_info() no longer zeroes th_task_state, so we should
5246  // only need to set the th_task_state for the new threads. th_task_state
5247  // for master thread will not be accurate until after this in
5248  // __kmp_fork_call(), so we look to the master's memo_stack to get the
5249  // correct value.
5250  for (f = old_nproc; f < team->t.t_nproc; ++f)
5251  team->t.t_threads[f]->th.th_task_state =
5252  team->t.t_threads[0]->th.th_task_state_memo_stack[level];
5253  } else { // set th_task_state for new threads in non-nested hot team
5254  kmp_uint8 old_state =
5255  team->t.t_threads[0]->th.th_task_state; // copy master's state
5256  for (f = old_nproc; f < team->t.t_nproc; ++f)
5257  team->t.t_threads[f]->th.th_task_state = old_state;
5258  }
5259 
5260 #ifdef KMP_DEBUG
5261  for (f = 0; f < team->t.t_nproc; ++f) {
5262  KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
5263  team->t.t_threads[f]->th.th_team_nproc ==
5264  team->t.t_nproc);
5265  }
5266 #endif
5267 
5268  KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5269 #if KMP_AFFINITY_SUPPORTED
5270  __kmp_partition_places(team);
5271 #endif
5272  } // Check changes in number of threads
5273 
5274  kmp_info_t *master = team->t.t_threads[0];
5275  if (master->th.th_teams_microtask) {
5276  for (f = 1; f < new_nproc; ++f) {
5277  // propagate teams construct specific info to workers
5278  kmp_info_t *thr = team->t.t_threads[f];
5279  thr->th.th_teams_microtask = master->th.th_teams_microtask;
5280  thr->th.th_teams_level = master->th.th_teams_level;
5281  thr->th.th_teams_size = master->th.th_teams_size;
5282  }
5283  }
5284 #if KMP_NESTED_HOT_TEAMS
5285  if (level) {
5286  // Sync barrier state for nested hot teams, not needed for outermost hot
5287  // team.
5288  for (f = 1; f < new_nproc; ++f) {
5289  kmp_info_t *thr = team->t.t_threads[f];
5290  int b;
5291  kmp_balign_t *balign = thr->th.th_bar;
5292  for (b = 0; b < bs_last_barrier; ++b) {
5293  balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5294  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
5295 #if USE_DEBUGGER
5296  balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5297 #endif
5298  }
5299  }
5300  }
5301 #endif // KMP_NESTED_HOT_TEAMS
5302 
5303  /* reallocate space for arguments if necessary */
5304  __kmp_alloc_argv_entries(argc, team, TRUE);
5305  KMP_CHECK_UPDATE(team->t.t_argc, argc);
5306  // The hot team re-uses the previous task team,
5307  // if untouched during the previous release->gather phase.
5308 
5309  KF_TRACE(10, (" hot_team = %p\n", team));
5310 
5311 #if KMP_DEBUG
5312  if (__kmp_tasking_mode != tskm_immediate_exec) {
5313  KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p "
5314  "task_team[1] = %p after reinit\n",
5315  team->t.t_task_team[0], team->t.t_task_team[1]));
5316  }
5317 #endif
5318 
5319 #if OMPT_SUPPORT
5320  __ompt_team_assign_id(team, ompt_parallel_data);
5321 #endif
5322 
5323  KMP_MB();
5324 
5325  return team;
5326  }
5327 
5328  /* next, let's try to take one from the team pool */
5329  KMP_MB();
5330  for (team = CCAST(kmp_team_t *, __kmp_team_pool); (team);) {
5331  /* TODO: consider resizing undersized teams instead of reaping them, now
5332  that we have a resizing mechanism */
5333  if (team->t.t_max_nproc >= max_nproc) {
5334  /* take this team from the team pool */
5335  __kmp_team_pool = team->t.t_next_pool;
5336 
5337  /* setup the team for fresh use */
5338  __kmp_initialize_team(team, new_nproc, new_icvs, NULL);
5339 
5340  KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and "
5341  "task_team[1] %p to NULL\n",
5342  &team->t.t_task_team[0], &team->t.t_task_team[1]));
5343  team->t.t_task_team[0] = NULL;
5344  team->t.t_task_team[1] = NULL;
5345 
5346  /* reallocate space for arguments if necessary */
5347  __kmp_alloc_argv_entries(argc, team, TRUE);
5348  KMP_CHECK_UPDATE(team->t.t_argc, argc);
5349 
5350  KA_TRACE(
5351  20, ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
5352  team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
5353  { // Initialize barrier data.
5354  int b;
5355  for (b = 0; b < bs_last_barrier; ++b) {
5356  team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE;
5357 #if USE_DEBUGGER
5358  team->t.t_bar[b].b_master_arrived = 0;
5359  team->t.t_bar[b].b_team_arrived = 0;
5360 #endif
5361  }
5362  }
5363 
5364  team->t.t_proc_bind = new_proc_bind;
5365 
5366  KA_TRACE(20, ("__kmp_allocate_team: using team from pool %d.\n",
5367  team->t.t_id));
5368 
5369 #if OMPT_SUPPORT
5370  __ompt_team_assign_id(team, ompt_parallel_data);
5371 #endif
5372 
5373  KMP_MB();
5374 
5375  return team;
5376  }
5377 
5378  /* reap team if it is too small, then loop back and check the next one */
5379  // not sure if this is wise, but, will be redone during the hot-teams
5380  // rewrite.
5381  /* TODO: Use technique to find the right size hot-team, don't reap them */
5382  team = __kmp_reap_team(team);
5383  __kmp_team_pool = team;
5384  }
5385 
5386  /* nothing available in the pool, no matter, make a new team! */
5387  KMP_MB();
5388  team = (kmp_team_t *)__kmp_allocate(sizeof(kmp_team_t));
5389 
5390  /* and set it up */
5391  team->t.t_max_nproc = max_nproc;
5392  /* NOTE well, for some reason allocating one big buffer and dividing it up
5393  seems to really hurt performance a lot on the P4, so, let's not use this */
5394  __kmp_allocate_team_arrays(team, max_nproc);
5395 
5396  KA_TRACE(20, ("__kmp_allocate_team: making a new team\n"));
5397  __kmp_initialize_team(team, new_nproc, new_icvs, NULL);
5398 
5399  KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and task_team[1] "
5400  "%p to NULL\n",
5401  &team->t.t_task_team[0], &team->t.t_task_team[1]));
5402  team->t.t_task_team[0] = NULL; // to be removed, as __kmp_allocate zeroes
5403  // memory, no need to duplicate
5404  team->t.t_task_team[1] = NULL; // to be removed, as __kmp_allocate zeroes
5405  // memory, no need to duplicate
5406 
5407  if (__kmp_storage_map) {
5408  __kmp_print_team_storage_map("team", team, team->t.t_id, new_nproc);
5409  }
5410 
5411  /* allocate space for arguments */
5412  __kmp_alloc_argv_entries(argc, team, FALSE);
5413  team->t.t_argc = argc;
5414 
5415  KA_TRACE(20,
5416  ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
5417  team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
5418  { // Initialize barrier data.
5419  int b;
5420  for (b = 0; b < bs_last_barrier; ++b) {
5421  team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE;
5422 #if USE_DEBUGGER
5423  team->t.t_bar[b].b_master_arrived = 0;
5424  team->t.t_bar[b].b_team_arrived = 0;
5425 #endif
5426  }
5427  }
5428 
5429  team->t.t_proc_bind = new_proc_bind;
5430 
5431 #if OMPT_SUPPORT
5432  __ompt_team_assign_id(team, ompt_parallel_data);
5433  team->t.ompt_serialized_team_info = NULL;
5434 #endif
5435 
5436  KMP_MB();
5437 
5438  KA_TRACE(20, ("__kmp_allocate_team: done creating a new team %d.\n",
5439  team->t.t_id));
5440 
5441  return team;
5442 }
5443 
5444 /* TODO implement hot-teams at all levels */
5445 /* TODO implement lazy thread release on demand (disband request) */
5446 
5447 /* free the team. return it to the team pool. release all the threads
5448  * associated with it */
5449 void __kmp_free_team(kmp_root_t *root,
5450  kmp_team_t *team USE_NESTED_HOT_ARG(kmp_info_t *master)) {
5451  int f;
5452  KA_TRACE(20, ("__kmp_free_team: T#%d freeing team %d\n", __kmp_get_gtid(),
5453  team->t.t_id));
5454 
5455  /* verify state */
5456  KMP_DEBUG_ASSERT(root);
5457  KMP_DEBUG_ASSERT(team);
5458  KMP_DEBUG_ASSERT(team->t.t_nproc <= team->t.t_max_nproc);
5459  KMP_DEBUG_ASSERT(team->t.t_threads);
5460 
5461  int use_hot_team = team == root->r.r_hot_team;
5462 #if KMP_NESTED_HOT_TEAMS
5463  int level;
5464  kmp_hot_team_ptr_t *hot_teams;
5465  if (master) {
5466  level = team->t.t_active_level - 1;
5467  if (master->th.th_teams_microtask) { // in teams construct?
5468  if (master->th.th_teams_size.nteams > 1) {
5469  ++level; // level was not increased in teams construct for
5470  // team_of_masters
5471  }
5472  if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
5473  master->th.th_teams_level == team->t.t_level) {
5474  ++level; // level was not increased in teams construct for
5475  // team_of_workers before the parallel
5476  } // team->t.t_level will be increased inside parallel
5477  }
5478  hot_teams = master->th.th_hot_teams;
5479  if (level < __kmp_hot_teams_max_level) {
5480  KMP_DEBUG_ASSERT(team == hot_teams[level].hot_team);
5481  use_hot_team = 1;
5482  }
5483  }
5484 #endif // KMP_NESTED_HOT_TEAMS
5485 
5486  /* team is done working */
5487  TCW_SYNC_PTR(team->t.t_pkfn,
5488  NULL); // Important for Debugging Support Library.
5489 #if KMP_OS_WINDOWS
5490  team->t.t_copyin_counter = 0; // init counter for possible reuse
5491 #endif
5492  // Do not reset pointer to parent team to NULL for hot teams.
5493 
5494  /* if we are non-hot team, release our threads */
5495  if (!use_hot_team) {
5496  if (__kmp_tasking_mode != tskm_immediate_exec) {
5497  // Wait for threads to reach reapable state
5498  for (f = 1; f < team->t.t_nproc; ++f) {
5499  KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5500  kmp_info_t *th = team->t.t_threads[f];
5501  volatile kmp_uint32 *state = &th->th.th_reap_state;
5502  while (*state != KMP_SAFE_TO_REAP) {
5503 #if KMP_OS_WINDOWS
5504  // On Windows a thread can be killed at any time, check this
5505  DWORD ecode;
5506  if (!__kmp_is_thread_alive(th, &ecode)) {
5507  *state = KMP_SAFE_TO_REAP; // reset the flag for dead thread
5508  break;
5509  }
5510 #endif
5511  // first check if thread is sleeping
5512  kmp_flag_64<> fl(&th->th.th_bar[bs_forkjoin_barrier].bb.b_go, th);
5513  if (fl.is_sleeping())
5514  fl.resume(__kmp_gtid_from_thread(th));
5515  KMP_CPU_PAUSE();
5516  }
5517  }
5518 
5519  // Delete task teams
5520  int tt_idx;
5521  for (tt_idx = 0; tt_idx < 2; ++tt_idx) {
5522  kmp_task_team_t *task_team = team->t.t_task_team[tt_idx];
5523  if (task_team != NULL) {
5524  for (f = 0; f < team->t.t_nproc; ++f) { // threads unref task teams
5525  KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5526  team->t.t_threads[f]->th.th_task_team = NULL;
5527  }
5528  KA_TRACE(
5529  20,
5530  ("__kmp_free_team: T#%d deactivating task_team %p on team %d\n",
5531  __kmp_get_gtid(), task_team, team->t.t_id));
5532 #if KMP_NESTED_HOT_TEAMS
5533  __kmp_free_task_team(master, task_team);
5534 #endif
5535  team->t.t_task_team[tt_idx] = NULL;
5536  }
5537  }
5538  }
5539 
5540  // Reset pointer to parent team only for non-hot teams.
5541  team->t.t_parent = NULL;
5542  team->t.t_level = 0;
5543  team->t.t_active_level = 0;
5544 
5545  /* free the worker threads */
5546  for (f = 1; f < team->t.t_nproc; ++f) {
5547  KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5548  __kmp_free_thread(team->t.t_threads[f]);
5549  team->t.t_threads[f] = NULL;
5550  }
5551 
5552  /* put the team back in the team pool */
5553  /* TODO limit size of team pool, call reap_team if pool too large */
5554  team->t.t_next_pool = CCAST(kmp_team_t *, __kmp_team_pool);
5555  __kmp_team_pool = (volatile kmp_team_t *)team;
5556  } else { // Check if team was created for the masters in a teams construct
5557  // See if first worker is a CG root
5558  KMP_DEBUG_ASSERT(team->t.t_threads[1] &&
5559  team->t.t_threads[1]->th.th_cg_roots);
5560  if (team->t.t_threads[1]->th.th_cg_roots->cg_root == team->t.t_threads[1]) {
5561  // Clean up the CG root nodes on workers so that this team can be re-used
5562  for (f = 1; f < team->t.t_nproc; ++f) {
5563  kmp_info_t *thr = team->t.t_threads[f];
5564  KMP_DEBUG_ASSERT(thr && thr->th.th_cg_roots &&
5565  thr->th.th_cg_roots->cg_root == thr);
5566  // Pop current CG root off list
5567  kmp_cg_root_t *tmp = thr->th.th_cg_roots;
5568  thr->th.th_cg_roots = tmp->up;
5569  KA_TRACE(100, ("__kmp_free_team: Thread %p popping node %p and moving"
5570  " up to node %p. cg_nthreads was %d\n",
5571  thr, tmp, thr->th.th_cg_roots, tmp->cg_nthreads));
5572  int i = tmp->cg_nthreads--;
5573  if (i == 1) {
5574  __kmp_free(tmp); // free CG if we are the last thread in it
5575  }
5576  // Restore current task's thread_limit from CG root
5577  if (thr->th.th_cg_roots)
5578  thr->th.th_current_task->td_icvs.thread_limit =
5579  thr->th.th_cg_roots->cg_thread_limit;
5580  }
5581  }
5582  }
5583 
5584  KMP_MB();
5585 }
5586 
5587 /* reap the team. destroy it, reclaim all its resources and free its memory */
5588 kmp_team_t *__kmp_reap_team(kmp_team_t *team) {
5589  kmp_team_t *next_pool = team->t.t_next_pool;
5590 
5591  KMP_DEBUG_ASSERT(team);
5592  KMP_DEBUG_ASSERT(team->t.t_dispatch);
5593  KMP_DEBUG_ASSERT(team->t.t_disp_buffer);
5594  KMP_DEBUG_ASSERT(team->t.t_threads);
5595  KMP_DEBUG_ASSERT(team->t.t_argv);
5596 
5597  /* TODO clean the threads that are a part of this? */
5598 
5599  /* free stuff */
5600  __kmp_free_team_arrays(team);
5601  if (team->t.t_argv != &team->t.t_inline_argv[0])
5602  __kmp_free((void *)team->t.t_argv);
5603  __kmp_free(team);
5604 
5605  KMP_MB();
5606  return next_pool;
5607 }
5608 
5609 // Free the thread. Don't reap it, just place it on the pool of available
5610 // threads.
5611 //
5612 // Changes for Quad issue 527845: We need a predictable OMP tid <-> gtid
5613 // binding for the affinity mechanism to be useful.
5614 //
5615 // Now, we always keep the free list (__kmp_thread_pool) sorted by gtid.
5616 // However, we want to avoid a potential performance problem by always
5617 // scanning through the list to find the correct point at which to insert
5618 // the thread (potential N**2 behavior). To do this we keep track of the
5619 // last place a thread struct was inserted (__kmp_thread_pool_insert_pt).
5620 // With single-level parallelism, threads will always be added to the tail
5621 // of the list, kept track of by __kmp_thread_pool_insert_pt. With nested
5622 // parallelism, all bets are off and we may need to scan through the entire
5623 // free list.
5624 //
5625 // This change also has a potentially large performance benefit, for some
5626 // applications. Previously, as threads were freed from the hot team, they
5627 // would be placed back on the free list in inverse order. If the hot team
5628 // grew back to it's original size, then the freed thread would be placed
5629 // back on the hot team in reverse order. This could cause bad cache
5630 // locality problems on programs where the size of the hot team regularly
5631 // grew and shrunk.
5632 //
5633 // Now, for single-level parallelism, the OMP tid is always == gtid.
5634 void __kmp_free_thread(kmp_info_t *this_th) {
5635  int gtid;
5636  kmp_info_t **scan;
5637 
5638  KA_TRACE(20, ("__kmp_free_thread: T#%d putting T#%d back on free pool.\n",
5639  __kmp_get_gtid(), this_th->th.th_info.ds.ds_gtid));
5640 
5641  KMP_DEBUG_ASSERT(this_th);
5642 
5643  // When moving thread to pool, switch thread to wait on own b_go flag, and
5644  // uninitialized (NULL team).
5645  int b;
5646  kmp_balign_t *balign = this_th->th.th_bar;
5647  for (b = 0; b < bs_last_barrier; ++b) {
5648  if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG)
5649  balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
5650  balign[b].bb.team = NULL;
5651  balign[b].bb.leaf_kids = 0;
5652  }
5653  this_th->th.th_task_state = 0;
5654  this_th->th.th_reap_state = KMP_SAFE_TO_REAP;
5655 
5656  /* put thread back on the free pool */
5657  TCW_PTR(this_th->th.th_team, NULL);
5658  TCW_PTR(this_th->th.th_root, NULL);
5659  TCW_PTR(this_th->th.th_dispatch, NULL); /* NOT NEEDED */
5660 
5661  while (this_th->th.th_cg_roots) {
5662  this_th->th.th_cg_roots->cg_nthreads--;
5663  KA_TRACE(100, ("__kmp_free_thread: Thread %p decrement cg_nthreads on node"
5664  " %p of thread %p to %d\n",
5665  this_th, this_th->th.th_cg_roots,
5666  this_th->th.th_cg_roots->cg_root,
5667  this_th->th.th_cg_roots->cg_nthreads));
5668  kmp_cg_root_t *tmp = this_th->th.th_cg_roots;
5669  if (tmp->cg_root == this_th) { // Thread is a cg_root
5670  KMP_DEBUG_ASSERT(tmp->cg_nthreads == 0);
5671  KA_TRACE(
5672  5, ("__kmp_free_thread: Thread %p freeing node %p\n", this_th, tmp));
5673  this_th->th.th_cg_roots = tmp->up;
5674  __kmp_free(tmp);
5675  } else { // Worker thread
5676  if (tmp->cg_nthreads == 0) { // last thread leaves contention group
5677  __kmp_free(tmp);
5678  }
5679  this_th->th.th_cg_roots = NULL;
5680  break;
5681  }
5682  }
5683 
5684  /* If the implicit task assigned to this thread can be used by other threads
5685  * -> multiple threads can share the data and try to free the task at
5686  * __kmp_reap_thread at exit. This duplicate use of the task data can happen
5687  * with higher probability when hot team is disabled but can occurs even when
5688  * the hot team is enabled */
5689  __kmp_free_implicit_task(this_th);
5690  this_th->th.th_current_task = NULL;
5691 
5692  // If the __kmp_thread_pool_insert_pt is already past the new insert
5693  // point, then we need to re-scan the entire list.
5694  gtid = this_th->th.th_info.ds.ds_gtid;
5695  if (__kmp_thread_pool_insert_pt != NULL) {
5696  KMP_DEBUG_ASSERT(__kmp_thread_pool != NULL);
5697  if (__kmp_thread_pool_insert_pt->th.th_info.ds.ds_gtid > gtid) {
5698  __kmp_thread_pool_insert_pt = NULL;
5699  }
5700  }
5701 
5702  // Scan down the list to find the place to insert the thread.
5703  // scan is the address of a link in the list, possibly the address of
5704  // __kmp_thread_pool itself.
5705  //
5706  // In the absence of nested parallelism, the for loop will have 0 iterations.
5707  if (__kmp_thread_pool_insert_pt != NULL) {
5708  scan = &(__kmp_thread_pool_insert_pt->th.th_next_pool);
5709  } else {
5710  scan = CCAST(kmp_info_t **, &__kmp_thread_pool);
5711  }
5712  for (; (*scan != NULL) && ((*scan)->th.th_info.ds.ds_gtid < gtid);
5713  scan = &((*scan)->th.th_next_pool))
5714  ;
5715 
5716  // Insert the new element on the list, and set __kmp_thread_pool_insert_pt
5717  // to its address.
5718  TCW_PTR(this_th->th.th_next_pool, *scan);
5719  __kmp_thread_pool_insert_pt = *scan = this_th;
5720  KMP_DEBUG_ASSERT((this_th->th.th_next_pool == NULL) ||
5721  (this_th->th.th_info.ds.ds_gtid <
5722  this_th->th.th_next_pool->th.th_info.ds.ds_gtid));
5723  TCW_4(this_th->th.th_in_pool, TRUE);
5724  __kmp_suspend_initialize_thread(this_th);
5725  __kmp_lock_suspend_mx(this_th);
5726  if (this_th->th.th_active == TRUE) {
5727  KMP_ATOMIC_INC(&__kmp_thread_pool_active_nth);
5728  this_th->th.th_active_in_pool = TRUE;
5729  }
5730 #if KMP_DEBUG
5731  else {
5732  KMP_DEBUG_ASSERT(this_th->th.th_active_in_pool == FALSE);
5733  }
5734 #endif
5735  __kmp_unlock_suspend_mx(this_th);
5736 
5737  TCW_4(__kmp_nth, __kmp_nth - 1);
5738 
5739 #ifdef KMP_ADJUST_BLOCKTIME
5740  /* Adjust blocktime back to user setting or default if necessary */
5741  /* Middle initialization might never have occurred */
5742  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
5743  KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
5744  if (__kmp_nth <= __kmp_avail_proc) {
5745  __kmp_zero_bt = FALSE;
5746  }
5747  }
5748 #endif /* KMP_ADJUST_BLOCKTIME */
5749 
5750  KMP_MB();
5751 }
5752 
5753 /* ------------------------------------------------------------------------ */
5754 
5755 void *__kmp_launch_thread(kmp_info_t *this_thr) {
5756 #if OMP_PROFILING_SUPPORT
5757  ProfileTraceFile = getenv("LIBOMPTARGET_PROFILE");
5758  // TODO: add a configuration option for time granularity
5759  if (ProfileTraceFile)
5760  llvm::timeTraceProfilerInitialize(500 /* us */, "libomptarget");
5761 #endif
5762 
5763  int gtid = this_thr->th.th_info.ds.ds_gtid;
5764  /* void *stack_data;*/
5765  kmp_team_t **volatile pteam;
5766 
5767  KMP_MB();
5768  KA_TRACE(10, ("__kmp_launch_thread: T#%d start\n", gtid));
5769 
5770  if (__kmp_env_consistency_check) {
5771  this_thr->th.th_cons = __kmp_allocate_cons_stack(gtid); // ATT: Memory leak?
5772  }
5773 
5774 #if OMPT_SUPPORT
5775  ompt_data_t *thread_data;
5776  if (ompt_enabled.enabled) {
5777  thread_data = &(this_thr->th.ompt_thread_info.thread_data);
5778  *thread_data = ompt_data_none;
5779 
5780  this_thr->th.ompt_thread_info.state = ompt_state_overhead;
5781  this_thr->th.ompt_thread_info.wait_id = 0;
5782  this_thr->th.ompt_thread_info.idle_frame = OMPT_GET_FRAME_ADDRESS(0);
5783  this_thr->th.ompt_thread_info.parallel_flags = 0;
5784  if (ompt_enabled.ompt_callback_thread_begin) {
5785  ompt_callbacks.ompt_callback(ompt_callback_thread_begin)(
5786  ompt_thread_worker, thread_data);
5787  }
5788  this_thr->th.ompt_thread_info.state = ompt_state_idle;
5789  }
5790 #endif
5791 
5792  /* This is the place where threads wait for work */
5793  while (!TCR_4(__kmp_global.g.g_done)) {
5794  KMP_DEBUG_ASSERT(this_thr == __kmp_threads[gtid]);
5795  KMP_MB();
5796 
5797  /* wait for work to do */
5798  KA_TRACE(20, ("__kmp_launch_thread: T#%d waiting for work\n", gtid));
5799 
5800  /* No tid yet since not part of a team */
5801  __kmp_fork_barrier(gtid, KMP_GTID_DNE);
5802 
5803 #if OMPT_SUPPORT
5804  if (ompt_enabled.enabled) {
5805  this_thr->th.ompt_thread_info.state = ompt_state_overhead;
5806  }
5807 #endif
5808 
5809  pteam = &this_thr->th.th_team;
5810 
5811  /* have we been allocated? */
5812  if (TCR_SYNC_PTR(*pteam) && !TCR_4(__kmp_global.g.g_done)) {
5813  /* we were just woken up, so run our new task */
5814  if (TCR_SYNC_PTR((*pteam)->t.t_pkfn) != NULL) {
5815  int rc;
5816  KA_TRACE(20,
5817  ("__kmp_launch_thread: T#%d(%d:%d) invoke microtask = %p\n",
5818  gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid),
5819  (*pteam)->t.t_pkfn));
5820 
5821  updateHWFPControl(*pteam);
5822 
5823 #if OMPT_SUPPORT
5824  if (ompt_enabled.enabled) {
5825  this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
5826  }
5827 #endif
5828 
5829  rc = (*pteam)->t.t_invoke(gtid);
5830  KMP_ASSERT(rc);
5831 
5832  KMP_MB();
5833  KA_TRACE(20, ("__kmp_launch_thread: T#%d(%d:%d) done microtask = %p\n",
5834  gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid),
5835  (*pteam)->t.t_pkfn));
5836  }
5837 #if OMPT_SUPPORT
5838  if (ompt_enabled.enabled) {
5839  /* no frame set while outside task */
5840  __ompt_get_task_info_object(0)->frame.exit_frame = ompt_data_none;
5841 
5842  this_thr->th.ompt_thread_info.state = ompt_state_overhead;
5843  }
5844 #endif
5845  /* join barrier after parallel region */
5846  __kmp_join_barrier(gtid);
5847  }
5848  }
5849  TCR_SYNC_PTR((intptr_t)__kmp_global.g.g_done);
5850 
5851 #if OMPT_SUPPORT
5852  if (ompt_enabled.ompt_callback_thread_end) {
5853  ompt_callbacks.ompt_callback(ompt_callback_thread_end)(thread_data);
5854  }
5855 #endif
5856 
5857  this_thr->th.th_task_team = NULL;
5858  /* run the destructors for the threadprivate data for this thread */
5859  __kmp_common_destroy_gtid(gtid);
5860 
5861  KA_TRACE(10, ("__kmp_launch_thread: T#%d done\n", gtid));
5862  KMP_MB();
5863 
5864 #if OMP_PROFILING_SUPPORT
5865  llvm::timeTraceProfilerFinishThread();
5866 #endif
5867  return this_thr;
5868 }
5869 
5870 /* ------------------------------------------------------------------------ */
5871 
5872 void __kmp_internal_end_dest(void *specific_gtid) {
5873  // Make sure no significant bits are lost
5874  int gtid;
5875  __kmp_type_convert((kmp_intptr_t)specific_gtid - 1, &gtid);
5876 
5877  KA_TRACE(30, ("__kmp_internal_end_dest: T#%d\n", gtid));
5878  /* NOTE: the gtid is stored as gitd+1 in the thread-local-storage
5879  * this is because 0 is reserved for the nothing-stored case */
5880 
5881  __kmp_internal_end_thread(gtid);
5882 }
5883 
5884 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB
5885 
5886 __attribute__((destructor)) void __kmp_internal_end_dtor(void) {
5887  __kmp_internal_end_atexit();
5888 }
5889 
5890 #endif
5891 
5892 /* [Windows] josh: when the atexit handler is called, there may still be more
5893  than one thread alive */
5894 void __kmp_internal_end_atexit(void) {
5895  KA_TRACE(30, ("__kmp_internal_end_atexit\n"));
5896  /* [Windows]
5897  josh: ideally, we want to completely shutdown the library in this atexit
5898  handler, but stat code that depends on thread specific data for gtid fails
5899  because that data becomes unavailable at some point during the shutdown, so
5900  we call __kmp_internal_end_thread instead. We should eventually remove the
5901  dependency on __kmp_get_specific_gtid in the stat code and use
5902  __kmp_internal_end_library to cleanly shutdown the library.
5903 
5904  // TODO: Can some of this comment about GVS be removed?
5905  I suspect that the offending stat code is executed when the calling thread
5906  tries to clean up a dead root thread's data structures, resulting in GVS
5907  code trying to close the GVS structures for that thread, but since the stat
5908  code uses __kmp_get_specific_gtid to get the gtid with the assumption that
5909  the calling thread is cleaning up itself instead of another thread, it get
5910  confused. This happens because allowing a thread to unregister and cleanup
5911  another thread is a recent modification for addressing an issue.
5912  Based on the current design (20050722), a thread may end up
5913  trying to unregister another thread only if thread death does not trigger
5914  the calling of __kmp_internal_end_thread. For Linux* OS, there is the
5915  thread specific data destructor function to detect thread death. For
5916  Windows dynamic, there is DllMain(THREAD_DETACH). For Windows static, there
5917  is nothing. Thus, the workaround is applicable only for Windows static
5918  stat library. */
5919  __kmp_internal_end_library(-1);
5920 #if KMP_OS_WINDOWS
5921  __kmp_close_console();
5922 #endif
5923 }
5924 
5925 static void __kmp_reap_thread(kmp_info_t *thread, int is_root) {
5926  // It is assumed __kmp_forkjoin_lock is acquired.
5927 
5928  int gtid;
5929 
5930  KMP_DEBUG_ASSERT(thread != NULL);
5931 
5932  gtid = thread->th.th_info.ds.ds_gtid;
5933 
5934  if (!is_root) {
5935  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
5936  /* Assume the threads are at the fork barrier here */
5937  KA_TRACE(
5938  20, ("__kmp_reap_thread: releasing T#%d from fork barrier for reap\n",
5939  gtid));
5940  /* Need release fence here to prevent seg faults for tree forkjoin barrier
5941  * (GEH) */
5942  ANNOTATE_HAPPENS_BEFORE(thread);
5943  kmp_flag_64<> flag(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go,
5944  thread);
5945  __kmp_release_64(&flag);
5946  }
5947 
5948  // Terminate OS thread.
5949  __kmp_reap_worker(thread);
5950 
5951  // The thread was killed asynchronously. If it was actively
5952  // spinning in the thread pool, decrement the global count.
5953  //
5954  // There is a small timing hole here - if the worker thread was just waking
5955  // up after sleeping in the pool, had reset it's th_active_in_pool flag but
5956  // not decremented the global counter __kmp_thread_pool_active_nth yet, then
5957  // the global counter might not get updated.
5958  //
5959  // Currently, this can only happen as the library is unloaded,
5960  // so there are no harmful side effects.
5961  if (thread->th.th_active_in_pool) {
5962  thread->th.th_active_in_pool = FALSE;
5963  KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth);
5964  KMP_DEBUG_ASSERT(__kmp_thread_pool_active_nth >= 0);
5965  }
5966  }
5967 
5968  __kmp_free_implicit_task(thread);
5969 
5970 // Free the fast memory for tasking
5971 #if USE_FAST_MEMORY
5972  __kmp_free_fast_memory(thread);
5973 #endif /* USE_FAST_MEMORY */
5974 
5975  __kmp_suspend_uninitialize_thread(thread);
5976 
5977  KMP_DEBUG_ASSERT(__kmp_threads[gtid] == thread);
5978  TCW_SYNC_PTR(__kmp_threads[gtid], NULL);
5979 
5980  --__kmp_all_nth;
5981 // __kmp_nth was decremented when thread is added to the pool.
5982 
5983 #ifdef KMP_ADJUST_BLOCKTIME
5984  /* Adjust blocktime back to user setting or default if necessary */
5985  /* Middle initialization might never have occurred */
5986  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
5987  KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
5988  if (__kmp_nth <= __kmp_avail_proc) {
5989  __kmp_zero_bt = FALSE;
5990  }
5991  }
5992 #endif /* KMP_ADJUST_BLOCKTIME */
5993 
5994  /* free the memory being used */
5995  if (__kmp_env_consistency_check) {
5996  if (thread->th.th_cons) {
5997  __kmp_free_cons_stack(thread->th.th_cons);
5998  thread->th.th_cons = NULL;
5999  }
6000  }
6001 
6002  if (thread->th.th_pri_common != NULL) {
6003  __kmp_free(thread->th.th_pri_common);
6004  thread->th.th_pri_common = NULL;
6005  }
6006 
6007  if (thread->th.th_task_state_memo_stack != NULL) {
6008  __kmp_free(thread->th.th_task_state_memo_stack);
6009  thread->th.th_task_state_memo_stack = NULL;
6010  }
6011 
6012 #if KMP_USE_BGET
6013  if (thread->th.th_local.bget_data != NULL) {
6014  __kmp_finalize_bget(thread);
6015  }
6016 #endif
6017 
6018 #if KMP_AFFINITY_SUPPORTED
6019  if (thread->th.th_affin_mask != NULL) {
6020  KMP_CPU_FREE(thread->th.th_affin_mask);
6021  thread->th.th_affin_mask = NULL;
6022  }
6023 #endif /* KMP_AFFINITY_SUPPORTED */
6024 
6025 #if KMP_USE_HIER_SCHED
6026  if (thread->th.th_hier_bar_data != NULL) {
6027  __kmp_free(thread->th.th_hier_bar_data);
6028  thread->th.th_hier_bar_data = NULL;
6029  }
6030 #endif
6031 
6032  __kmp_reap_team(thread->th.th_serial_team);
6033  thread->th.th_serial_team = NULL;
6034  __kmp_free(thread);
6035 
6036  KMP_MB();
6037 
6038 } // __kmp_reap_thread
6039 
6040 static void __kmp_internal_end(void) {
6041  int i;
6042 
6043  /* First, unregister the library */
6044  __kmp_unregister_library();
6045 
6046 #if KMP_OS_WINDOWS
6047  /* In Win static library, we can't tell when a root actually dies, so we
6048  reclaim the data structures for any root threads that have died but not
6049  unregistered themselves, in order to shut down cleanly.
6050  In Win dynamic library we also can't tell when a thread dies. */
6051  __kmp_reclaim_dead_roots(); // AC: moved here to always clean resources of
6052 // dead roots
6053 #endif
6054 
6055  for (i = 0; i < __kmp_threads_capacity; i++)
6056  if (__kmp_root[i])
6057  if (__kmp_root[i]->r.r_active)
6058  break;
6059  KMP_MB(); /* Flush all pending memory write invalidates. */
6060  TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6061 
6062  if (i < __kmp_threads_capacity) {
6063 #if KMP_USE_MONITOR
6064  // 2009-09-08 (lev): Other alive roots found. Why do we kill the monitor??
6065  KMP_MB(); /* Flush all pending memory write invalidates. */
6066 
6067  // Need to check that monitor was initialized before reaping it. If we are
6068  // called form __kmp_atfork_child (which sets __kmp_init_parallel = 0), then
6069  // __kmp_monitor will appear to contain valid data, but it is only valid in
6070  // the parent process, not the child.
6071  // New behavior (201008): instead of keying off of the flag
6072  // __kmp_init_parallel, the monitor thread creation is keyed off
6073  // of the new flag __kmp_init_monitor.
6074  __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
6075  if (TCR_4(__kmp_init_monitor)) {
6076  __kmp_reap_monitor(&__kmp_monitor);
6077  TCW_4(__kmp_init_monitor, 0);
6078  }
6079  __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
6080  KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n"));
6081 #endif // KMP_USE_MONITOR
6082  } else {
6083 /* TODO move this to cleanup code */
6084 #ifdef KMP_DEBUG
6085  /* make sure that everything has properly ended */
6086  for (i = 0; i < __kmp_threads_capacity; i++) {
6087  if (__kmp_root[i]) {
6088  // KMP_ASSERT( ! KMP_UBER_GTID( i ) ); // AC:
6089  // there can be uber threads alive here
6090  KMP_ASSERT(!__kmp_root[i]->r.r_active); // TODO: can they be active?
6091  }
6092  }
6093 #endif
6094 
6095  KMP_MB();
6096 
6097  // Reap the worker threads.
6098  // This is valid for now, but be careful if threads are reaped sooner.
6099  while (__kmp_thread_pool != NULL) { // Loop thru all the thread in the pool.
6100  // Get the next thread from the pool.
6101  kmp_info_t *thread = CCAST(kmp_info_t *, __kmp_thread_pool);
6102  __kmp_thread_pool = thread->th.th_next_pool;
6103  // Reap it.
6104  KMP_DEBUG_ASSERT(thread->th.th_reap_state == KMP_SAFE_TO_REAP);
6105  thread->th.th_next_pool = NULL;
6106  thread->th.th_in_pool = FALSE;
6107  __kmp_reap_thread(thread, 0);
6108  }
6109  __kmp_thread_pool_insert_pt = NULL;
6110 
6111  // Reap teams.
6112  while (__kmp_team_pool != NULL) { // Loop thru all the teams in the pool.
6113  // Get the next team from the pool.
6114  kmp_team_t *team = CCAST(kmp_team_t *, __kmp_team_pool);
6115  __kmp_team_pool = team->t.t_next_pool;
6116  // Reap it.
6117  team->t.t_next_pool = NULL;
6118  __kmp_reap_team(team);
6119  }
6120 
6121  __kmp_reap_task_teams();
6122 
6123 #if KMP_OS_UNIX
6124  // Threads that are not reaped should not access any resources since they
6125  // are going to be deallocated soon, so the shutdown sequence should wait
6126  // until all threads either exit the final spin-waiting loop or begin
6127  // sleeping after the given blocktime.
6128  for (i = 0; i < __kmp_threads_capacity; i++) {
6129  kmp_info_t *thr = __kmp_threads[i];
6130  while (thr && KMP_ATOMIC_LD_ACQ(&thr->th.th_blocking))
6131  KMP_CPU_PAUSE();
6132  }
6133 #endif
6134 
6135  for (i = 0; i < __kmp_threads_capacity; ++i) {
6136  // TBD: Add some checking...
6137  // Something like KMP_DEBUG_ASSERT( __kmp_thread[ i ] == NULL );
6138  }
6139 
6140  /* Make sure all threadprivate destructors get run by joining with all
6141  worker threads before resetting this flag */
6142  TCW_SYNC_4(__kmp_init_common, FALSE);
6143 
6144  KA_TRACE(10, ("__kmp_internal_end: all workers reaped\n"));
6145  KMP_MB();
6146 
6147 #if KMP_USE_MONITOR
6148  // See note above: One of the possible fixes for CQ138434 / CQ140126
6149  //
6150  // FIXME: push both code fragments down and CSE them?
6151  // push them into __kmp_cleanup() ?
6152  __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
6153  if (TCR_4(__kmp_init_monitor)) {
6154  __kmp_reap_monitor(&__kmp_monitor);
6155  TCW_4(__kmp_init_monitor, 0);
6156  }
6157  __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
6158  KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n"));
6159 #endif
6160  } /* else !__kmp_global.t_active */
6161  TCW_4(__kmp_init_gtid, FALSE);
6162  KMP_MB(); /* Flush all pending memory write invalidates. */
6163 
6164  __kmp_cleanup();
6165 #if OMPT_SUPPORT
6166  ompt_fini();
6167 #endif
6168 }
6169 
6170 void __kmp_internal_end_library(int gtid_req) {
6171  /* if we have already cleaned up, don't try again, it wouldn't be pretty */
6172  /* this shouldn't be a race condition because __kmp_internal_end() is the
6173  only place to clear __kmp_serial_init */
6174  /* we'll check this later too, after we get the lock */
6175  // 2009-09-06: We do not set g_abort without setting g_done. This check looks
6176  // redundant, because the next check will work in any case.
6177  if (__kmp_global.g.g_abort) {
6178  KA_TRACE(11, ("__kmp_internal_end_library: abort, exiting\n"));
6179  /* TODO abort? */
6180  return;
6181  }
6182  if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6183  KA_TRACE(10, ("__kmp_internal_end_library: already finished\n"));
6184  return;
6185  }
6186 
6187  KMP_MB(); /* Flush all pending memory write invalidates. */
6188  /* find out who we are and what we should do */
6189  {
6190  int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific();
6191  KA_TRACE(
6192  10, ("__kmp_internal_end_library: enter T#%d (%d)\n", gtid, gtid_req));
6193  if (gtid == KMP_GTID_SHUTDOWN) {
6194  KA_TRACE(10, ("__kmp_internal_end_library: !__kmp_init_runtime, system "
6195  "already shutdown\n"));
6196  return;
6197  } else if (gtid == KMP_GTID_MONITOR) {
6198  KA_TRACE(10, ("__kmp_internal_end_library: monitor thread, gtid not "
6199  "registered, or system shutdown\n"));
6200  return;
6201  } else if (gtid == KMP_GTID_DNE) {
6202  KA_TRACE(10, ("__kmp_internal_end_library: gtid not registered or system "
6203  "shutdown\n"));
6204  /* we don't know who we are, but we may still shutdown the library */
6205  } else if (KMP_UBER_GTID(gtid)) {
6206  /* unregister ourselves as an uber thread. gtid is no longer valid */
6207  if (__kmp_root[gtid]->r.r_active) {
6208  __kmp_global.g.g_abort = -1;
6209  TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6210  __kmp_unregister_library();
6211  KA_TRACE(10,
6212  ("__kmp_internal_end_library: root still active, abort T#%d\n",
6213  gtid));
6214  return;
6215  } else {
6216  KA_TRACE(
6217  10,
6218  ("__kmp_internal_end_library: unregistering sibling T#%d\n", gtid));
6219  __kmp_unregister_root_current_thread(gtid);
6220  }
6221  } else {
6222 /* worker threads may call this function through the atexit handler, if they
6223  * call exit() */
6224 /* For now, skip the usual subsequent processing and just dump the debug buffer.
6225  TODO: do a thorough shutdown instead */
6226 #ifdef DUMP_DEBUG_ON_EXIT
6227  if (__kmp_debug_buf)
6228  __kmp_dump_debug_buffer();
6229 #endif
6230  // added unregister library call here when we switch to shm linux
6231  // if we don't, it will leave lots of files in /dev/shm
6232  // cleanup shared memory file before exiting.
6233  __kmp_unregister_library();
6234  return;
6235  }
6236  }
6237  /* synchronize the termination process */
6238  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6239 
6240  /* have we already finished */
6241  if (__kmp_global.g.g_abort) {
6242  KA_TRACE(10, ("__kmp_internal_end_library: abort, exiting\n"));
6243  /* TODO abort? */
6244  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6245  return;
6246  }
6247  if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6248  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6249  return;
6250  }
6251 
6252  /* We need this lock to enforce mutex between this reading of
6253  __kmp_threads_capacity and the writing by __kmp_register_root.
6254  Alternatively, we can use a counter of roots that is atomically updated by
6255  __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and
6256  __kmp_internal_end_*. */
6257  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
6258 
6259  /* now we can safely conduct the actual termination */
6260  __kmp_internal_end();
6261 
6262  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6263  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6264 
6265  KA_TRACE(10, ("__kmp_internal_end_library: exit\n"));
6266 
6267 #ifdef DUMP_DEBUG_ON_EXIT
6268  if (__kmp_debug_buf)
6269  __kmp_dump_debug_buffer();
6270 #endif
6271 
6272 #if KMP_OS_WINDOWS
6273  __kmp_close_console();
6274 #endif
6275 
6276  __kmp_fini_allocator();
6277 
6278 } // __kmp_internal_end_library
6279 
6280 void __kmp_internal_end_thread(int gtid_req) {
6281  int i;
6282 
6283  /* if we have already cleaned up, don't try again, it wouldn't be pretty */
6284  /* this shouldn't be a race condition because __kmp_internal_end() is the
6285  * only place to clear __kmp_serial_init */
6286  /* we'll check this later too, after we get the lock */
6287  // 2009-09-06: We do not set g_abort without setting g_done. This check looks
6288  // redundant, because the next check will work in any case.
6289  if (__kmp_global.g.g_abort) {
6290  KA_TRACE(11, ("__kmp_internal_end_thread: abort, exiting\n"));
6291  /* TODO abort? */
6292  return;
6293  }
6294  if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6295  KA_TRACE(10, ("__kmp_internal_end_thread: already finished\n"));
6296  return;
6297  }
6298 
6299  // If hidden helper team has been initialized, we need to deinit it
6300  if (TCR_4(__kmp_init_hidden_helper)) {
6301  TCW_SYNC_4(__kmp_hidden_helper_team_done, TRUE);
6302  // First release the main thread to let it continue its work
6303  __kmp_hidden_helper_main_thread_release();
6304  // Wait until the hidden helper team has been destroyed
6305  __kmp_hidden_helper_threads_deinitz_wait();
6306  }
6307 
6308  KMP_MB(); /* Flush all pending memory write invalidates. */
6309 
6310  /* find out who we are and what we should do */
6311  {
6312  int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific();
6313  KA_TRACE(10,
6314  ("__kmp_internal_end_thread: enter T#%d (%d)\n", gtid, gtid_req));
6315  if (gtid == KMP_GTID_SHUTDOWN) {
6316  KA_TRACE(10, ("__kmp_internal_end_thread: !__kmp_init_runtime, system "
6317  "already shutdown\n"));
6318  return;
6319  } else if (gtid == KMP_GTID_MONITOR) {
6320  KA_TRACE(10, ("__kmp_internal_end_thread: monitor thread, gtid not "
6321  "registered, or system shutdown\n"));
6322  return;
6323  } else if (gtid == KMP_GTID_DNE) {
6324  KA_TRACE(10, ("__kmp_internal_end_thread: gtid not registered or system "
6325  "shutdown\n"));
6326  return;
6327  /* we don't know who we are */
6328  } else if (KMP_UBER_GTID(gtid)) {
6329  /* unregister ourselves as an uber thread. gtid is no longer valid */
6330  if (__kmp_root[gtid]->r.r_active) {
6331  __kmp_global.g.g_abort = -1;
6332  TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6333  KA_TRACE(10,
6334  ("__kmp_internal_end_thread: root still active, abort T#%d\n",
6335  gtid));
6336  return;
6337  } else {
6338  KA_TRACE(10, ("__kmp_internal_end_thread: unregistering sibling T#%d\n",
6339  gtid));
6340  __kmp_unregister_root_current_thread(gtid);
6341  }
6342  } else {
6343  /* just a worker thread, let's leave */
6344  KA_TRACE(10, ("__kmp_internal_end_thread: worker thread T#%d\n", gtid));
6345 
6346  if (gtid >= 0) {
6347  __kmp_threads[gtid]->th.th_task_team = NULL;
6348  }
6349 
6350  KA_TRACE(10,
6351  ("__kmp_internal_end_thread: worker thread done, exiting T#%d\n",
6352  gtid));
6353  return;
6354  }
6355  }
6356 #if KMP_DYNAMIC_LIB
6357  if (__kmp_pause_status != kmp_hard_paused)
6358  // AC: lets not shutdown the dynamic library at the exit of uber thread,
6359  // because we will better shutdown later in the library destructor.
6360  {
6361  KA_TRACE(10, ("__kmp_internal_end_thread: exiting T#%d\n", gtid_req));
6362  return;
6363  }
6364 #endif
6365  /* synchronize the termination process */
6366  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6367 
6368  /* have we already finished */
6369  if (__kmp_global.g.g_abort) {
6370  KA_TRACE(10, ("__kmp_internal_end_thread: abort, exiting\n"));
6371  /* TODO abort? */
6372  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6373  return;
6374  }
6375  if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6376  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6377  return;
6378  }
6379 
6380  /* We need this lock to enforce mutex between this reading of
6381  __kmp_threads_capacity and the writing by __kmp_register_root.
6382  Alternatively, we can use a counter of roots that is atomically updated by
6383  __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and
6384  __kmp_internal_end_*. */
6385 
6386  /* should we finish the run-time? are all siblings done? */
6387  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
6388 
6389  for (i = 0; i < __kmp_threads_capacity; ++i) {
6390  if (KMP_UBER_GTID(i)) {
6391  KA_TRACE(
6392  10,
6393  ("__kmp_internal_end_thread: remaining sibling task: gtid==%d\n", i));
6394  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6395  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6396  return;
6397  }
6398  }
6399 
6400  /* now we can safely conduct the actual termination */
6401 
6402  __kmp_internal_end();
6403 
6404  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6405  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6406 
6407  KA_TRACE(10, ("__kmp_internal_end_thread: exit T#%d\n", gtid_req));
6408 
6409 #ifdef DUMP_DEBUG_ON_EXIT
6410  if (__kmp_debug_buf)
6411  __kmp_dump_debug_buffer();
6412 #endif
6413 } // __kmp_internal_end_thread
6414 
6415 // -----------------------------------------------------------------------------
6416 // Library registration stuff.
6417 
6418 static long __kmp_registration_flag = 0;
6419 // Random value used to indicate library initialization.
6420 static char *__kmp_registration_str = NULL;
6421 // Value to be saved in env var __KMP_REGISTERED_LIB_<pid>.
6422 
6423 static inline char *__kmp_reg_status_name() {
6424 /* On RHEL 3u5 if linked statically, getpid() returns different values in
6425  each thread. If registration and unregistration go in different threads
6426  (omp_misc_other_root_exit.cpp test case), the name of registered_lib_env
6427  env var can not be found, because the name will contain different pid. */
6428 // macOS* complains about name being too long with additional getuid()
6429 #if KMP_OS_UNIX && !KMP_OS_DARWIN && KMP_DYNAMIC_LIB
6430  return __kmp_str_format("__KMP_REGISTERED_LIB_%d_%d", (int)getpid(),
6431  (int)getuid());
6432 #else
6433  return __kmp_str_format("__KMP_REGISTERED_LIB_%d", (int)getpid());
6434 #endif
6435 } // __kmp_reg_status_get
6436 
6437 void __kmp_register_library_startup(void) {
6438 
6439  char *name = __kmp_reg_status_name(); // Name of the environment variable.
6440  int done = 0;
6441  union {
6442  double dtime;
6443  long ltime;
6444  } time;
6445 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
6446  __kmp_initialize_system_tick();
6447 #endif
6448  __kmp_read_system_time(&time.dtime);
6449  __kmp_registration_flag = 0xCAFE0000L | (time.ltime & 0x0000FFFFL);
6450  __kmp_registration_str =
6451  __kmp_str_format("%p-%lx-%s", &__kmp_registration_flag,
6452  __kmp_registration_flag, KMP_LIBRARY_FILE);
6453 
6454  KA_TRACE(50, ("__kmp_register_library_startup: %s=\"%s\"\n", name,
6455  __kmp_registration_str));
6456 
6457  while (!done) {
6458 
6459  char *value = NULL; // Actual value of the environment variable.
6460 
6461 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library
6462  char *shm_name = __kmp_str_format("/%s", name);
6463  int shm_preexist = 0;
6464  char *data1;
6465  int fd1 = shm_open(shm_name, O_CREAT | O_EXCL | O_RDWR, 0666);
6466  if ((fd1 == -1) && (errno == EEXIST)) {
6467  // file didn't open because it already exists.
6468  // try opening existing file
6469  fd1 = shm_open(shm_name, O_RDWR, 0666);
6470  if (fd1 == -1) { // file didn't open
6471  // error out here
6472  __kmp_fatal(KMP_MSG(FunctionError, "Can't open SHM"), KMP_ERR(0),
6473  __kmp_msg_null);
6474  } else {
6475  // able to open existing file
6476  shm_preexist = 1;
6477  }
6478  } else if (fd1 == -1) { // SHM didn't open; it was due to error other than
6479  // already exists.
6480  // error out here.
6481  __kmp_fatal(KMP_MSG(FunctionError, "Can't open SHM2"), KMP_ERR(errno),
6482  __kmp_msg_null);
6483  }
6484  if (shm_preexist == 0) {
6485  // we created SHM now set size
6486  if (ftruncate(fd1, SHM_SIZE) == -1) {
6487  // error occured setting size;
6488  __kmp_fatal(KMP_MSG(FunctionError, "Can't set size of SHM"),
6489  KMP_ERR(errno), __kmp_msg_null);
6490  }
6491  }
6492  data1 =
6493  (char *)mmap(0, SHM_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED, fd1, 0);
6494  if (data1 == MAP_FAILED) {
6495  // failed to map shared memory
6496  __kmp_fatal(KMP_MSG(FunctionError, "Can't map SHM"), KMP_ERR(errno),
6497  __kmp_msg_null);
6498  }
6499  if (shm_preexist == 0) { // set data to SHM, set value
6500  KMP_STRCPY_S(data1, SHM_SIZE, __kmp_registration_str);
6501  }
6502  // Read value from either what we just wrote or existing file.
6503  value = __kmp_str_format("%s", data1); // read value from SHM
6504  munmap(data1, SHM_SIZE);
6505  close(fd1);
6506 #else // Windows and unix with static library
6507  // Set environment variable, but do not overwrite if it is exist.
6508  __kmp_env_set(name, __kmp_registration_str, 0);
6509  // read value to see if it got set
6510  value = __kmp_env_get(name);
6511 #endif
6512 
6513  if (value != NULL && strcmp(value, __kmp_registration_str) == 0) {
6514  done = 1; // Ok, environment variable set successfully, exit the loop.
6515  } else {
6516  // Oops. Write failed. Another copy of OpenMP RTL is in memory.
6517  // Check whether it alive or dead.
6518  int neighbor = 0; // 0 -- unknown status, 1 -- alive, 2 -- dead.
6519  char *tail = value;
6520  char *flag_addr_str = NULL;
6521  char *flag_val_str = NULL;
6522  char const *file_name = NULL;
6523  __kmp_str_split(tail, '-', &flag_addr_str, &tail);
6524  __kmp_str_split(tail, '-', &flag_val_str, &tail);
6525  file_name = tail;
6526  if (tail != NULL) {
6527  long *flag_addr = 0;
6528  long flag_val = 0;
6529  KMP_SSCANF(flag_addr_str, "%p", RCAST(void**, &flag_addr));
6530  KMP_SSCANF(flag_val_str, "%lx", &flag_val);
6531  if (flag_addr != 0 && flag_val != 0 && strcmp(file_name, "") != 0) {
6532  // First, check whether environment-encoded address is mapped into
6533  // addr space.
6534  // If so, dereference it to see if it still has the right value.
6535  if (__kmp_is_address_mapped(flag_addr) && *flag_addr == flag_val) {
6536  neighbor = 1;
6537  } else {
6538  // If not, then we know the other copy of the library is no longer
6539  // running.
6540  neighbor = 2;
6541  }
6542  }
6543  }
6544  switch (neighbor) {
6545  case 0: // Cannot parse environment variable -- neighbor status unknown.
6546  // Assume it is the incompatible format of future version of the
6547  // library. Assume the other library is alive.
6548  // WARN( ... ); // TODO: Issue a warning.
6549  file_name = "unknown library";
6550  KMP_FALLTHROUGH();
6551  // Attention! Falling to the next case. That's intentional.
6552  case 1: { // Neighbor is alive.
6553  // Check it is allowed.
6554  char *duplicate_ok = __kmp_env_get("KMP_DUPLICATE_LIB_OK");
6555  if (!__kmp_str_match_true(duplicate_ok)) {
6556  // That's not allowed. Issue fatal error.
6557  __kmp_fatal(KMP_MSG(DuplicateLibrary, KMP_LIBRARY_FILE, file_name),
6558  KMP_HNT(DuplicateLibrary), __kmp_msg_null);
6559  }
6560  KMP_INTERNAL_FREE(duplicate_ok);
6561  __kmp_duplicate_library_ok = 1;
6562  done = 1; // Exit the loop.
6563  } break;
6564  case 2: { // Neighbor is dead.
6565 
6566 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library
6567  // close shared memory.
6568  shm_unlink(shm_name); // this removes file in /dev/shm
6569 #else
6570  // Clear the variable and try to register library again.
6571  __kmp_env_unset(name);
6572 #endif
6573  } break;
6574  default: { KMP_DEBUG_ASSERT(0); } break;
6575  }
6576  }
6577  KMP_INTERNAL_FREE((void *)value);
6578 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library
6579  KMP_INTERNAL_FREE((void *)shm_name);
6580 #endif
6581  } // while
6582  KMP_INTERNAL_FREE((void *)name);
6583 
6584 } // func __kmp_register_library_startup
6585 
6586 void __kmp_unregister_library(void) {
6587 
6588  char *name = __kmp_reg_status_name();
6589  char *value = NULL;
6590 
6591 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library
6592  char *shm_name = __kmp_str_format("/%s", name);
6593  int fd1 = shm_open(shm_name, O_RDONLY, 0666);
6594  if (fd1 == -1) {
6595  // file did not open. return.
6596  return;
6597  }
6598  char *data1 = (char *)mmap(0, SHM_SIZE, PROT_READ, MAP_SHARED, fd1, 0);
6599  if (data1 != MAP_FAILED) {
6600  value = __kmp_str_format("%s", data1); // read value from SHM
6601  munmap(data1, SHM_SIZE);
6602  }
6603  close(fd1);
6604 #else
6605  value = __kmp_env_get(name);
6606 #endif
6607 
6608  KMP_DEBUG_ASSERT(__kmp_registration_flag != 0);
6609  KMP_DEBUG_ASSERT(__kmp_registration_str != NULL);
6610  if (value != NULL && strcmp(value, __kmp_registration_str) == 0) {
6611 // Ok, this is our variable. Delete it.
6612 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library
6613  shm_unlink(shm_name); // this removes file in /dev/shm
6614 #else
6615  __kmp_env_unset(name);
6616 #endif
6617  }
6618 
6619 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library
6620  KMP_INTERNAL_FREE(shm_name);
6621 #endif
6622 
6623  KMP_INTERNAL_FREE(__kmp_registration_str);
6624  KMP_INTERNAL_FREE(value);
6625  KMP_INTERNAL_FREE(name);
6626 
6627  __kmp_registration_flag = 0;
6628  __kmp_registration_str = NULL;
6629 
6630 } // __kmp_unregister_library
6631 
6632 // End of Library registration stuff.
6633 // -----------------------------------------------------------------------------
6634 
6635 #if KMP_MIC_SUPPORTED
6636 
6637 static void __kmp_check_mic_type() {
6638  kmp_cpuid_t cpuid_state = {0};
6639  kmp_cpuid_t *cs_p = &cpuid_state;
6640  __kmp_x86_cpuid(1, 0, cs_p);
6641  // We don't support mic1 at the moment
6642  if ((cs_p->eax & 0xff0) == 0xB10) {
6643  __kmp_mic_type = mic2;
6644  } else if ((cs_p->eax & 0xf0ff0) == 0x50670) {
6645  __kmp_mic_type = mic3;
6646  } else {
6647  __kmp_mic_type = non_mic;
6648  }
6649 }
6650 
6651 #endif /* KMP_MIC_SUPPORTED */
6652 
6653 #if KMP_HAVE_UMWAIT
6654 static void __kmp_user_level_mwait_init() {
6655  struct kmp_cpuid buf;
6656  __kmp_x86_cpuid(7, 0, &buf);
6657  __kmp_umwait_enabled = ((buf.ecx >> 5) & 1) && __kmp_user_level_mwait;
6658  KF_TRACE(30, ("__kmp_user_level_mwait_init: __kmp_umwait_enabled = %d\n",
6659  __kmp_umwait_enabled));
6660 }
6661 #elif KMP_HAVE_MWAIT
6662 #ifndef AT_INTELPHIUSERMWAIT
6663 // Spurious, non-existent value that should always fail to return anything.
6664 // Will be replaced with the correct value when we know that.
6665 #define AT_INTELPHIUSERMWAIT 10000
6666 #endif
6667 // getauxval() function is available in RHEL7 and SLES12. If a system with an
6668 // earlier OS is used to build the RTL, we'll use the following internal
6669 // function when the entry is not found.
6670 unsigned long getauxval(unsigned long) KMP_WEAK_ATTRIBUTE_EXTERNAL;
6671 unsigned long getauxval(unsigned long) { return 0; }
6672 
6673 static void __kmp_user_level_mwait_init() {
6674  // When getauxval() and correct value of AT_INTELPHIUSERMWAIT are available
6675  // use them to find if the user-level mwait is enabled. Otherwise, forcibly
6676  // set __kmp_mwait_enabled=TRUE on Intel MIC if the environment variable
6677  // KMP_USER_LEVEL_MWAIT was set to TRUE.
6678  if (__kmp_mic_type == mic3) {
6679  unsigned long res = getauxval(AT_INTELPHIUSERMWAIT);
6680  if ((res & 0x1) || __kmp_user_level_mwait) {
6681  __kmp_mwait_enabled = TRUE;
6682  if (__kmp_user_level_mwait) {
6683  KMP_INFORM(EnvMwaitWarn);
6684  }
6685  } else {
6686  __kmp_mwait_enabled = FALSE;
6687  }
6688  }
6689  KF_TRACE(30, ("__kmp_user_level_mwait_init: __kmp_mic_type = %d, "
6690  "__kmp_mwait_enabled = %d\n",
6691  __kmp_mic_type, __kmp_mwait_enabled));
6692 }
6693 #endif /* KMP_HAVE_UMWAIT */
6694 
6695 static void __kmp_do_serial_initialize(void) {
6696  int i, gtid;
6697  size_t size;
6698 
6699  KA_TRACE(10, ("__kmp_do_serial_initialize: enter\n"));
6700 
6701  KMP_DEBUG_ASSERT(sizeof(kmp_int32) == 4);
6702  KMP_DEBUG_ASSERT(sizeof(kmp_uint32) == 4);
6703  KMP_DEBUG_ASSERT(sizeof(kmp_int64) == 8);
6704  KMP_DEBUG_ASSERT(sizeof(kmp_uint64) == 8);
6705  KMP_DEBUG_ASSERT(sizeof(kmp_intptr_t) == sizeof(void *));
6706 
6707 #if OMPT_SUPPORT
6708  ompt_pre_init();
6709 #endif
6710 
6711  __kmp_validate_locks();
6712 
6713  /* Initialize internal memory allocator */
6714  __kmp_init_allocator();
6715 
6716  /* Register the library startup via an environment variable and check to see
6717  whether another copy of the library is already registered. */
6718 
6719  __kmp_register_library_startup();
6720 
6721  /* TODO reinitialization of library */
6722  if (TCR_4(__kmp_global.g.g_done)) {
6723  KA_TRACE(10, ("__kmp_do_serial_initialize: reinitialization of library\n"));
6724  }
6725 
6726  __kmp_global.g.g_abort = 0;
6727  TCW_SYNC_4(__kmp_global.g.g_done, FALSE);
6728 
6729 /* initialize the locks */
6730 #if KMP_USE_ADAPTIVE_LOCKS
6731 #if KMP_DEBUG_ADAPTIVE_LOCKS
6732  __kmp_init_speculative_stats();
6733 #endif
6734 #endif
6735 #if KMP_STATS_ENABLED
6736  __kmp_stats_init();
6737 #endif
6738  __kmp_init_lock(&__kmp_global_lock);
6739  __kmp_init_queuing_lock(&__kmp_dispatch_lock);
6740  __kmp_init_lock(&__kmp_debug_lock);
6741  __kmp_init_atomic_lock(&__kmp_atomic_lock);
6742  __kmp_init_atomic_lock(&__kmp_atomic_lock_1i);
6743  __kmp_init_atomic_lock(&__kmp_atomic_lock_2i);
6744  __kmp_init_atomic_lock(&__kmp_atomic_lock_4i);
6745  __kmp_init_atomic_lock(&__kmp_atomic_lock_4r);
6746  __kmp_init_atomic_lock(&__kmp_atomic_lock_8i);
6747  __kmp_init_atomic_lock(&__kmp_atomic_lock_8r);
6748  __kmp_init_atomic_lock(&__kmp_atomic_lock_8c);
6749  __kmp_init_atomic_lock(&__kmp_atomic_lock_10r);
6750  __kmp_init_atomic_lock(&__kmp_atomic_lock_16r);
6751  __kmp_init_atomic_lock(&__kmp_atomic_lock_16c);
6752  __kmp_init_atomic_lock(&__kmp_atomic_lock_20c);
6753  __kmp_init_atomic_lock(&__kmp_atomic_lock_32c);
6754  __kmp_init_bootstrap_lock(&__kmp_forkjoin_lock);
6755  __kmp_init_bootstrap_lock(&__kmp_exit_lock);
6756 #if KMP_USE_MONITOR
6757  __kmp_init_bootstrap_lock(&__kmp_monitor_lock);
6758 #endif
6759  __kmp_init_bootstrap_lock(&__kmp_tp_cached_lock);
6760 
6761  /* conduct initialization and initial setup of configuration */
6762 
6763  __kmp_runtime_initialize();
6764 
6765 #if KMP_MIC_SUPPORTED
6766  __kmp_check_mic_type();
6767 #endif
6768 
6769 // Some global variable initialization moved here from kmp_env_initialize()
6770 #ifdef KMP_DEBUG
6771  kmp_diag = 0;
6772 #endif
6773  __kmp_abort_delay = 0;
6774 
6775  // From __kmp_init_dflt_team_nth()
6776  /* assume the entire machine will be used */
6777  __kmp_dflt_team_nth_ub = __kmp_xproc;
6778  if (__kmp_dflt_team_nth_ub < KMP_MIN_NTH) {
6779  __kmp_dflt_team_nth_ub = KMP_MIN_NTH;
6780  }
6781  if (__kmp_dflt_team_nth_ub > __kmp_sys_max_nth) {
6782  __kmp_dflt_team_nth_ub = __kmp_sys_max_nth;
6783  }
6784  __kmp_max_nth = __kmp_sys_max_nth;
6785  __kmp_cg_max_nth = __kmp_sys_max_nth;
6786  __kmp_teams_max_nth = __kmp_xproc; // set a "reasonable" default
6787  if (__kmp_teams_max_nth > __kmp_sys_max_nth) {
6788  __kmp_teams_max_nth = __kmp_sys_max_nth;
6789  }
6790 
6791  // Three vars below moved here from __kmp_env_initialize() "KMP_BLOCKTIME"
6792  // part
6793  __kmp_dflt_blocktime = KMP_DEFAULT_BLOCKTIME;
6794 #if KMP_USE_MONITOR
6795  __kmp_monitor_wakeups =
6796  KMP_WAKEUPS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups);
6797  __kmp_bt_intervals =
6798  KMP_INTERVALS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups);
6799 #endif
6800  // From "KMP_LIBRARY" part of __kmp_env_initialize()
6801  __kmp_library = library_throughput;
6802  // From KMP_SCHEDULE initialization
6803  __kmp_static = kmp_sch_static_balanced;
6804 // AC: do not use analytical here, because it is non-monotonous
6805 //__kmp_guided = kmp_sch_guided_iterative_chunked;
6806 //__kmp_auto = kmp_sch_guided_analytical_chunked; // AC: it is the default, no
6807 // need to repeat assignment
6808 // Barrier initialization. Moved here from __kmp_env_initialize() Barrier branch
6809 // bit control and barrier method control parts
6810 #if KMP_FAST_REDUCTION_BARRIER
6811 #define kmp_reduction_barrier_gather_bb ((int)1)
6812 #define kmp_reduction_barrier_release_bb ((int)1)
6813 #define kmp_reduction_barrier_gather_pat bp_hyper_bar
6814 #define kmp_reduction_barrier_release_pat bp_hyper_bar
6815 #endif // KMP_FAST_REDUCTION_BARRIER
6816  for (i = bs_plain_barrier; i < bs_last_barrier; i++) {
6817  __kmp_barrier_gather_branch_bits[i] = __kmp_barrier_gather_bb_dflt;
6818  __kmp_barrier_release_branch_bits[i] = __kmp_barrier_release_bb_dflt;
6819  __kmp_barrier_gather_pattern[i] = __kmp_barrier_gather_pat_dflt;
6820  __kmp_barrier_release_pattern[i] = __kmp_barrier_release_pat_dflt;
6821 #if KMP_FAST_REDUCTION_BARRIER
6822  if (i == bs_reduction_barrier) { // tested and confirmed on ALTIX only (
6823  // lin_64 ): hyper,1
6824  __kmp_barrier_gather_branch_bits[i] = kmp_reduction_barrier_gather_bb;
6825  __kmp_barrier_release_branch_bits[i] = kmp_reduction_barrier_release_bb;
6826  __kmp_barrier_gather_pattern[i] = kmp_reduction_barrier_gather_pat;
6827  __kmp_barrier_release_pattern[i] = kmp_reduction_barrier_release_pat;
6828  }
6829 #endif // KMP_FAST_REDUCTION_BARRIER
6830  }
6831 #if KMP_FAST_REDUCTION_BARRIER
6832 #undef kmp_reduction_barrier_release_pat
6833 #undef kmp_reduction_barrier_gather_pat
6834 #undef kmp_reduction_barrier_release_bb
6835 #undef kmp_reduction_barrier_gather_bb
6836 #endif // KMP_FAST_REDUCTION_BARRIER
6837 #if KMP_MIC_SUPPORTED
6838  if (__kmp_mic_type == mic2) { // KNC
6839  // AC: plane=3,2, forkjoin=2,1 are optimal for 240 threads on KNC
6840  __kmp_barrier_gather_branch_bits[bs_plain_barrier] = 3; // plain gather
6841  __kmp_barrier_release_branch_bits[bs_forkjoin_barrier] =
6842  1; // forkjoin release
6843  __kmp_barrier_gather_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar;
6844  __kmp_barrier_release_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar;
6845  }
6846 #if KMP_FAST_REDUCTION_BARRIER
6847  if (__kmp_mic_type == mic2) { // KNC
6848  __kmp_barrier_gather_pattern[bs_reduction_barrier] = bp_hierarchical_bar;
6849  __kmp_barrier_release_pattern[bs_reduction_barrier] = bp_hierarchical_bar;
6850  }
6851 #endif // KMP_FAST_REDUCTION_BARRIER
6852 #endif // KMP_MIC_SUPPORTED
6853 
6854 // From KMP_CHECKS initialization
6855 #ifdef KMP_DEBUG
6856  __kmp_env_checks = TRUE; /* development versions have the extra checks */
6857 #else
6858  __kmp_env_checks = FALSE; /* port versions do not have the extra checks */
6859 #endif
6860 
6861  // From "KMP_FOREIGN_THREADS_THREADPRIVATE" initialization
6862  __kmp_foreign_tp = TRUE;
6863 
6864  __kmp_global.g.g_dynamic = FALSE;
6865  __kmp_global.g.g_dynamic_mode = dynamic_default;
6866 
6867  __kmp_env_initialize(NULL);
6868 
6869 #if KMP_HAVE_MWAIT || KMP_HAVE_UMWAIT
6870  __kmp_user_level_mwait_init();
6871 #endif
6872 // Print all messages in message catalog for testing purposes.
6873 #ifdef KMP_DEBUG
6874  char const *val = __kmp_env_get("KMP_DUMP_CATALOG");
6875  if (__kmp_str_match_true(val)) {
6876  kmp_str_buf_t buffer;
6877  __kmp_str_buf_init(&buffer);
6878  __kmp_i18n_dump_catalog(&buffer);
6879  __kmp_printf("%s", buffer.str);
6880  __kmp_str_buf_free(&buffer);
6881  }
6882  __kmp_env_free(&val);
6883 #endif
6884 
6885  __kmp_threads_capacity =
6886  __kmp_initial_threads_capacity(__kmp_dflt_team_nth_ub);
6887  // Moved here from __kmp_env_initialize() "KMP_ALL_THREADPRIVATE" part
6888  __kmp_tp_capacity = __kmp_default_tp_capacity(
6889  __kmp_dflt_team_nth_ub, __kmp_max_nth, __kmp_allThreadsSpecified);
6890 
6891  // If the library is shut down properly, both pools must be NULL. Just in
6892  // case, set them to NULL -- some memory may leak, but subsequent code will
6893  // work even if pools are not freed.
6894  KMP_DEBUG_ASSERT(__kmp_thread_pool == NULL);
6895  KMP_DEBUG_ASSERT(__kmp_thread_pool_insert_pt == NULL);
6896  KMP_DEBUG_ASSERT(__kmp_team_pool == NULL);
6897  __kmp_thread_pool = NULL;
6898  __kmp_thread_pool_insert_pt = NULL;
6899  __kmp_team_pool = NULL;
6900 
6901  /* Allocate all of the variable sized records */
6902  /* NOTE: __kmp_threads_capacity entries are allocated, but the arrays are
6903  * expandable */
6904  /* Since allocation is cache-aligned, just add extra padding at the end */
6905  size =
6906  (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * __kmp_threads_capacity +
6907  CACHE_LINE;
6908  __kmp_threads = (kmp_info_t **)__kmp_allocate(size);
6909  __kmp_root = (kmp_root_t **)((char *)__kmp_threads +
6910  sizeof(kmp_info_t *) * __kmp_threads_capacity);
6911 
6912  /* init thread counts */
6913  KMP_DEBUG_ASSERT(__kmp_all_nth ==
6914  0); // Asserts fail if the library is reinitializing and
6915  KMP_DEBUG_ASSERT(__kmp_nth == 0); // something was wrong in termination.
6916  __kmp_all_nth = 0;
6917  __kmp_nth = 0;
6918 
6919  /* setup the uber master thread and hierarchy */
6920  gtid = __kmp_register_root(TRUE);
6921  KA_TRACE(10, ("__kmp_do_serial_initialize T#%d\n", gtid));
6922  KMP_ASSERT(KMP_UBER_GTID(gtid));
6923  KMP_ASSERT(KMP_INITIAL_GTID(gtid));
6924 
6925  KMP_MB(); /* Flush all pending memory write invalidates. */
6926 
6927  __kmp_common_initialize();
6928 
6929 #if KMP_OS_UNIX
6930  /* invoke the child fork handler */
6931  __kmp_register_atfork();
6932 #endif
6933 
6934 #if !KMP_DYNAMIC_LIB
6935  {
6936  /* Invoke the exit handler when the program finishes, only for static
6937  library. For dynamic library, we already have _fini and DllMain. */
6938  int rc = atexit(__kmp_internal_end_atexit);
6939  if (rc != 0) {
6940  __kmp_fatal(KMP_MSG(FunctionError, "atexit()"), KMP_ERR(rc),
6941  __kmp_msg_null);
6942  }
6943  }
6944 #endif
6945 
6946 #if KMP_HANDLE_SIGNALS
6947 #if KMP_OS_UNIX
6948  /* NOTE: make sure that this is called before the user installs their own
6949  signal handlers so that the user handlers are called first. this way they
6950  can return false, not call our handler, avoid terminating the library, and
6951  continue execution where they left off. */
6952  __kmp_install_signals(FALSE);
6953 #endif /* KMP_OS_UNIX */
6954 #if KMP_OS_WINDOWS
6955  __kmp_install_signals(TRUE);
6956 #endif /* KMP_OS_WINDOWS */
6957 #endif
6958 
6959  /* we have finished the serial initialization */
6960  __kmp_init_counter++;
6961 
6962  __kmp_init_serial = TRUE;
6963 
6964  if (__kmp_settings) {
6965  __kmp_env_print();
6966  }
6967 
6968  if (__kmp_display_env || __kmp_display_env_verbose) {
6969  __kmp_env_print_2();
6970  }
6971 
6972 #if OMPT_SUPPORT
6973  ompt_post_init();
6974 #endif
6975 
6976  KMP_MB();
6977 
6978  KA_TRACE(10, ("__kmp_do_serial_initialize: exit\n"));
6979 }
6980 
6981 void __kmp_serial_initialize(void) {
6982  if (__kmp_init_serial) {
6983  return;
6984  }
6985  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6986  if (__kmp_init_serial) {
6987  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6988  return;
6989  }
6990  __kmp_do_serial_initialize();
6991  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6992 }
6993 
6994 static void __kmp_do_middle_initialize(void) {
6995  int i, j;
6996  int prev_dflt_team_nth;
6997 
6998  if (!__kmp_init_serial) {
6999  __kmp_do_serial_initialize();
7000  }
7001 
7002  KA_TRACE(10, ("__kmp_middle_initialize: enter\n"));
7003 
7004  // Save the previous value for the __kmp_dflt_team_nth so that
7005  // we can avoid some reinitialization if it hasn't changed.
7006  prev_dflt_team_nth = __kmp_dflt_team_nth;
7007 
7008 #if KMP_AFFINITY_SUPPORTED
7009  // __kmp_affinity_initialize() will try to set __kmp_ncores to the
7010  // number of cores on the machine.
7011  __kmp_affinity_initialize();
7012 
7013  // Run through the __kmp_threads array and set the affinity mask
7014  // for each root thread that is currently registered with the RTL.
7015  for (i = 0; i < __kmp_threads_capacity; i++) {
7016  if (TCR_PTR(__kmp_threads[i]) != NULL) {
7017  __kmp_affinity_set_init_mask(i, TRUE);
7018  }
7019  }
7020 #endif /* KMP_AFFINITY_SUPPORTED */
7021 
7022  KMP_ASSERT(__kmp_xproc > 0);
7023  if (__kmp_avail_proc == 0) {
7024  __kmp_avail_proc = __kmp_xproc;
7025  }
7026 
7027  // If there were empty places in num_threads list (OMP_NUM_THREADS=,,2,3),
7028  // correct them now
7029  j = 0;
7030  while ((j < __kmp_nested_nth.used) && !__kmp_nested_nth.nth[j]) {
7031  __kmp_nested_nth.nth[j] = __kmp_dflt_team_nth = __kmp_dflt_team_nth_ub =
7032  __kmp_avail_proc;
7033  j++;
7034  }
7035 
7036  if (__kmp_dflt_team_nth == 0) {
7037 #ifdef KMP_DFLT_NTH_CORES
7038  // Default #threads = #cores
7039  __kmp_dflt_team_nth = __kmp_ncores;
7040  KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = "
7041  "__kmp_ncores (%d)\n",
7042  __kmp_dflt_team_nth));
7043 #else
7044  // Default #threads = #available OS procs
7045  __kmp_dflt_team_nth = __kmp_avail_proc;
7046  KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = "
7047  "__kmp_avail_proc(%d)\n",
7048  __kmp_dflt_team_nth));
7049 #endif /* KMP_DFLT_NTH_CORES */
7050  }
7051 
7052  if (__kmp_dflt_team_nth < KMP_MIN_NTH) {
7053  __kmp_dflt_team_nth = KMP_MIN_NTH;
7054  }
7055  if (__kmp_dflt_team_nth > __kmp_sys_max_nth) {
7056  __kmp_dflt_team_nth = __kmp_sys_max_nth;
7057  }
7058 
7059  // There's no harm in continuing if the following check fails,
7060  // but it indicates an error in the previous logic.
7061  KMP_DEBUG_ASSERT(__kmp_dflt_team_nth <= __kmp_dflt_team_nth_ub);
7062 
7063  if (__kmp_dflt_team_nth != prev_dflt_team_nth) {
7064  // Run through the __kmp_threads array and set the num threads icv for each
7065  // root thread that is currently registered with the RTL (which has not
7066  // already explicitly set its nthreads-var with a call to
7067  // omp_set_num_threads()).
7068  for (i = 0; i < __kmp_threads_capacity; i++) {
7069  kmp_info_t *thread = __kmp_threads[i];
7070  if (thread == NULL)
7071  continue;
7072  if (thread->th.th_current_task->td_icvs.nproc != 0)
7073  continue;
7074 
7075  set__nproc(__kmp_threads[i], __kmp_dflt_team_nth);
7076  }
7077  }
7078  KA_TRACE(
7079  20,
7080  ("__kmp_middle_initialize: final value for __kmp_dflt_team_nth = %d\n",
7081  __kmp_dflt_team_nth));
7082 
7083 #ifdef KMP_ADJUST_BLOCKTIME
7084  /* Adjust blocktime to zero if necessary now that __kmp_avail_proc is set */
7085  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
7086  KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
7087  if (__kmp_nth > __kmp_avail_proc) {
7088  __kmp_zero_bt = TRUE;
7089  }
7090  }
7091 #endif /* KMP_ADJUST_BLOCKTIME */
7092 
7093  /* we have finished middle initialization */
7094  TCW_SYNC_4(__kmp_init_middle, TRUE);
7095 
7096  KA_TRACE(10, ("__kmp_do_middle_initialize: exit\n"));
7097 }
7098 
7099 void __kmp_middle_initialize(void) {
7100  if (__kmp_init_middle) {
7101  return;
7102  }
7103  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7104  if (__kmp_init_middle) {
7105  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7106  return;
7107  }
7108  __kmp_do_middle_initialize();
7109  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7110 }
7111 
7112 void __kmp_parallel_initialize(void) {
7113  int gtid = __kmp_entry_gtid(); // this might be a new root
7114 
7115  /* synchronize parallel initialization (for sibling) */
7116  if (TCR_4(__kmp_init_parallel))
7117  return;
7118  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7119  if (TCR_4(__kmp_init_parallel)) {
7120  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7121  return;
7122  }
7123 
7124  /* TODO reinitialization after we have already shut down */
7125  if (TCR_4(__kmp_global.g.g_done)) {
7126  KA_TRACE(
7127  10,
7128  ("__kmp_parallel_initialize: attempt to init while shutting down\n"));
7129  __kmp_infinite_loop();
7130  }
7131 
7132  /* jc: The lock __kmp_initz_lock is already held, so calling
7133  __kmp_serial_initialize would cause a deadlock. So we call
7134  __kmp_do_serial_initialize directly. */
7135  if (!__kmp_init_middle) {
7136  __kmp_do_middle_initialize();
7137  }
7138  __kmp_resume_if_hard_paused();
7139 
7140  /* begin initialization */
7141  KA_TRACE(10, ("__kmp_parallel_initialize: enter\n"));
7142  KMP_ASSERT(KMP_UBER_GTID(gtid));
7143 
7144 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
7145  // Save the FP control regs.
7146  // Worker threads will set theirs to these values at thread startup.
7147  __kmp_store_x87_fpu_control_word(&__kmp_init_x87_fpu_control_word);
7148  __kmp_store_mxcsr(&__kmp_init_mxcsr);
7149  __kmp_init_mxcsr &= KMP_X86_MXCSR_MASK;
7150 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
7151 
7152 #if KMP_OS_UNIX
7153 #if KMP_HANDLE_SIGNALS
7154  /* must be after __kmp_serial_initialize */
7155  __kmp_install_signals(TRUE);
7156 #endif
7157 #endif
7158 
7159  __kmp_suspend_initialize();
7160 
7161 #if defined(USE_LOAD_BALANCE)
7162  if (__kmp_global.g.g_dynamic_mode == dynamic_default) {
7163  __kmp_global.g.g_dynamic_mode = dynamic_load_balance;
7164  }
7165 #else
7166  if (__kmp_global.g.g_dynamic_mode == dynamic_default) {
7167  __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
7168  }
7169 #endif
7170 
7171  if (__kmp_version) {
7172  __kmp_print_version_2();
7173  }
7174 
7175  /* we have finished parallel initialization */
7176  TCW_SYNC_4(__kmp_init_parallel, TRUE);
7177 
7178  KMP_MB();
7179  KA_TRACE(10, ("__kmp_parallel_initialize: exit\n"));
7180 
7181  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7182 }
7183 
7184 void __kmp_hidden_helper_initialize() {
7185  if (TCR_4(__kmp_init_hidden_helper))
7186  return;
7187 
7188  // __kmp_parallel_initialize is required before we initialize hidden helper
7189  if (!TCR_4(__kmp_init_parallel))
7190  __kmp_parallel_initialize();
7191 
7192  // Double check. Note that this double check should not be placed before
7193  // __kmp_parallel_initialize as it will cause dead lock.
7194  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7195  if (TCR_4(__kmp_init_hidden_helper)) {
7196  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7197  return;
7198  }
7199 
7200  // Set the count of hidden helper tasks to be executed to zero
7201  KMP_ATOMIC_ST_REL(&__kmp_unexecuted_hidden_helper_tasks, 0);
7202 
7203  // Set the global variable indicating that we're initializing hidden helper
7204  // team/threads
7205  TCW_SYNC_4(__kmp_init_hidden_helper_threads, TRUE);
7206 
7207  // Platform independent initialization
7208  __kmp_do_initialize_hidden_helper_threads();
7209 
7210  // Wait here for the finish of initialization of hidden helper teams
7211  __kmp_hidden_helper_threads_initz_wait();
7212 
7213  // We have finished hidden helper initialization
7214  TCW_SYNC_4(__kmp_init_hidden_helper, TRUE);
7215 
7216  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7217 }
7218 
7219 /* ------------------------------------------------------------------------ */
7220 
7221 void __kmp_run_before_invoked_task(int gtid, int tid, kmp_info_t *this_thr,
7222  kmp_team_t *team) {
7223  kmp_disp_t *dispatch;
7224 
7225  KMP_MB();
7226 
7227  /* none of the threads have encountered any constructs, yet. */
7228  this_thr->th.th_local.this_construct = 0;
7229 #if KMP_CACHE_MANAGE
7230  KMP_CACHE_PREFETCH(&this_thr->th.th_bar[bs_forkjoin_barrier].bb.b_arrived);
7231 #endif /* KMP_CACHE_MANAGE */
7232  dispatch = (kmp_disp_t *)TCR_PTR(this_thr->th.th_dispatch);
7233  KMP_DEBUG_ASSERT(dispatch);
7234  KMP_DEBUG_ASSERT(team->t.t_dispatch);
7235  // KMP_DEBUG_ASSERT( this_thr->th.th_dispatch == &team->t.t_dispatch[
7236  // this_thr->th.th_info.ds.ds_tid ] );
7237 
7238  dispatch->th_disp_index = 0; /* reset the dispatch buffer counter */
7239  dispatch->th_doacross_buf_idx = 0; // reset doacross dispatch buffer counter
7240  if (__kmp_env_consistency_check)
7241  __kmp_push_parallel(gtid, team->t.t_ident);
7242 
7243  KMP_MB(); /* Flush all pending memory write invalidates. */
7244 }
7245 
7246 void __kmp_run_after_invoked_task(int gtid, int tid, kmp_info_t *this_thr,
7247  kmp_team_t *team) {
7248  if (__kmp_env_consistency_check)
7249  __kmp_pop_parallel(gtid, team->t.t_ident);
7250 
7251  __kmp_finish_implicit_task(this_thr);
7252 }
7253 
7254 int __kmp_invoke_task_func(int gtid) {
7255  int rc;
7256  int tid = __kmp_tid_from_gtid(gtid);
7257  kmp_info_t *this_thr = __kmp_threads[gtid];
7258  kmp_team_t *team = this_thr->th.th_team;
7259 
7260  __kmp_run_before_invoked_task(gtid, tid, this_thr, team);
7261 #if USE_ITT_BUILD
7262  if (__itt_stack_caller_create_ptr) {
7263  __kmp_itt_stack_callee_enter(
7264  (__itt_caller)
7265  team->t.t_stack_id); // inform ittnotify about entering user's code
7266  }
7267 #endif /* USE_ITT_BUILD */
7268 #if INCLUDE_SSC_MARKS
7269  SSC_MARK_INVOKING();
7270 #endif
7271 
7272 #if OMPT_SUPPORT
7273  void *dummy;
7274  void **exit_frame_p;
7275  ompt_data_t *my_task_data;
7276  ompt_data_t *my_parallel_data;
7277  int ompt_team_size;
7278 
7279  if (ompt_enabled.enabled) {
7280  exit_frame_p = &(
7281  team->t.t_implicit_task_taskdata[tid].ompt_task_info.frame.exit_frame.ptr);
7282  } else {
7283  exit_frame_p = &dummy;
7284  }
7285 
7286  my_task_data =
7287  &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data);
7288  my_parallel_data = &(team->t.ompt_team_info.parallel_data);
7289  if (ompt_enabled.ompt_callback_implicit_task) {
7290  ompt_team_size = team->t.t_nproc;
7291  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
7292  ompt_scope_begin, my_parallel_data, my_task_data, ompt_team_size,
7293  __kmp_tid_from_gtid(gtid), ompt_task_implicit);
7294  OMPT_CUR_TASK_INFO(this_thr)->thread_num = __kmp_tid_from_gtid(gtid);
7295  }
7296 #endif
7297 
7298 #if KMP_STATS_ENABLED
7299  stats_state_e previous_state = KMP_GET_THREAD_STATE();
7300  if (previous_state == stats_state_e::TEAMS_REGION) {
7301  KMP_PUSH_PARTITIONED_TIMER(OMP_teams);
7302  } else {
7303  KMP_PUSH_PARTITIONED_TIMER(OMP_parallel);
7304  }
7305  KMP_SET_THREAD_STATE(IMPLICIT_TASK);
7306 #endif
7307 
7308  rc = __kmp_invoke_microtask((microtask_t)TCR_SYNC_PTR(team->t.t_pkfn), gtid,
7309  tid, (int)team->t.t_argc, (void **)team->t.t_argv
7310 #if OMPT_SUPPORT
7311  ,
7312  exit_frame_p
7313 #endif
7314  );
7315 #if OMPT_SUPPORT
7316  *exit_frame_p = NULL;
7317  this_thr->th.ompt_thread_info.parallel_flags |= ompt_parallel_team;
7318 #endif
7319 
7320 #if KMP_STATS_ENABLED
7321  if (previous_state == stats_state_e::TEAMS_REGION) {
7322  KMP_SET_THREAD_STATE(previous_state);
7323  }
7324  KMP_POP_PARTITIONED_TIMER();
7325 #endif
7326 
7327 #if USE_ITT_BUILD
7328  if (__itt_stack_caller_create_ptr) {
7329  __kmp_itt_stack_callee_leave(
7330  (__itt_caller)
7331  team->t.t_stack_id); // inform ittnotify about leaving user's code
7332  }
7333 #endif /* USE_ITT_BUILD */
7334  __kmp_run_after_invoked_task(gtid, tid, this_thr, team);
7335 
7336  return rc;
7337 }
7338 
7339 void __kmp_teams_master(int gtid) {
7340  // This routine is called by all master threads in teams construct
7341  kmp_info_t *thr = __kmp_threads[gtid];
7342  kmp_team_t *team = thr->th.th_team;
7343  ident_t *loc = team->t.t_ident;
7344  thr->th.th_set_nproc = thr->th.th_teams_size.nth;
7345  KMP_DEBUG_ASSERT(thr->th.th_teams_microtask);
7346  KMP_DEBUG_ASSERT(thr->th.th_set_nproc);
7347  KA_TRACE(20, ("__kmp_teams_master: T#%d, Tid %d, microtask %p\n", gtid,
7348  __kmp_tid_from_gtid(gtid), thr->th.th_teams_microtask));
7349 
7350  // This thread is a new CG root. Set up the proper variables.
7351  kmp_cg_root_t *tmp = (kmp_cg_root_t *)__kmp_allocate(sizeof(kmp_cg_root_t));
7352  tmp->cg_root = thr; // Make thr the CG root
7353  // Init to thread limit that was stored when league masters were forked
7354  tmp->cg_thread_limit = thr->th.th_current_task->td_icvs.thread_limit;
7355  tmp->cg_nthreads = 1; // Init counter to one active thread, this one
7356  KA_TRACE(100, ("__kmp_teams_master: Thread %p created node %p and init"
7357  " cg_nthreads to 1\n",
7358  thr, tmp));
7359  tmp->up = thr->th.th_cg_roots;
7360  thr->th.th_cg_roots = tmp;
7361 
7362 // Launch league of teams now, but not let workers execute
7363 // (they hang on fork barrier until next parallel)
7364 #if INCLUDE_SSC_MARKS
7365  SSC_MARK_FORKING();
7366 #endif
7367  __kmp_fork_call(loc, gtid, fork_context_intel, team->t.t_argc,
7368  (microtask_t)thr->th.th_teams_microtask, // "wrapped" task
7369  VOLATILE_CAST(launch_t) __kmp_invoke_task_func, NULL);
7370 #if INCLUDE_SSC_MARKS
7371  SSC_MARK_JOINING();
7372 #endif
7373  // If the team size was reduced from the limit, set it to the new size
7374  if (thr->th.th_team_nproc < thr->th.th_teams_size.nth)
7375  thr->th.th_teams_size.nth = thr->th.th_team_nproc;
7376  // AC: last parameter "1" eliminates join barrier which won't work because
7377  // worker threads are in a fork barrier waiting for more parallel regions
7378  __kmp_join_call(loc, gtid
7379 #if OMPT_SUPPORT
7380  ,
7381  fork_context_intel
7382 #endif
7383  ,
7384  1);
7385 }
7386 
7387 int __kmp_invoke_teams_master(int gtid) {
7388  kmp_info_t *this_thr = __kmp_threads[gtid];
7389  kmp_team_t *team = this_thr->th.th_team;
7390 #if KMP_DEBUG
7391  if (!__kmp_threads[gtid]->th.th_team->t.t_serialized)
7392  KMP_DEBUG_ASSERT((void *)__kmp_threads[gtid]->th.th_team->t.t_pkfn ==
7393  (void *)__kmp_teams_master);
7394 #endif
7395  __kmp_run_before_invoked_task(gtid, 0, this_thr, team);
7396 #if OMPT_SUPPORT
7397  int tid = __kmp_tid_from_gtid(gtid);
7398  ompt_data_t *task_data =
7399  &team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data;
7400  ompt_data_t *parallel_data = &team->t.ompt_team_info.parallel_data;
7401  if (ompt_enabled.ompt_callback_implicit_task) {
7402  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
7403  ompt_scope_begin, parallel_data, task_data, team->t.t_nproc, tid,
7404  ompt_task_initial);
7405  OMPT_CUR_TASK_INFO(this_thr)->thread_num = tid;
7406  }
7407 #endif
7408  __kmp_teams_master(gtid);
7409 #if OMPT_SUPPORT
7410  this_thr->th.ompt_thread_info.parallel_flags |= ompt_parallel_league;
7411 #endif
7412  __kmp_run_after_invoked_task(gtid, 0, this_thr, team);
7413  return 1;
7414 }
7415 
7416 /* this sets the requested number of threads for the next parallel region
7417  encountered by this team. since this should be enclosed in the forkjoin
7418  critical section it should avoid race conditions with asymmetrical nested
7419  parallelism */
7420 
7421 void __kmp_push_num_threads(ident_t *id, int gtid, int num_threads) {
7422  kmp_info_t *thr = __kmp_threads[gtid];
7423 
7424  if (num_threads > 0)
7425  thr->th.th_set_nproc = num_threads;
7426 }
7427 
7428 /* this sets the requested number of teams for the teams region and/or
7429  the number of threads for the next parallel region encountered */
7430 void __kmp_push_num_teams(ident_t *id, int gtid, int num_teams,
7431  int num_threads) {
7432  kmp_info_t *thr = __kmp_threads[gtid];
7433  KMP_DEBUG_ASSERT(num_teams >= 0);
7434  KMP_DEBUG_ASSERT(num_threads >= 0);
7435 
7436  if (num_teams == 0)
7437  num_teams = 1; // default number of teams is 1.
7438  if (num_teams > __kmp_teams_max_nth) { // if too many teams requested?
7439  if (!__kmp_reserve_warn) {
7440  __kmp_reserve_warn = 1;
7441  __kmp_msg(kmp_ms_warning,
7442  KMP_MSG(CantFormThrTeam, num_teams, __kmp_teams_max_nth),
7443  KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7444  }
7445  num_teams = __kmp_teams_max_nth;
7446  }
7447  // Set number of teams (number of threads in the outer "parallel" of the
7448  // teams)
7449  thr->th.th_set_nproc = thr->th.th_teams_size.nteams = num_teams;
7450 
7451  // Remember the number of threads for inner parallel regions
7452  if (!TCR_4(__kmp_init_middle))
7453  __kmp_middle_initialize(); // get internal globals calculated
7454  KMP_DEBUG_ASSERT(__kmp_avail_proc);
7455  KMP_DEBUG_ASSERT(__kmp_dflt_team_nth);
7456  if (num_threads == 0) {
7457  num_threads = __kmp_avail_proc / num_teams;
7458  // adjust num_threads w/o warning as it is not user setting
7459  // num_threads = min(num_threads, nthreads-var, thread-limit-var)
7460  // no thread_limit clause specified - do not change thread-limit-var ICV
7461  if (num_threads > __kmp_dflt_team_nth) {
7462  num_threads = __kmp_dflt_team_nth; // honor nthreads-var ICV
7463  }
7464  if (num_threads > thr->th.th_current_task->td_icvs.thread_limit) {
7465  num_threads = thr->th.th_current_task->td_icvs.thread_limit;
7466  } // prevent team size to exceed thread-limit-var
7467  if (num_teams * num_threads > __kmp_teams_max_nth) {
7468  num_threads = __kmp_teams_max_nth / num_teams;
7469  }
7470  } else {
7471  // This thread will be the master of the league masters
7472  // Store new thread limit; old limit is saved in th_cg_roots list
7473  thr->th.th_current_task->td_icvs.thread_limit = num_threads;
7474  // num_threads = min(num_threads, nthreads-var)
7475  if (num_threads > __kmp_dflt_team_nth) {
7476  num_threads = __kmp_dflt_team_nth; // honor nthreads-var ICV
7477  }
7478  if (num_teams * num_threads > __kmp_teams_max_nth) {
7479  int new_threads = __kmp_teams_max_nth / num_teams;
7480  if (!__kmp_reserve_warn) { // user asked for too many threads
7481  __kmp_reserve_warn = 1; // conflicts with KMP_TEAMS_THREAD_LIMIT
7482  __kmp_msg(kmp_ms_warning,
7483  KMP_MSG(CantFormThrTeam, num_threads, new_threads),
7484  KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7485  }
7486  num_threads = new_threads;
7487  }
7488  }
7489  thr->th.th_teams_size.nth = num_threads;
7490 }
7491 
7492 // Set the proc_bind var to use in the following parallel region.
7493 void __kmp_push_proc_bind(ident_t *id, int gtid, kmp_proc_bind_t proc_bind) {
7494  kmp_info_t *thr = __kmp_threads[gtid];
7495  thr->th.th_set_proc_bind = proc_bind;
7496 }
7497 
7498 /* Launch the worker threads into the microtask. */
7499 
7500 void __kmp_internal_fork(ident_t *id, int gtid, kmp_team_t *team) {
7501  kmp_info_t *this_thr = __kmp_threads[gtid];
7502 
7503 #ifdef KMP_DEBUG
7504  int f;
7505 #endif /* KMP_DEBUG */
7506 
7507  KMP_DEBUG_ASSERT(team);
7508  KMP_DEBUG_ASSERT(this_thr->th.th_team == team);
7509  KMP_ASSERT(KMP_MASTER_GTID(gtid));
7510  KMP_MB(); /* Flush all pending memory write invalidates. */
7511 
7512  team->t.t_construct = 0; /* no single directives seen yet */
7513  team->t.t_ordered.dt.t_value =
7514  0; /* thread 0 enters the ordered section first */
7515 
7516  /* Reset the identifiers on the dispatch buffer */
7517  KMP_DEBUG_ASSERT(team->t.t_disp_buffer);
7518  if (team->t.t_max_nproc > 1) {
7519  int i;
7520  for (i = 0; i < __kmp_dispatch_num_buffers; ++i) {
7521  team->t.t_disp_buffer[i].buffer_index = i;
7522  team->t.t_disp_buffer[i].doacross_buf_idx = i;
7523  }
7524  } else {
7525  team->t.t_disp_buffer[0].buffer_index = 0;
7526  team->t.t_disp_buffer[0].doacross_buf_idx = 0;
7527  }
7528 
7529  KMP_MB(); /* Flush all pending memory write invalidates. */
7530  KMP_ASSERT(this_thr->th.th_team == team);
7531 
7532 #ifdef KMP_DEBUG
7533  for (f = 0; f < team->t.t_nproc; f++) {
7534  KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
7535  team->t.t_threads[f]->th.th_team_nproc == team->t.t_nproc);
7536  }
7537 #endif /* KMP_DEBUG */
7538 
7539  /* release the worker threads so they may begin working */
7540  __kmp_fork_barrier(gtid, 0);
7541 }
7542 
7543 void __kmp_internal_join(ident_t *id, int gtid, kmp_team_t *team) {
7544  kmp_info_t *this_thr = __kmp_threads[gtid];
7545 
7546  KMP_DEBUG_ASSERT(team);
7547  KMP_DEBUG_ASSERT(this_thr->th.th_team == team);
7548  KMP_ASSERT(KMP_MASTER_GTID(gtid));
7549  KMP_MB(); /* Flush all pending memory write invalidates. */
7550 
7551 /* Join barrier after fork */
7552 
7553 #ifdef KMP_DEBUG
7554  if (__kmp_threads[gtid] &&
7555  __kmp_threads[gtid]->th.th_team_nproc != team->t.t_nproc) {
7556  __kmp_printf("GTID: %d, __kmp_threads[%d]=%p\n", gtid, gtid,
7557  __kmp_threads[gtid]);
7558  __kmp_printf("__kmp_threads[%d]->th.th_team_nproc=%d, TEAM: %p, "
7559  "team->t.t_nproc=%d\n",
7560  gtid, __kmp_threads[gtid]->th.th_team_nproc, team,
7561  team->t.t_nproc);
7562  __kmp_print_structure();
7563  }
7564  KMP_DEBUG_ASSERT(__kmp_threads[gtid] &&
7565  __kmp_threads[gtid]->th.th_team_nproc == team->t.t_nproc);
7566 #endif /* KMP_DEBUG */
7567 
7568  __kmp_join_barrier(gtid); /* wait for everyone */
7569 #if OMPT_SUPPORT
7570  if (ompt_enabled.enabled &&
7571  this_thr->th.ompt_thread_info.state == ompt_state_wait_barrier_implicit) {
7572  int ds_tid = this_thr->th.th_info.ds.ds_tid;
7573  ompt_data_t *task_data = OMPT_CUR_TASK_DATA(this_thr);
7574  this_thr->th.ompt_thread_info.state = ompt_state_overhead;
7575 #if OMPT_OPTIONAL
7576  void *codeptr = NULL;
7577  if (KMP_MASTER_TID(ds_tid) &&
7578  (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) ||
7579  ompt_callbacks.ompt_callback(ompt_callback_sync_region)))
7580  codeptr = OMPT_CUR_TEAM_INFO(this_thr)->master_return_address;
7581 
7582  if (ompt_enabled.ompt_callback_sync_region_wait) {
7583  ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
7584  ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data,
7585  codeptr);
7586  }
7587  if (ompt_enabled.ompt_callback_sync_region) {
7588  ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
7589  ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data,
7590  codeptr);
7591  }
7592 #endif
7593  if (!KMP_MASTER_TID(ds_tid) && ompt_enabled.ompt_callback_implicit_task) {
7594  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
7595  ompt_scope_end, NULL, task_data, 0, ds_tid, ompt_task_implicit); // TODO: Can this be ompt_task_initial?
7596  }
7597  }
7598 #endif
7599 
7600  KMP_MB(); /* Flush all pending memory write invalidates. */
7601  KMP_ASSERT(this_thr->th.th_team == team);
7602 }
7603 
7604 /* ------------------------------------------------------------------------ */
7605 
7606 #ifdef USE_LOAD_BALANCE
7607 
7608 // Return the worker threads actively spinning in the hot team, if we
7609 // are at the outermost level of parallelism. Otherwise, return 0.
7610 static int __kmp_active_hot_team_nproc(kmp_root_t *root) {
7611  int i;
7612  int retval;
7613  kmp_team_t *hot_team;
7614 
7615  if (root->r.r_active) {
7616  return 0;
7617  }
7618  hot_team = root->r.r_hot_team;
7619  if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME) {
7620  return hot_team->t.t_nproc - 1; // Don't count master thread
7621  }
7622 
7623  // Skip the master thread - it is accounted for elsewhere.
7624  retval = 0;
7625  for (i = 1; i < hot_team->t.t_nproc; i++) {
7626  if (hot_team->t.t_threads[i]->th.th_active) {
7627  retval++;
7628  }
7629  }
7630  return retval;
7631 }
7632 
7633 // Perform an automatic adjustment to the number of
7634 // threads used by the next parallel region.
7635 static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc) {
7636  int retval;
7637  int pool_active;
7638  int hot_team_active;
7639  int team_curr_active;
7640  int system_active;
7641 
7642  KB_TRACE(20, ("__kmp_load_balance_nproc: called root:%p set_nproc:%d\n", root,
7643  set_nproc));
7644  KMP_DEBUG_ASSERT(root);
7645  KMP_DEBUG_ASSERT(root->r.r_root_team->t.t_threads[0]
7646  ->th.th_current_task->td_icvs.dynamic == TRUE);
7647  KMP_DEBUG_ASSERT(set_nproc > 1);
7648 
7649  if (set_nproc == 1) {
7650  KB_TRACE(20, ("__kmp_load_balance_nproc: serial execution.\n"));
7651  return 1;
7652  }
7653 
7654  // Threads that are active in the thread pool, active in the hot team for this
7655  // particular root (if we are at the outer par level), and the currently
7656  // executing thread (to become the master) are available to add to the new
7657  // team, but are currently contributing to the system load, and must be
7658  // accounted for.
7659  pool_active = __kmp_thread_pool_active_nth;
7660  hot_team_active = __kmp_active_hot_team_nproc(root);
7661  team_curr_active = pool_active + hot_team_active + 1;
7662 
7663  // Check the system load.
7664  system_active = __kmp_get_load_balance(__kmp_avail_proc + team_curr_active);
7665  KB_TRACE(30, ("__kmp_load_balance_nproc: system active = %d pool active = %d "
7666  "hot team active = %d\n",
7667  system_active, pool_active, hot_team_active));
7668 
7669  if (system_active < 0) {
7670  // There was an error reading the necessary info from /proc, so use the
7671  // thread limit algorithm instead. Once we set __kmp_global.g.g_dynamic_mode
7672  // = dynamic_thread_limit, we shouldn't wind up getting back here.
7673  __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
7674  KMP_WARNING(CantLoadBalUsing, "KMP_DYNAMIC_MODE=thread limit");
7675 
7676  // Make this call behave like the thread limit algorithm.
7677  retval = __kmp_avail_proc - __kmp_nth +
7678  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
7679  if (retval > set_nproc) {
7680  retval = set_nproc;
7681  }
7682  if (retval < KMP_MIN_NTH) {
7683  retval = KMP_MIN_NTH;
7684  }
7685 
7686  KB_TRACE(20, ("__kmp_load_balance_nproc: thread limit exit. retval:%d\n",
7687  retval));
7688  return retval;
7689  }
7690 
7691  // There is a slight delay in the load balance algorithm in detecting new
7692  // running procs. The real system load at this instant should be at least as
7693  // large as the #active omp thread that are available to add to the team.
7694  if (system_active < team_curr_active) {
7695  system_active = team_curr_active;
7696  }
7697  retval = __kmp_avail_proc - system_active + team_curr_active;
7698  if (retval > set_nproc) {
7699  retval = set_nproc;
7700  }
7701  if (retval < KMP_MIN_NTH) {
7702  retval = KMP_MIN_NTH;
7703  }
7704 
7705  KB_TRACE(20, ("__kmp_load_balance_nproc: exit. retval:%d\n", retval));
7706  return retval;
7707 } // __kmp_load_balance_nproc()
7708 
7709 #endif /* USE_LOAD_BALANCE */
7710 
7711 /* ------------------------------------------------------------------------ */
7712 
7713 /* NOTE: this is called with the __kmp_init_lock held */
7714 void __kmp_cleanup(void) {
7715  int f;
7716 
7717  KA_TRACE(10, ("__kmp_cleanup: enter\n"));
7718 
7719  if (TCR_4(__kmp_init_parallel)) {
7720 #if KMP_HANDLE_SIGNALS
7721  __kmp_remove_signals();
7722 #endif
7723  TCW_4(__kmp_init_parallel, FALSE);
7724  }
7725 
7726  if (TCR_4(__kmp_init_middle)) {
7727 #if KMP_AFFINITY_SUPPORTED
7728  __kmp_affinity_uninitialize();
7729 #endif /* KMP_AFFINITY_SUPPORTED */
7730  __kmp_cleanup_hierarchy();
7731  TCW_4(__kmp_init_middle, FALSE);
7732  }
7733 
7734  KA_TRACE(10, ("__kmp_cleanup: go serial cleanup\n"));
7735 
7736  if (__kmp_init_serial) {
7737  __kmp_runtime_destroy();
7738  __kmp_init_serial = FALSE;
7739  }
7740 
7741  __kmp_cleanup_threadprivate_caches();
7742 
7743  for (f = 0; f < __kmp_threads_capacity; f++) {
7744  if (__kmp_root[f] != NULL) {
7745  __kmp_free(__kmp_root[f]);
7746  __kmp_root[f] = NULL;
7747  }
7748  }
7749  __kmp_free(__kmp_threads);
7750  // __kmp_threads and __kmp_root were allocated at once, as single block, so
7751  // there is no need in freeing __kmp_root.
7752  __kmp_threads = NULL;
7753  __kmp_root = NULL;
7754  __kmp_threads_capacity = 0;
7755 
7756 #if KMP_USE_DYNAMIC_LOCK
7757  __kmp_cleanup_indirect_user_locks();
7758 #else
7759  __kmp_cleanup_user_locks();
7760 #endif
7761 
7762 #if KMP_AFFINITY_SUPPORTED
7763  KMP_INTERNAL_FREE(CCAST(char *, __kmp_cpuinfo_file));
7764  __kmp_cpuinfo_file = NULL;
7765 #endif /* KMP_AFFINITY_SUPPORTED */
7766 
7767 #if KMP_USE_ADAPTIVE_LOCKS
7768 #if KMP_DEBUG_ADAPTIVE_LOCKS
7769  __kmp_print_speculative_stats();
7770 #endif
7771 #endif
7772  KMP_INTERNAL_FREE(__kmp_nested_nth.nth);
7773  __kmp_nested_nth.nth = NULL;
7774  __kmp_nested_nth.size = 0;
7775  __kmp_nested_nth.used = 0;
7776  KMP_INTERNAL_FREE(__kmp_nested_proc_bind.bind_types);
7777  __kmp_nested_proc_bind.bind_types = NULL;
7778  __kmp_nested_proc_bind.size = 0;
7779  __kmp_nested_proc_bind.used = 0;
7780  if (__kmp_affinity_format) {
7781  KMP_INTERNAL_FREE(__kmp_affinity_format);
7782  __kmp_affinity_format = NULL;
7783  }
7784 
7785  __kmp_i18n_catclose();
7786 
7787 #if KMP_USE_HIER_SCHED
7788  __kmp_hier_scheds.deallocate();
7789 #endif
7790 
7791 #if KMP_STATS_ENABLED
7792  __kmp_stats_fini();
7793 #endif
7794 
7795  KA_TRACE(10, ("__kmp_cleanup: exit\n"));
7796 }
7797 
7798 /* ------------------------------------------------------------------------ */
7799 
7800 int __kmp_ignore_mppbeg(void) {
7801  char *env;
7802 
7803  if ((env = getenv("KMP_IGNORE_MPPBEG")) != NULL) {
7804  if (__kmp_str_match_false(env))
7805  return FALSE;
7806  }
7807  // By default __kmpc_begin() is no-op.
7808  return TRUE;
7809 }
7810 
7811 int __kmp_ignore_mppend(void) {
7812  char *env;
7813 
7814  if ((env = getenv("KMP_IGNORE_MPPEND")) != NULL) {
7815  if (__kmp_str_match_false(env))
7816  return FALSE;
7817  }
7818  // By default __kmpc_end() is no-op.
7819  return TRUE;
7820 }
7821 
7822 void __kmp_internal_begin(void) {
7823  int gtid;
7824  kmp_root_t *root;
7825 
7826  /* this is a very important step as it will register new sibling threads
7827  and assign these new uber threads a new gtid */
7828  gtid = __kmp_entry_gtid();
7829  root = __kmp_threads[gtid]->th.th_root;
7830  KMP_ASSERT(KMP_UBER_GTID(gtid));
7831 
7832  if (root->r.r_begin)
7833  return;
7834  __kmp_acquire_lock(&root->r.r_begin_lock, gtid);
7835  if (root->r.r_begin) {
7836  __kmp_release_lock(&root->r.r_begin_lock, gtid);
7837  return;
7838  }
7839 
7840  root->r.r_begin = TRUE;
7841 
7842  __kmp_release_lock(&root->r.r_begin_lock, gtid);
7843 }
7844 
7845 /* ------------------------------------------------------------------------ */
7846 
7847 void __kmp_user_set_library(enum library_type arg) {
7848  int gtid;
7849  kmp_root_t *root;
7850  kmp_info_t *thread;
7851 
7852  /* first, make sure we are initialized so we can get our gtid */
7853 
7854  gtid = __kmp_entry_gtid();
7855  thread = __kmp_threads[gtid];
7856 
7857  root = thread->th.th_root;
7858 
7859  KA_TRACE(20, ("__kmp_user_set_library: enter T#%d, arg: %d, %d\n", gtid, arg,
7860  library_serial));
7861  if (root->r.r_in_parallel) { /* Must be called in serial section of top-level
7862  thread */
7863  KMP_WARNING(SetLibraryIncorrectCall);
7864  return;
7865  }
7866 
7867  switch (arg) {
7868  case library_serial:
7869  thread->th.th_set_nproc = 0;
7870  set__nproc(thread, 1);
7871  break;
7872  case library_turnaround:
7873  thread->th.th_set_nproc = 0;
7874  set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth
7875  : __kmp_dflt_team_nth_ub);
7876  break;
7877  case library_throughput:
7878  thread->th.th_set_nproc = 0;
7879  set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth
7880  : __kmp_dflt_team_nth_ub);
7881  break;
7882  default:
7883  KMP_FATAL(UnknownLibraryType, arg);
7884  }
7885 
7886  __kmp_aux_set_library(arg);
7887 }
7888 
7889 void __kmp_aux_set_stacksize(size_t arg) {
7890  if (!__kmp_init_serial)
7891  __kmp_serial_initialize();
7892 
7893 #if KMP_OS_DARWIN
7894  if (arg & (0x1000 - 1)) {
7895  arg &= ~(0x1000 - 1);
7896  if (arg + 0x1000) /* check for overflow if we round up */
7897  arg += 0x1000;
7898  }
7899 #endif
7900  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7901 
7902  /* only change the default stacksize before the first parallel region */
7903  if (!TCR_4(__kmp_init_parallel)) {
7904  size_t value = arg; /* argument is in bytes */
7905 
7906  if (value < __kmp_sys_min_stksize)
7907  value = __kmp_sys_min_stksize;
7908  else if (value > KMP_MAX_STKSIZE)
7909  value = KMP_MAX_STKSIZE;
7910 
7911  __kmp_stksize = value;
7912 
7913  __kmp_env_stksize = TRUE; /* was KMP_STACKSIZE specified? */
7914  }
7915 
7916  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7917 }
7918 
7919 /* set the behaviour of the runtime library */
7920 /* TODO this can cause some odd behaviour with sibling parallelism... */
7921 void __kmp_aux_set_library(enum library_type arg) {
7922  __kmp_library = arg;
7923 
7924  switch (__kmp_library) {
7925  case library_serial: {
7926  KMP_INFORM(LibraryIsSerial);
7927  } break;
7928  case library_turnaround:
7929  if (__kmp_use_yield == 1 && !__kmp_use_yield_exp_set)
7930  __kmp_use_yield = 2; // only yield when oversubscribed
7931  break;
7932  case library_throughput:
7933  if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME)
7934  __kmp_dflt_blocktime = 200;
7935  break;
7936  default:
7937  KMP_FATAL(UnknownLibraryType, arg);
7938  }
7939 }
7940 
7941 /* Getting team information common for all team API */
7942 // Returns NULL if not in teams construct
7943 static kmp_team_t *__kmp_aux_get_team_info(int &teams_serialized) {
7944  kmp_info_t *thr = __kmp_entry_thread();
7945  teams_serialized = 0;
7946  if (thr->th.th_teams_microtask) {
7947  kmp_team_t *team = thr->th.th_team;
7948  int tlevel = thr->th.th_teams_level; // the level of the teams construct
7949  int ii = team->t.t_level;
7950  teams_serialized = team->t.t_serialized;
7951  int level = tlevel + 1;
7952  KMP_DEBUG_ASSERT(ii >= tlevel);
7953  while (ii > level) {
7954  for (teams_serialized = team->t.t_serialized;
7955  (teams_serialized > 0) && (ii > level); teams_serialized--, ii--) {
7956  }
7957  if (team->t.t_serialized && (!teams_serialized)) {
7958  team = team->t.t_parent;
7959  continue;
7960  }
7961  if (ii > level) {
7962  team = team->t.t_parent;
7963  ii--;
7964  }
7965  }
7966  return team;
7967  }
7968  return NULL;
7969 }
7970 
7971 int __kmp_aux_get_team_num() {
7972  int serialized;
7973  kmp_team_t *team = __kmp_aux_get_team_info(serialized);
7974  if (team) {
7975  if (serialized > 1) {
7976  return 0; // teams region is serialized ( 1 team of 1 thread ).
7977  } else {
7978  return team->t.t_master_tid;
7979  }
7980  }
7981  return 0;
7982 }
7983 
7984 int __kmp_aux_get_num_teams() {
7985  int serialized;
7986  kmp_team_t *team = __kmp_aux_get_team_info(serialized);
7987  if (team) {
7988  if (serialized > 1) {
7989  return 1;
7990  } else {
7991  return team->t.t_parent->t.t_nproc;
7992  }
7993  }
7994  return 1;
7995 }
7996 
7997 /* ------------------------------------------------------------------------ */
7998 
7999 /*
8000  * Affinity Format Parser
8001  *
8002  * Field is in form of: %[[[0].]size]type
8003  * % and type are required (%% means print a literal '%')
8004  * type is either single char or long name surrounded by {},
8005  * e.g., N or {num_threads}
8006  * 0 => leading zeros
8007  * . => right justified when size is specified
8008  * by default output is left justified
8009  * size is the *minimum* field length
8010  * All other characters are printed as is
8011  *
8012  * Available field types:
8013  * L {thread_level} - omp_get_level()
8014  * n {thread_num} - omp_get_thread_num()
8015  * h {host} - name of host machine
8016  * P {process_id} - process id (integer)
8017  * T {thread_identifier} - native thread identifier (integer)
8018  * N {num_threads} - omp_get_num_threads()
8019  * A {ancestor_tnum} - omp_get_ancestor_thread_num(omp_get_level()-1)
8020  * a {thread_affinity} - comma separated list of integers or integer ranges
8021  * (values of affinity mask)
8022  *
8023  * Implementation-specific field types can be added
8024  * If a type is unknown, print "undefined"
8025 */
8026 
8027 // Structure holding the short name, long name, and corresponding data type
8028 // for snprintf. A table of these will represent the entire valid keyword
8029 // field types.
8030 typedef struct kmp_affinity_format_field_t {
8031  char short_name; // from spec e.g., L -> thread level
8032  const char *long_name; // from spec thread_level -> thread level
8033  char field_format; // data type for snprintf (typically 'd' or 's'
8034  // for integer or string)
8035 } kmp_affinity_format_field_t;
8036 
8037 static const kmp_affinity_format_field_t __kmp_affinity_format_table[] = {
8038 #if KMP_AFFINITY_SUPPORTED
8039  {'A', "thread_affinity", 's'},
8040 #endif
8041  {'t', "team_num", 'd'},
8042  {'T', "num_teams", 'd'},
8043  {'L', "nesting_level", 'd'},
8044  {'n', "thread_num", 'd'},
8045  {'N', "num_threads", 'd'},
8046  {'a', "ancestor_tnum", 'd'},
8047  {'H', "host", 's'},
8048  {'P', "process_id", 'd'},
8049  {'i', "native_thread_id", 'd'}};
8050 
8051 // Return the number of characters it takes to hold field
8052 static int __kmp_aux_capture_affinity_field(int gtid, const kmp_info_t *th,
8053  const char **ptr,
8054  kmp_str_buf_t *field_buffer) {
8055  int rc, format_index, field_value;
8056  const char *width_left, *width_right;
8057  bool pad_zeros, right_justify, parse_long_name, found_valid_name;
8058  static const int FORMAT_SIZE = 20;
8059  char format[FORMAT_SIZE] = {0};
8060  char absolute_short_name = 0;
8061 
8062  KMP_DEBUG_ASSERT(gtid >= 0);
8063  KMP_DEBUG_ASSERT(th);
8064  KMP_DEBUG_ASSERT(**ptr == '%');
8065  KMP_DEBUG_ASSERT(field_buffer);
8066 
8067  __kmp_str_buf_clear(field_buffer);
8068 
8069  // Skip the initial %
8070  (*ptr)++;
8071 
8072  // Check for %% first
8073  if (**ptr == '%') {
8074  __kmp_str_buf_cat(field_buffer, "%", 1);
8075  (*ptr)++; // skip over the second %
8076  return 1;
8077  }
8078 
8079  // Parse field modifiers if they are present
8080  pad_zeros = false;
8081  if (**ptr == '0') {
8082  pad_zeros = true;
8083  (*ptr)++; // skip over 0
8084  }
8085  right_justify = false;
8086  if (**ptr == '.') {
8087  right_justify = true;
8088  (*ptr)++; // skip over .
8089  }
8090  // Parse width of field: [width_left, width_right)
8091  width_left = width_right = NULL;
8092  if (**ptr >= '0' && **ptr <= '9') {
8093  width_left = *ptr;
8094  SKIP_DIGITS(*ptr);
8095  width_right = *ptr;
8096  }
8097 
8098  // Create the format for KMP_SNPRINTF based on flags parsed above
8099  format_index = 0;
8100  format[format_index++] = '%';
8101  if (!right_justify)
8102  format[format_index++] = '-';
8103  if (pad_zeros)
8104  format[format_index++] = '0';
8105  if (width_left && width_right) {
8106  int i = 0;
8107  // Only allow 8 digit number widths.
8108  // This also prevents overflowing format variable
8109  while (i < 8 && width_left < width_right) {
8110  format[format_index++] = *width_left;
8111  width_left++;
8112  i++;
8113  }
8114  }
8115 
8116  // Parse a name (long or short)
8117  // Canonicalize the name into absolute_short_name
8118  found_valid_name = false;
8119  parse_long_name = (**ptr == '{');
8120  if (parse_long_name)
8121  (*ptr)++; // skip initial left brace
8122  for (size_t i = 0; i < sizeof(__kmp_affinity_format_table) /
8123  sizeof(__kmp_affinity_format_table[0]);
8124  ++i) {
8125  char short_name = __kmp_affinity_format_table[i].short_name;
8126  const char *long_name = __kmp_affinity_format_table[i].long_name;
8127  char field_format = __kmp_affinity_format_table[i].field_format;
8128  if (parse_long_name) {
8129  size_t length = KMP_STRLEN(long_name);
8130  if (strncmp(*ptr, long_name, length) == 0) {
8131  found_valid_name = true;
8132  (*ptr) += length; // skip the long name
8133  }
8134  } else if (**ptr == short_name) {
8135  found_valid_name = true;
8136  (*ptr)++; // skip the short name
8137  }
8138  if (found_valid_name) {
8139  format[format_index++] = field_format;
8140  format[format_index++] = '\0';
8141  absolute_short_name = short_name;
8142  break;
8143  }
8144  }
8145  if (parse_long_name) {
8146  if (**ptr != '}') {
8147  absolute_short_name = 0;
8148  } else {
8149  (*ptr)++; // skip over the right brace
8150  }
8151  }
8152 
8153  // Attempt to fill the buffer with the requested
8154  // value using snprintf within __kmp_str_buf_print()
8155  switch (absolute_short_name) {
8156  case 't':
8157  rc = __kmp_str_buf_print(field_buffer, format, __kmp_aux_get_team_num());
8158  break;
8159  case 'T':
8160  rc = __kmp_str_buf_print(field_buffer, format, __kmp_aux_get_num_teams());
8161  break;
8162  case 'L':
8163  rc = __kmp_str_buf_print(field_buffer, format, th->th.th_team->t.t_level);
8164  break;
8165  case 'n':
8166  rc = __kmp_str_buf_print(field_buffer, format, __kmp_tid_from_gtid(gtid));
8167  break;
8168  case 'H': {
8169  static const int BUFFER_SIZE = 256;
8170  char buf[BUFFER_SIZE];
8171  __kmp_expand_host_name(buf, BUFFER_SIZE);
8172  rc = __kmp_str_buf_print(field_buffer, format, buf);
8173  } break;
8174  case 'P':
8175  rc = __kmp_str_buf_print(field_buffer, format, getpid());
8176  break;
8177  case 'i':
8178  rc = __kmp_str_buf_print(field_buffer, format, __kmp_gettid());
8179  break;
8180  case 'N':
8181  rc = __kmp_str_buf_print(field_buffer, format, th->th.th_team->t.t_nproc);
8182  break;
8183  case 'a':
8184  field_value =
8185  __kmp_get_ancestor_thread_num(gtid, th->th.th_team->t.t_level - 1);
8186  rc = __kmp_str_buf_print(field_buffer, format, field_value);
8187  break;
8188 #if KMP_AFFINITY_SUPPORTED
8189  case 'A': {
8190  kmp_str_buf_t buf;
8191  __kmp_str_buf_init(&buf);
8192  __kmp_affinity_str_buf_mask(&buf, th->th.th_affin_mask);
8193  rc = __kmp_str_buf_print(field_buffer, format, buf.str);
8194  __kmp_str_buf_free(&buf);
8195  } break;
8196 #endif
8197  default:
8198  // According to spec, If an implementation does not have info for field
8199  // type, then "undefined" is printed
8200  rc = __kmp_str_buf_print(field_buffer, "%s", "undefined");
8201  // Skip the field
8202  if (parse_long_name) {
8203  SKIP_TOKEN(*ptr);
8204  if (**ptr == '}')
8205  (*ptr)++;
8206  } else {
8207  (*ptr)++;
8208  }
8209  }
8210 
8211  KMP_ASSERT(format_index <= FORMAT_SIZE);
8212  return rc;
8213 }
8214 
8215 /*
8216  * Return number of characters needed to hold the affinity string
8217  * (not including null byte character)
8218  * The resultant string is printed to buffer, which the caller can then
8219  * handle afterwards
8220 */
8221 size_t __kmp_aux_capture_affinity(int gtid, const char *format,
8222  kmp_str_buf_t *buffer) {
8223  const char *parse_ptr;
8224  size_t retval;
8225  const kmp_info_t *th;
8226  kmp_str_buf_t field;
8227 
8228  KMP_DEBUG_ASSERT(buffer);
8229  KMP_DEBUG_ASSERT(gtid >= 0);
8230 
8231  __kmp_str_buf_init(&field);
8232  __kmp_str_buf_clear(buffer);
8233 
8234  th = __kmp_threads[gtid];
8235  retval = 0;
8236 
8237  // If format is NULL or zero-length string, then we use
8238  // affinity-format-var ICV
8239  parse_ptr = format;
8240  if (parse_ptr == NULL || *parse_ptr == '\0') {
8241  parse_ptr = __kmp_affinity_format;
8242  }
8243  KMP_DEBUG_ASSERT(parse_ptr);
8244 
8245  while (*parse_ptr != '\0') {
8246  // Parse a field
8247  if (*parse_ptr == '%') {
8248  // Put field in the buffer
8249  int rc = __kmp_aux_capture_affinity_field(gtid, th, &parse_ptr, &field);
8250  __kmp_str_buf_catbuf(buffer, &field);
8251  retval += rc;
8252  } else {
8253  // Put literal character in buffer
8254  __kmp_str_buf_cat(buffer, parse_ptr, 1);
8255  retval++;
8256  parse_ptr++;
8257  }
8258  }
8259  __kmp_str_buf_free(&field);
8260  return retval;
8261 }
8262 
8263 // Displays the affinity string to stdout
8264 void __kmp_aux_display_affinity(int gtid, const char *format) {
8265  kmp_str_buf_t buf;
8266  __kmp_str_buf_init(&buf);
8267  __kmp_aux_capture_affinity(gtid, format, &buf);
8268  __kmp_fprintf(kmp_out, "%s" KMP_END_OF_LINE, buf.str);
8269  __kmp_str_buf_free(&buf);
8270 }
8271 
8272 /* ------------------------------------------------------------------------ */
8273 
8274 void __kmp_aux_set_blocktime(int arg, kmp_info_t *thread, int tid) {
8275  int blocktime = arg; /* argument is in milliseconds */
8276 #if KMP_USE_MONITOR
8277  int bt_intervals;
8278 #endif
8279  kmp_int8 bt_set;
8280 
8281  __kmp_save_internal_controls(thread);
8282 
8283  /* Normalize and set blocktime for the teams */
8284  if (blocktime < KMP_MIN_BLOCKTIME)
8285  blocktime = KMP_MIN_BLOCKTIME;
8286  else if (blocktime > KMP_MAX_BLOCKTIME)
8287  blocktime = KMP_MAX_BLOCKTIME;
8288 
8289  set__blocktime_team(thread->th.th_team, tid, blocktime);
8290  set__blocktime_team(thread->th.th_serial_team, 0, blocktime);
8291 
8292 #if KMP_USE_MONITOR
8293  /* Calculate and set blocktime intervals for the teams */
8294  bt_intervals = KMP_INTERVALS_FROM_BLOCKTIME(blocktime, __kmp_monitor_wakeups);
8295 
8296  set__bt_intervals_team(thread->th.th_team, tid, bt_intervals);
8297  set__bt_intervals_team(thread->th.th_serial_team, 0, bt_intervals);
8298 #endif
8299 
8300  /* Set whether blocktime has been set to "TRUE" */
8301  bt_set = TRUE;
8302 
8303  set__bt_set_team(thread->th.th_team, tid, bt_set);
8304  set__bt_set_team(thread->th.th_serial_team, 0, bt_set);
8305 #if KMP_USE_MONITOR
8306  KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d, "
8307  "bt_intervals=%d, monitor_updates=%d\n",
8308  __kmp_gtid_from_tid(tid, thread->th.th_team),
8309  thread->th.th_team->t.t_id, tid, blocktime, bt_intervals,
8310  __kmp_monitor_wakeups));
8311 #else
8312  KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d\n",
8313  __kmp_gtid_from_tid(tid, thread->th.th_team),
8314  thread->th.th_team->t.t_id, tid, blocktime));
8315 #endif
8316 }
8317 
8318 void __kmp_aux_set_defaults(char const *str, size_t len) {
8319  if (!__kmp_init_serial) {
8320  __kmp_serial_initialize();
8321  }
8322  __kmp_env_initialize(str);
8323 
8324  if (__kmp_settings || __kmp_display_env || __kmp_display_env_verbose) {
8325  __kmp_env_print();
8326  }
8327 } // __kmp_aux_set_defaults
8328 
8329 /* ------------------------------------------------------------------------ */
8330 /* internal fast reduction routines */
8331 
8332 PACKED_REDUCTION_METHOD_T
8333 __kmp_determine_reduction_method(
8334  ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars, size_t reduce_size,
8335  void *reduce_data, void (*reduce_func)(void *lhs_data, void *rhs_data),
8336  kmp_critical_name *lck) {
8337 
8338  // Default reduction method: critical construct ( lck != NULL, like in current
8339  // PAROPT )
8340  // If ( reduce_data!=NULL && reduce_func!=NULL ): the tree-reduction method
8341  // can be selected by RTL
8342  // If loc->flags contains KMP_IDENT_ATOMIC_REDUCE, the atomic reduce method
8343  // can be selected by RTL
8344  // Finally, it's up to OpenMP RTL to make a decision on which method to select
8345  // among generated by PAROPT.
8346 
8347  PACKED_REDUCTION_METHOD_T retval;
8348 
8349  int team_size;
8350 
8351  KMP_DEBUG_ASSERT(loc); // it would be nice to test ( loc != 0 )
8352  KMP_DEBUG_ASSERT(lck); // it would be nice to test ( lck != 0 )
8353 
8354 #define FAST_REDUCTION_ATOMIC_METHOD_GENERATED \
8355  ((loc->flags & (KMP_IDENT_ATOMIC_REDUCE)) == (KMP_IDENT_ATOMIC_REDUCE))
8356 #define FAST_REDUCTION_TREE_METHOD_GENERATED ((reduce_data) && (reduce_func))
8357 
8358  retval = critical_reduce_block;
8359 
8360  // another choice of getting a team size (with 1 dynamic deference) is slower
8361  team_size = __kmp_get_team_num_threads(global_tid);
8362  if (team_size == 1) {
8363 
8364  retval = empty_reduce_block;
8365 
8366  } else {
8367 
8368  int atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
8369 
8370 #if KMP_ARCH_X86_64 || KMP_ARCH_PPC64 || KMP_ARCH_AARCH64 || \
8371  KMP_ARCH_MIPS64 || KMP_ARCH_RISCV64
8372 
8373 #if KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD || \
8374  KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HURD
8375 
8376  int teamsize_cutoff = 4;
8377 
8378 #if KMP_MIC_SUPPORTED
8379  if (__kmp_mic_type != non_mic) {
8380  teamsize_cutoff = 8;
8381  }
8382 #endif
8383  int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
8384  if (tree_available) {
8385  if (team_size <= teamsize_cutoff) {
8386  if (atomic_available) {
8387  retval = atomic_reduce_block;
8388  }
8389  } else {
8390  retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
8391  }
8392  } else if (atomic_available) {
8393  retval = atomic_reduce_block;
8394  }
8395 #else
8396 #error "Unknown or unsupported OS"
8397 #endif // KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD ||
8398  // KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HURD
8399 
8400 #elif KMP_ARCH_X86 || KMP_ARCH_ARM || KMP_ARCH_AARCH || KMP_ARCH_MIPS
8401 
8402 #if KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_WINDOWS || KMP_OS_HURD
8403 
8404  // basic tuning
8405 
8406  if (atomic_available) {
8407  if (num_vars <= 2) { // && ( team_size <= 8 ) due to false-sharing ???
8408  retval = atomic_reduce_block;
8409  }
8410  } // otherwise: use critical section
8411 
8412 #elif KMP_OS_DARWIN
8413 
8414  int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
8415  if (atomic_available && (num_vars <= 3)) {
8416  retval = atomic_reduce_block;
8417  } else if (tree_available) {
8418  if ((reduce_size > (9 * sizeof(kmp_real64))) &&
8419  (reduce_size < (2000 * sizeof(kmp_real64)))) {
8420  retval = TREE_REDUCE_BLOCK_WITH_PLAIN_BARRIER;
8421  }
8422  } // otherwise: use critical section
8423 
8424 #else
8425 #error "Unknown or unsupported OS"
8426 #endif
8427 
8428 #else
8429 #error "Unknown or unsupported architecture"
8430 #endif
8431  }
8432 
8433  // KMP_FORCE_REDUCTION
8434 
8435  // If the team is serialized (team_size == 1), ignore the forced reduction
8436  // method and stay with the unsynchronized method (empty_reduce_block)
8437  if (__kmp_force_reduction_method != reduction_method_not_defined &&
8438  team_size != 1) {
8439 
8440  PACKED_REDUCTION_METHOD_T forced_retval = critical_reduce_block;
8441 
8442  int atomic_available, tree_available;
8443 
8444  switch ((forced_retval = __kmp_force_reduction_method)) {
8445  case critical_reduce_block:
8446  KMP_ASSERT(lck); // lck should be != 0
8447  break;
8448 
8449  case atomic_reduce_block:
8450  atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
8451  if (!atomic_available) {
8452  KMP_WARNING(RedMethodNotSupported, "atomic");
8453  forced_retval = critical_reduce_block;
8454  }
8455  break;
8456 
8457  case tree_reduce_block:
8458  tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
8459  if (!tree_available) {
8460  KMP_WARNING(RedMethodNotSupported, "tree");
8461  forced_retval = critical_reduce_block;
8462  } else {
8463 #if KMP_FAST_REDUCTION_BARRIER
8464  forced_retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
8465 #endif
8466  }
8467  break;
8468 
8469  default:
8470  KMP_ASSERT(0); // "unsupported method specified"
8471  }
8472 
8473  retval = forced_retval;
8474  }
8475 
8476  KA_TRACE(10, ("reduction method selected=%08x\n", retval));
8477 
8478 #undef FAST_REDUCTION_TREE_METHOD_GENERATED
8479 #undef FAST_REDUCTION_ATOMIC_METHOD_GENERATED
8480 
8481  return (retval);
8482 }
8483 // this function is for testing set/get/determine reduce method
8484 kmp_int32 __kmp_get_reduce_method(void) {
8485  return ((__kmp_entry_thread()->th.th_local.packed_reduction_method) >> 8);
8486 }
8487 
8488 // Soft pause sets up threads to ignore blocktime and just go to sleep.
8489 // Spin-wait code checks __kmp_pause_status and reacts accordingly.
8490 void __kmp_soft_pause() { __kmp_pause_status = kmp_soft_paused; }
8491 
8492 // Hard pause shuts down the runtime completely. Resume happens naturally when
8493 // OpenMP is used subsequently.
8494 void __kmp_hard_pause() {
8495  __kmp_pause_status = kmp_hard_paused;
8496  __kmp_internal_end_thread(-1);
8497 }
8498 
8499 // Soft resume sets __kmp_pause_status, and wakes up all threads.
8500 void __kmp_resume_if_soft_paused() {
8501  if (__kmp_pause_status == kmp_soft_paused) {
8502  __kmp_pause_status = kmp_not_paused;
8503 
8504  for (int gtid = 1; gtid < __kmp_threads_capacity; ++gtid) {
8505  kmp_info_t *thread = __kmp_threads[gtid];
8506  if (thread) { // Wake it if sleeping
8507  kmp_flag_64<> fl(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go,
8508  thread);
8509  if (fl.is_sleeping())
8510  fl.resume(gtid);
8511  else if (__kmp_try_suspend_mx(thread)) { // got suspend lock
8512  __kmp_unlock_suspend_mx(thread); // unlock it; it won't sleep
8513  } else { // thread holds the lock and may sleep soon
8514  do { // until either the thread sleeps, or we can get the lock
8515  if (fl.is_sleeping()) {
8516  fl.resume(gtid);
8517  break;
8518  } else if (__kmp_try_suspend_mx(thread)) {
8519  __kmp_unlock_suspend_mx(thread);
8520  break;
8521  }
8522  } while (1);
8523  }
8524  }
8525  }
8526  }
8527 }
8528 
8529 // This function is called via __kmpc_pause_resource. Returns 0 if successful.
8530 // TODO: add warning messages
8531 int __kmp_pause_resource(kmp_pause_status_t level) {
8532  if (level == kmp_not_paused) { // requesting resume
8533  if (__kmp_pause_status == kmp_not_paused) {
8534  // error message about runtime not being paused, so can't resume
8535  return 1;
8536  } else {
8537  KMP_DEBUG_ASSERT(__kmp_pause_status == kmp_soft_paused ||
8538  __kmp_pause_status == kmp_hard_paused);
8539  __kmp_pause_status = kmp_not_paused;
8540  return 0;
8541  }
8542  } else if (level == kmp_soft_paused) { // requesting soft pause
8543  if (__kmp_pause_status != kmp_not_paused) {
8544  // error message about already being paused
8545  return 1;
8546  } else {
8547  __kmp_soft_pause();
8548  return 0;
8549  }
8550  } else if (level == kmp_hard_paused) { // requesting hard pause
8551  if (__kmp_pause_status != kmp_not_paused) {
8552  // error message about already being paused
8553  return 1;
8554  } else {
8555  __kmp_hard_pause();
8556  return 0;
8557  }
8558  } else {
8559  // error message about invalid level
8560  return 1;
8561  }
8562 }
8563 
8564 void __kmp_omp_display_env(int verbose) {
8565  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
8566  if (__kmp_init_serial == 0)
8567  __kmp_do_serial_initialize();
8568  __kmp_display_env_impl(!verbose, verbose);
8569  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
8570 }
8571 
8572 // Globals and functions for hidden helper task
8573 kmp_info_t **__kmp_hidden_helper_threads;
8574 kmp_info_t *__kmp_hidden_helper_main_thread;
8575 kmp_int32 __kmp_hidden_helper_threads_num = 8;
8576 std::atomic<kmp_int32> __kmp_unexecuted_hidden_helper_tasks;
8577 #if KMP_OS_LINUX
8578 kmp_int32 __kmp_enable_hidden_helper = TRUE;
8579 #else
8580 kmp_int32 __kmp_enable_hidden_helper = FALSE;
8581 #endif
8582 
8583 namespace {
8584 std::atomic<kmp_int32> __kmp_hit_hidden_helper_threads_num;
8585 
8586 void __kmp_hidden_helper_wrapper_fn(int *gtid, int *, ...) {
8587  // This is an explicit synchronization on all hidden helper threads in case
8588  // that when a regular thread pushes a hidden helper task to one hidden
8589  // helper thread, the thread has not been awaken once since they're released
8590  // by the main thread after creating the team.
8591  KMP_ATOMIC_INC(&__kmp_hit_hidden_helper_threads_num);
8592  while (KMP_ATOMIC_LD_ACQ(&__kmp_hit_hidden_helper_threads_num) !=
8593  __kmp_hidden_helper_threads_num)
8594  ;
8595 
8596  // If main thread, then wait for signal
8597  if (__kmpc_master(nullptr, *gtid)) {
8598  // First, unset the initial state and release the initial thread
8599  TCW_4(__kmp_init_hidden_helper_threads, FALSE);
8600  __kmp_hidden_helper_initz_release();
8601  __kmp_hidden_helper_main_thread_wait();
8602  // Now wake up all worker threads
8603  for (int i = 1; i < __kmp_hit_hidden_helper_threads_num; ++i) {
8604  __kmp_hidden_helper_worker_thread_signal();
8605  }
8606  }
8607 }
8608 } // namespace
8609 
8610 void __kmp_hidden_helper_threads_initz_routine() {
8611  // Create a new root for hidden helper team/threads
8612  const int gtid = __kmp_register_root(TRUE);
8613  __kmp_hidden_helper_main_thread = __kmp_threads[gtid];
8614  __kmp_hidden_helper_threads = &__kmp_threads[gtid];
8615  __kmp_hidden_helper_main_thread->th.th_set_nproc =
8616  __kmp_hidden_helper_threads_num;
8617 
8618  KMP_ATOMIC_ST_REL(&__kmp_hit_hidden_helper_threads_num, 0);
8619 
8620  __kmpc_fork_call(nullptr, 0, __kmp_hidden_helper_wrapper_fn);
8621 
8622  // Set the initialization flag to FALSE
8623  TCW_SYNC_4(__kmp_init_hidden_helper, FALSE);
8624 
8625  __kmp_hidden_helper_threads_deinitz_release();
8626 }
@ KMP_IDENT_AUTOPAR
Definition: kmp.h:194
KMP_EXPORT void __kmpc_serialized_parallel(ident_t *, kmp_int32 global_tid)
KMP_EXPORT void __kmpc_fork_call(ident_t *, kmp_int32 nargs, kmpc_micro microtask,...)
KMP_EXPORT void __kmpc_end_serialized_parallel(ident_t *, kmp_int32 global_tid)
#define KMP_INIT_PARTITIONED_TIMERS(name)
Initializes the partitioned timers to begin with name.
Definition: kmp_stats.h:930
#define KMP_COUNT_VALUE(name, value)
Adds value to specified timer (name).
Definition: kmp_stats.h:888
stats_state_e
the states which a thread can be in
Definition: kmp_stats.h:63
sched_type
Definition: kmp.h:351
KMP_EXPORT kmp_int32 __kmpc_master(ident_t *, kmp_int32 global_tid)
@ kmp_sch_auto
Definition: kmp.h:358
@ kmp_sch_static
Definition: kmp.h:354
@ kmp_sch_guided_chunked
Definition: kmp.h:356
Definition: kmp.h:229
kmp_int32 flags
Definition: kmp.h:231