1 /*
2  * kmp_runtime.cpp -- KPTS runtime support library
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8 // See https://llvm.org/LICENSE.txt for license information.
9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "kmp.h"
14 #include "kmp_affinity.h"
15 #include "kmp_atomic.h"
16 #include "kmp_environment.h"
17 #include "kmp_error.h"
18 #include "kmp_i18n.h"
19 #include "kmp_io.h"
20 #include "kmp_itt.h"
21 #include "kmp_settings.h"
22 #include "kmp_stats.h"
23 #include "kmp_str.h"
24 #include "kmp_wait_release.h"
25 #include "kmp_wrapper_getpid.h"
26 #include "kmp_dispatch.h"
27 #if KMP_USE_HIER_SCHED
28 #include "kmp_dispatch_hier.h"
29 #endif
30 
31 #if OMPT_SUPPORT
32 #include "ompt-specific.h"
33 #endif
34 
35 /* these are temporary issues to be dealt with */
36 #define KMP_USE_PRCTL 0
37 
38 #if KMP_OS_WINDOWS
39 #include <process.h>
40 #endif
41 
42 #include "tsan_annotations.h"
43 
44 #if defined(KMP_GOMP_COMPAT)
45 char const __kmp_version_alt_comp[] =
46  KMP_VERSION_PREFIX "alternative compiler support: yes";
47 #endif /* defined(KMP_GOMP_COMPAT) */
48 
49 char const __kmp_version_omp_api[] =
50  KMP_VERSION_PREFIX "API version: 5.0 (201611)";
51 
52 #ifdef KMP_DEBUG
53 char const __kmp_version_lock[] =
54  KMP_VERSION_PREFIX "lock type: run time selectable";
55 #endif /* KMP_DEBUG */
56 
57 #define KMP_MIN(x, y) ((x) < (y) ? (x) : (y))
58 
59 /* ------------------------------------------------------------------------ */
60 
61 #if KMP_USE_MONITOR
62 kmp_info_t __kmp_monitor;
63 #endif
64 
65 /* Forward declarations */
66 
67 void __kmp_cleanup(void);
68 
69 static void __kmp_initialize_info(kmp_info_t *, kmp_team_t *, int tid,
70  int gtid);
71 static void __kmp_initialize_team(kmp_team_t *team, int new_nproc,
72  kmp_internal_control_t *new_icvs,
73  ident_t *loc);
74 #if KMP_AFFINITY_SUPPORTED
75 static void __kmp_partition_places(kmp_team_t *team,
76  int update_master_only = 0);
77 #endif
78 static void __kmp_do_serial_initialize(void);
79 void __kmp_fork_barrier(int gtid, int tid);
80 void __kmp_join_barrier(int gtid);
81 void __kmp_setup_icv_copy(kmp_team_t *team, int new_nproc,
82  kmp_internal_control_t *new_icvs, ident_t *loc);
83 
84 #ifdef USE_LOAD_BALANCE
85 static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc);
86 #endif
87 
88 static int __kmp_expand_threads(int nNeed);
89 #if KMP_OS_WINDOWS
90 static int __kmp_unregister_root_other_thread(int gtid);
91 #endif
92 static void __kmp_unregister_library(void); // called by __kmp_internal_end()
93 static void __kmp_reap_thread(kmp_info_t *thread, int is_root);
94 kmp_info_t *__kmp_thread_pool_insert_pt = NULL;
95 
96 /* Calculate the identifier of the current thread */
97 /* Fast (and somewhat portable) way to get a unique identifier for the
98  executing thread. Returns KMP_GTID_DNE if we haven't been assigned a gtid. */
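/* Lookup order, as implemented below (a summary of the code, not new
   behavior): with __kmp_gtid_mode >= 3 the gtid comes from thread-local data
   (__kmp_gtid); with mode >= 2 it comes from keyed TLS
   (__kmp_gtid_get_specific()); otherwise the address of a local variable is
   compared against each registered thread's recorded stack base and size,
   roughly:

       stack_addr = (char *)&stack_data;
       if (stack_addr <= stack_base && stack_base - stack_addr <= stack_size)
         return i;   // the stack grows down, so we must be thread i
*/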
99 int __kmp_get_global_thread_id() {
100  int i;
101  kmp_info_t **other_threads;
102  size_t stack_data;
103  char *stack_addr;
104  size_t stack_size;
105  char *stack_base;
106 
107  KA_TRACE(
108  1000,
109  ("*** __kmp_get_global_thread_id: entering, nproc=%d all_nproc=%d\n",
110  __kmp_nth, __kmp_all_nth));
111 
112  /* JPH - to handle the case where __kmpc_end(0) is called immediately prior
113  to a parallel region, this returns KMP_GTID_DNE to force serial_initialize
114  by the caller. KMP_GTID_DNE then has to be handled at all call sites, or
115  else __kmp_init_gtid must be guaranteed, for this to work. */
116 
117  if (!TCR_4(__kmp_init_gtid))
118  return KMP_GTID_DNE;
119 
120 #ifdef KMP_TDATA_GTID
121  if (TCR_4(__kmp_gtid_mode) >= 3) {
122  KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using TDATA\n"));
123  return __kmp_gtid;
124  }
125 #endif
126  if (TCR_4(__kmp_gtid_mode) >= 2) {
127  KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using keyed TLS\n"));
128  return __kmp_gtid_get_specific();
129  }
130  KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using internal alg.\n"));
131 
132  stack_addr = (char *)&stack_data;
133  other_threads = __kmp_threads;
134 
135  /* ATT: The code below is a source of potential bugs due to unsynchronized
136  access to __kmp_threads array. For example:
137  1. Current thread loads other_threads[i] to thr and checks it, it is
138  non-NULL.
139  2. Current thread is suspended by OS.
140  3. Another thread unregisters and finishes (debug versions of free()
141  may fill memory with something like 0xEF).
142  4. Current thread is resumed.
143  5. Current thread reads junk from *thr.
144  TODO: Fix it. --ln */
145 
146  for (i = 0; i < __kmp_threads_capacity; i++) {
147 
148  kmp_info_t *thr = (kmp_info_t *)TCR_SYNC_PTR(other_threads[i]);
149  if (!thr)
150  continue;
151 
152  stack_size = (size_t)TCR_PTR(thr->th.th_info.ds.ds_stacksize);
153  stack_base = (char *)TCR_PTR(thr->th.th_info.ds.ds_stackbase);
154 
155  /* stack grows down -- search through all of the active threads */
156 
157  if (stack_addr <= stack_base) {
158  size_t stack_diff = stack_base - stack_addr;
159 
160  if (stack_diff <= stack_size) {
161  /* The only way we can be closer than the allocated */
162  /* stack size is if we are running on this thread. */
163  KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == i);
164  return i;
165  }
166  }
167  }
168 
169  /* get specific to try and determine our gtid */
170  KA_TRACE(1000,
171  ("*** __kmp_get_global_thread_id: internal alg. failed to find "
172  "thread, using TLS\n"));
173  i = __kmp_gtid_get_specific();
174 
175  /*fprintf( stderr, "=== %d\n", i ); */ /* GROO */
176 
177  /* if we haven't been assigned a gtid, return the error code */
178  if (i < 0)
179  return i;
180 
181  /* dynamically updated stack window for uber threads to avoid get_specific
182  call */
183  if (!TCR_4(other_threads[i]->th.th_info.ds.ds_stackgrow)) {
184  KMP_FATAL(StackOverflow, i);
185  }
186 
187  stack_base = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
188  if (stack_addr > stack_base) {
189  TCW_PTR(other_threads[i]->th.th_info.ds.ds_stackbase, stack_addr);
190  TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
191  other_threads[i]->th.th_info.ds.ds_stacksize + stack_addr -
192  stack_base);
193  } else {
194  TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
195  stack_base - stack_addr);
196  }
197 
198  /* Reprint stack bounds for ubermaster since they have been refined */
199  if (__kmp_storage_map) {
200  char *stack_end = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
201  char *stack_beg = stack_end - other_threads[i]->th.th_info.ds.ds_stacksize;
202  __kmp_print_storage_map_gtid(i, stack_beg, stack_end,
203  other_threads[i]->th.th_info.ds.ds_stacksize,
204  "th_%d stack (refinement)", i);
205  }
206  return i;
207 }
208 
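/* Registering variant: same lookup order as above, but if no gtid has been
   assigned yet the caller is treated as a new root thread -- under
   __kmp_initz_lock it either runs serial initialization or calls
   __kmp_register_root(FALSE), so a valid gtid is always returned.
   Illustrative use (hypothetical caller):

       int gtid = __kmp_get_global_thread_id_reg(); // never KMP_GTID_DNE here
       kmp_info_t *self = __kmp_threads[gtid];
*/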
209 int __kmp_get_global_thread_id_reg() {
210  int gtid;
211 
212  if (!__kmp_init_serial) {
213  gtid = KMP_GTID_DNE;
214  } else
215 #ifdef KMP_TDATA_GTID
216  if (TCR_4(__kmp_gtid_mode) >= 3) {
217  KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using TDATA\n"));
218  gtid = __kmp_gtid;
219  } else
220 #endif
221  if (TCR_4(__kmp_gtid_mode) >= 2) {
222  KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using keyed TLS\n"));
223  gtid = __kmp_gtid_get_specific();
224  } else {
225  KA_TRACE(1000,
226  ("*** __kmp_get_global_thread_id_reg: using internal alg.\n"));
227  gtid = __kmp_get_global_thread_id();
228  }
229 
230  /* we must be a new uber master sibling thread */
231  if (gtid == KMP_GTID_DNE) {
232  KA_TRACE(10,
233  ("__kmp_get_global_thread_id_reg: Encountered new root thread. "
234  "Registering a new gtid.\n"));
235  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
236  if (!__kmp_init_serial) {
237  __kmp_do_serial_initialize();
238  gtid = __kmp_gtid_get_specific();
239  } else {
240  gtid = __kmp_register_root(FALSE);
241  }
242  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
243  /*__kmp_printf( "+++ %d\n", gtid ); */ /* GROO */
244  }
245 
246  KMP_DEBUG_ASSERT(gtid >= 0);
247 
248  return gtid;
249 }
250 
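/* The check below treats each thread's stack as the byte range
   [stackbase - stacksize, stackbase) and raises a fatal StackOverlap error if
   this thread's range intersects that of any other registered thread,
   roughly:

       overlap = (beg > o_beg && beg < o_end) || (end > o_beg && end < o_end);
*/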
251 /* caller must hold forkjoin_lock */
252 void __kmp_check_stack_overlap(kmp_info_t *th) {
253  int f;
254  char *stack_beg = NULL;
255  char *stack_end = NULL;
256  int gtid;
257 
258  KA_TRACE(10, ("__kmp_check_stack_overlap: called\n"));
259  if (__kmp_storage_map) {
260  stack_end = (char *)th->th.th_info.ds.ds_stackbase;
261  stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
262 
263  gtid = __kmp_gtid_from_thread(th);
264 
265  if (gtid == KMP_GTID_MONITOR) {
266  __kmp_print_storage_map_gtid(
267  gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
268  "th_%s stack (%s)", "mon",
269  (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual");
270  } else {
271  __kmp_print_storage_map_gtid(
272  gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
273  "th_%d stack (%s)", gtid,
274  (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual");
275  }
276  }
277 
278  /* No point in checking ubermaster threads since they use refinement and
279  * cannot overlap */
280  gtid = __kmp_gtid_from_thread(th);
281  if (__kmp_env_checks == TRUE && !KMP_UBER_GTID(gtid)) {
282  KA_TRACE(10,
283  ("__kmp_check_stack_overlap: performing extensive checking\n"));
284  if (stack_beg == NULL) {
285  stack_end = (char *)th->th.th_info.ds.ds_stackbase;
286  stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
287  }
288 
289  for (f = 0; f < __kmp_threads_capacity; f++) {
290  kmp_info_t *f_th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[f]);
291 
292  if (f_th && f_th != th) {
293  char *other_stack_end =
294  (char *)TCR_PTR(f_th->th.th_info.ds.ds_stackbase);
295  char *other_stack_beg =
296  other_stack_end - (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize);
297  if ((stack_beg > other_stack_beg && stack_beg < other_stack_end) ||
298  (stack_end > other_stack_beg && stack_end < other_stack_end)) {
299 
300  /* Print the other stack values before the abort */
301  if (__kmp_storage_map)
302  __kmp_print_storage_map_gtid(
303  -1, other_stack_beg, other_stack_end,
304  (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize),
305  "th_%d stack (overlapped)", __kmp_gtid_from_thread(f_th));
306 
307  __kmp_fatal(KMP_MSG(StackOverlap), KMP_HNT(ChangeStackLimit),
308  __kmp_msg_null);
309  }
310  }
311  }
312  }
313  KA_TRACE(10, ("__kmp_check_stack_overlap: returning\n"));
314 }
315 
316 /* ------------------------------------------------------------------------ */
317 
318 void __kmp_infinite_loop(void) {
319  static int done = FALSE;
320 
321  while (!done) {
322  KMP_YIELD(TRUE);
323  }
324 }
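// Parks the calling thread forever: 'done' is never set, so the loop simply
// yields until the process terminates (used on the abort paths below).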
325 
326 #define MAX_MESSAGE 512
327 
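/* Writes one "OMP storage map: <p1> <p2> <size> <description>" line to kmp_err
   under the stdio bootstrap lock; when KMP_PRINT_DATA_PLACEMENT is enabled and
   a valid gtid is passed, it also reports the host node of the page-aligned
   range via __kmp_get_host_node(). */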
328 void __kmp_print_storage_map_gtid(int gtid, void *p1, void *p2, size_t size,
329  char const *format, ...) {
330  char buffer[MAX_MESSAGE];
331  va_list ap;
332 
333  va_start(ap, format);
334  KMP_SNPRINTF(buffer, sizeof(buffer), "OMP storage map: %p %p%8lu %s\n", p1,
335  p2, (unsigned long)size, format);
336  __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock);
337  __kmp_vprintf(kmp_err, buffer, ap);
338 #if KMP_PRINT_DATA_PLACEMENT
339  int node;
340  if (gtid >= 0) {
341  if (p1 <= p2 && (char *)p2 - (char *)p1 == size) {
342  if (__kmp_storage_map_verbose) {
343  node = __kmp_get_host_node(p1);
344  if (node < 0) /* doesn't work, so don't try this next time */
345  __kmp_storage_map_verbose = FALSE;
346  else {
347  char *last;
348  int lastNode;
349  int localProc = __kmp_get_cpu_from_gtid(gtid);
350 
351  const int page_size = KMP_GET_PAGE_SIZE();
352 
353  p1 = (void *)((size_t)p1 & ~((size_t)page_size - 1));
354  p2 = (void *)(((size_t)p2 - 1) & ~((size_t)page_size - 1));
355  if (localProc >= 0)
356  __kmp_printf_no_lock(" GTID %d localNode %d\n", gtid,
357  localProc >> 1);
358  else
359  __kmp_printf_no_lock(" GTID %d\n", gtid);
360 #if KMP_USE_PRCTL
361  /* The more elaborate format is disabled for now because of the prctl
362  * hanging bug. */
363  do {
364  last = p1;
365  lastNode = node;
366  /* This loop collates adjacent pages with the same host node. */
367  do {
368  p1 = (char *)p1 + page_size;
369  } while (p1 <= p2 && (node = __kmp_get_host_node(p1)) == lastNode);
370  __kmp_printf_no_lock(" %p-%p memNode %d\n", last, (char *)p1 - 1,
371  lastNode);
372  } while (p1 <= p2);
373 #else
374  __kmp_printf_no_lock(" %p-%p memNode %d\n", p1,
375  (char *)p1 + (page_size - 1),
376  __kmp_get_host_node(p1));
377  if (p1 < p2) {
378  __kmp_printf_no_lock(" %p-%p memNode %d\n", p2,
379  (char *)p2 + (page_size - 1),
380  __kmp_get_host_node(p2));
381  }
382 #endif
383  }
384  }
385  } else
386  __kmp_printf_no_lock(" %s\n", KMP_I18N_STR(StorageMapWarning));
387  }
388 #endif /* KMP_PRINT_DATA_PLACEMENT */
389  __kmp_release_bootstrap_lock(&__kmp_stdio_lock);
390 }
391 
392 void __kmp_warn(char const *format, ...) {
393  char buffer[MAX_MESSAGE];
394  va_list ap;
395 
396  if (__kmp_generate_warnings == kmp_warnings_off) {
397  return;
398  }
399 
400  va_start(ap, format);
401 
402  KMP_SNPRINTF(buffer, sizeof(buffer), "OMP warning: %s\n", format);
403  __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock);
404  __kmp_vprintf(kmp_err, buffer, ap);
405  __kmp_release_bootstrap_lock(&__kmp_stdio_lock);
406 
407  va_end(ap);
408 }
409 
410 void __kmp_abort_process() {
411  // Later threads may stall here, but that's ok because abort() will kill them.
412  __kmp_acquire_bootstrap_lock(&__kmp_exit_lock);
413 
414  if (__kmp_debug_buf) {
415  __kmp_dump_debug_buffer();
416  }
417 
418  if (KMP_OS_WINDOWS) {
419  // Let other threads know of abnormal termination and prevent deadlock
420  // if abort happened during library initialization or shutdown
421  __kmp_global.g.g_abort = SIGABRT;
422 
423  /* On Windows* OS, by default abort() causes a pop-up error box, which stalls
424  nightly testing. Unfortunately, we cannot reliably suppress pop-up error
425  boxes. _set_abort_behavior() works well, but this function is not
426  available in VS7 (this is not a problem for the DLL, but it is a problem
427  for the static OpenMP RTL). SetErrorMode (and so, the timelimit utility)
428  does not help, at least in some versions of the MS C RTL.
429 
430  It seems the following sequence is the only way to simulate abort() and
431  avoid the pop-up error box. */
432  raise(SIGABRT);
433  _exit(3); // Just in case, if signal ignored, exit anyway.
434  } else {
435  abort();
436  }
437 
438  __kmp_infinite_loop();
439  __kmp_release_bootstrap_lock(&__kmp_exit_lock);
440 
441 } // __kmp_abort_process
442 
443 void __kmp_abort_thread(void) {
444  // TODO: Eliminate g_abort global variable and this function.
445  // In case of abort just call abort(), it will kill all the threads.
446  __kmp_infinite_loop();
447 } // __kmp_abort_thread
448 
449 /* Print out the storage map for the major kmp_info_t thread data structures
450  that are allocated together. */
451 
452 static void __kmp_print_thread_storage_map(kmp_info_t *thr, int gtid) {
453  __kmp_print_storage_map_gtid(gtid, thr, thr + 1, sizeof(kmp_info_t), "th_%d",
454  gtid);
455 
456  __kmp_print_storage_map_gtid(gtid, &thr->th.th_info, &thr->th.th_team,
457  sizeof(kmp_desc_t), "th_%d.th_info", gtid);
458 
459  __kmp_print_storage_map_gtid(gtid, &thr->th.th_local, &thr->th.th_pri_head,
460  sizeof(kmp_local_t), "th_%d.th_local", gtid);
461 
462  __kmp_print_storage_map_gtid(
463  gtid, &thr->th.th_bar[0], &thr->th.th_bar[bs_last_barrier],
464  sizeof(kmp_balign_t) * bs_last_barrier, "th_%d.th_bar", gtid);
465 
466  __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_plain_barrier],
467  &thr->th.th_bar[bs_plain_barrier + 1],
468  sizeof(kmp_balign_t), "th_%d.th_bar[plain]",
469  gtid);
470 
471  __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_forkjoin_barrier],
472  &thr->th.th_bar[bs_forkjoin_barrier + 1],
473  sizeof(kmp_balign_t), "th_%d.th_bar[forkjoin]",
474  gtid);
475 
476 #if KMP_FAST_REDUCTION_BARRIER
477  __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_reduction_barrier],
478  &thr->th.th_bar[bs_reduction_barrier + 1],
479  sizeof(kmp_balign_t), "th_%d.th_bar[reduction]",
480  gtid);
481 #endif // KMP_FAST_REDUCTION_BARRIER
482 }
483 
484 /* Print out the storage map for the major kmp_team_t team data structures
485  that are allocated together. */
486 
487 static void __kmp_print_team_storage_map(const char *header, kmp_team_t *team,
488  int team_id, int num_thr) {
489  int num_disp_buff = team->t.t_max_nproc > 1 ? __kmp_dispatch_num_buffers : 2;
490  __kmp_print_storage_map_gtid(-1, team, team + 1, sizeof(kmp_team_t), "%s_%d",
491  header, team_id);
492 
493  __kmp_print_storage_map_gtid(-1, &team->t.t_bar[0],
494  &team->t.t_bar[bs_last_barrier],
495  sizeof(kmp_balign_team_t) * bs_last_barrier,
496  "%s_%d.t_bar", header, team_id);
497 
498  __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_plain_barrier],
499  &team->t.t_bar[bs_plain_barrier + 1],
500  sizeof(kmp_balign_team_t), "%s_%d.t_bar[plain]",
501  header, team_id);
502 
503  __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_forkjoin_barrier],
504  &team->t.t_bar[bs_forkjoin_barrier + 1],
505  sizeof(kmp_balign_team_t),
506  "%s_%d.t_bar[forkjoin]", header, team_id);
507 
508 #if KMP_FAST_REDUCTION_BARRIER
509  __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_reduction_barrier],
510  &team->t.t_bar[bs_reduction_barrier + 1],
511  sizeof(kmp_balign_team_t),
512  "%s_%d.t_bar[reduction]", header, team_id);
513 #endif // KMP_FAST_REDUCTION_BARRIER
514 
515  __kmp_print_storage_map_gtid(
516  -1, &team->t.t_dispatch[0], &team->t.t_dispatch[num_thr],
517  sizeof(kmp_disp_t) * num_thr, "%s_%d.t_dispatch", header, team_id);
518 
519  __kmp_print_storage_map_gtid(
520  -1, &team->t.t_threads[0], &team->t.t_threads[num_thr],
521  sizeof(kmp_info_t *) * num_thr, "%s_%d.t_threads", header, team_id);
522 
523  __kmp_print_storage_map_gtid(-1, &team->t.t_disp_buffer[0],
524  &team->t.t_disp_buffer[num_disp_buff],
525  sizeof(dispatch_shared_info_t) * num_disp_buff,
526  "%s_%d.t_disp_buffer", header, team_id);
527 }
528 
529 static void __kmp_init_allocator() { __kmp_init_memkind(); }
530 static void __kmp_fini_allocator() { __kmp_fini_memkind(); }
531 
532 /* ------------------------------------------------------------------------ */
533 
534 #if KMP_DYNAMIC_LIB
535 #if KMP_OS_WINDOWS
536 
537 static void __kmp_reset_lock(kmp_bootstrap_lock_t *lck) {
538  // TODO: Change to __kmp_break_bootstrap_lock().
539  __kmp_init_bootstrap_lock(lck); // make the lock released
540 }
541 
542 static void __kmp_reset_locks_on_process_detach(int gtid_req) {
543  int i;
544  int thread_count;
545 
546  // PROCESS_DETACH is expected to be called by a thread that executes
547  // ProcessExit() or FreeLibrary(). The OS terminates other threads (except the
548  // one calling ProcessExit or FreeLibrary). So it might be safe to access
549  // __kmp_threads[] without taking the forkjoin_lock. However, in fact, some
550  // threads can still be alive here, although they are about to be terminated.
551  // The threads in the array with ds_thread==0 are the most suspicious, so it
552  // may actually not be safe to access __kmp_threads[].
553 
554  // TODO: does it make sense to check __kmp_roots[] ?
555 
556  // Let's check that there are no other live threads registered with the OMP
557  // library.
558  while (1) {
559  thread_count = 0;
560  for (i = 0; i < __kmp_threads_capacity; ++i) {
561  if (!__kmp_threads)
562  continue;
563  kmp_info_t *th = __kmp_threads[i];
564  if (th == NULL)
565  continue;
566  int gtid = th->th.th_info.ds.ds_gtid;
567  if (gtid == gtid_req)
568  continue;
569  if (gtid < 0)
570  continue;
571  DWORD exit_val;
572  int alive = __kmp_is_thread_alive(th, &exit_val);
573  if (alive) {
574  ++thread_count;
575  }
576  }
577  if (thread_count == 0)
578  break; // success
579  }
580 
581  // Assume that I'm alone. Now it might be safe to check and reset locks.
582  // __kmp_forkjoin_lock and __kmp_stdio_lock are expected to be reset.
583  __kmp_reset_lock(&__kmp_forkjoin_lock);
584 #ifdef KMP_DEBUG
585  __kmp_reset_lock(&__kmp_stdio_lock);
586 #endif // KMP_DEBUG
587 }
588 
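/* DllMain: Windows loader entry point for the runtime DLL. The interesting
   case is PROCESS_DETACH: locks that terminated worker threads may still hold
   are reset (see above) and the library is then shut down for this gtid. */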
589 BOOL WINAPI DllMain(HINSTANCE hInstDLL, DWORD fdwReason, LPVOID lpReserved) {
590  //__kmp_acquire_bootstrap_lock( &__kmp_initz_lock );
591 
592  switch (fdwReason) {
593 
594  case DLL_PROCESS_ATTACH:
595  KA_TRACE(10, ("DllMain: PROCESS_ATTACH\n"));
596 
597  return TRUE;
598 
599  case DLL_PROCESS_DETACH:
600  KA_TRACE(10, ("DllMain: PROCESS_DETACH T#%d\n", __kmp_gtid_get_specific()));
601 
602  if (lpReserved != NULL) {
603  // lpReserved is used for telling the difference:
604  // lpReserved == NULL when FreeLibrary() was called,
605  // lpReserved != NULL when the process terminates.
606  // When FreeLibrary() is called, worker threads remain alive. So they will
607  // release the forkjoin lock by themselves. When the process terminates,
608  // worker threads disappear triggering the problem of unreleased forkjoin
609  // lock as described below.
610 
611  // A worker thread can take the forkjoin lock. The problem comes up if
612  // that worker thread dies before it releases the forkjoin lock. The
613  // forkjoin lock then remains taken, while the thread executing
614  // DllMain()->PROCESS_DETACH->__kmp_internal_end_library() below will try
615  // to take the forkjoin lock and will always fail, so the application
616  // will never finish [normally]. This scenario is possible if
617  // __kmpc_end() has not been executed. This is not just a corner
618  // case; it happens in common cases:
619  // - the main function was compiled by an alternative compiler;
620  // - the main function was compiled by icl but without /Qopenmp
621  // (application with plugins);
622  // - the application terminates by calling C exit(), Fortran CALL EXIT(),
623  // or Fortran STOP;
624  // - a live foreign thread prevented __kmpc_end from doing cleanup.
625  //
626  // This is a hack to work around the problem.
627  // TODO: !!! figure out something better.
628  __kmp_reset_locks_on_process_detach(__kmp_gtid_get_specific());
629  }
630 
631  __kmp_internal_end_library(__kmp_gtid_get_specific());
632 
633  return TRUE;
634 
635  case DLL_THREAD_ATTACH:
636  KA_TRACE(10, ("DllMain: THREAD_ATTACH\n"));
637 
638  /* if we want to register new siblings all the time here call
639  * __kmp_get_gtid(); */
640  return TRUE;
641 
642  case DLL_THREAD_DETACH:
643  KA_TRACE(10, ("DllMain: THREAD_DETACH T#%d\n", __kmp_gtid_get_specific()));
644 
645  __kmp_internal_end_thread(__kmp_gtid_get_specific());
646  return TRUE;
647  }
648 
649  return TRUE;
650 }
651 
652 #endif /* KMP_OS_WINDOWS */
653 #endif /* KMP_DYNAMIC_LIB */
654 
655 /* __kmp_parallel_deo -- Wait until it's our turn. */
656 void __kmp_parallel_deo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
657  int gtid = *gtid_ref;
658 #ifdef BUILD_PARALLEL_ORDERED
659  kmp_team_t *team = __kmp_team_from_gtid(gtid);
660 #endif /* BUILD_PARALLEL_ORDERED */
661 
662  if (__kmp_env_consistency_check) {
663  if (__kmp_threads[gtid]->th.th_root->r.r_active)
664 #if KMP_USE_DYNAMIC_LOCK
665  __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL, 0);
666 #else
667  __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL);
668 #endif
669  }
670 #ifdef BUILD_PARALLEL_ORDERED
671  if (!team->t.t_serialized) {
672  KMP_MB();
673  KMP_WAIT(&team->t.t_ordered.dt.t_value, __kmp_tid_from_gtid(gtid), KMP_EQ,
674  NULL);
675  KMP_MB();
676  }
677 #endif /* BUILD_PARALLEL_ORDERED */
678 }
679 
680 /* __kmp_parallel_dxo -- Signal the next task. */
681 void __kmp_parallel_dxo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
682  int gtid = *gtid_ref;
683 #ifdef BUILD_PARALLEL_ORDERED
684  int tid = __kmp_tid_from_gtid(gtid);
685  kmp_team_t *team = __kmp_team_from_gtid(gtid);
686 #endif /* BUILD_PARALLEL_ORDERED */
687 
688  if (__kmp_env_consistency_check) {
689  if (__kmp_threads[gtid]->th.th_root->r.r_active)
690  __kmp_pop_sync(gtid, ct_ordered_in_parallel, loc_ref);
691  }
692 #ifdef BUILD_PARALLEL_ORDERED
693  if (!team->t.t_serialized) {
694  KMP_MB(); /* Flush all pending memory write invalidates. */
695 
696  /* use the tid of the next thread in this team */
697  /* TODO replace with general release procedure */
698  team->t.t_ordered.dt.t_value = ((tid + 1) % team->t.t_nproc);
699 
700  KMP_MB(); /* Flush all pending memory write invalidates. */
701  }
702 #endif /* BUILD_PARALLEL_ORDERED */
703 }
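/* Together, deo/dxo implement the ordered handshake: __kmp_parallel_deo spins
   in KMP_WAIT until t_ordered.dt.t_value equals the caller's tid, and
   __kmp_parallel_dxo releases the next thread by storing (tid + 1) % t_nproc,
   with memory barriers on both sides of the update. */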
704 
705 /* ------------------------------------------------------------------------ */
706 /* The BARRIER for a SINGLE process section is always explicit */
707 
708 int __kmp_enter_single(int gtid, ident_t *id_ref, int push_ws) {
709  int status;
710  kmp_info_t *th;
711  kmp_team_t *team;
712 
713  if (!TCR_4(__kmp_init_parallel))
714  __kmp_parallel_initialize();
715  __kmp_resume_if_soft_paused();
716 
717  th = __kmp_threads[gtid];
718  team = th->th.th_team;
719  status = 0;
720 
721  th->th.th_ident = id_ref;
722 
723  if (team->t.t_serialized) {
724  status = 1;
725  } else {
726  kmp_int32 old_this = th->th.th_local.this_construct;
727 
728  ++th->th.th_local.this_construct;
729  /* try to set team count to thread count--success means thread got the
730  single block */
731  /* TODO: Should this be acquire or release? */
732  if (team->t.t_construct == old_this) {
733  status = __kmp_atomic_compare_store_acq(&team->t.t_construct, old_this,
734  th->th.th_local.this_construct);
735  }
736 #if USE_ITT_BUILD
737  if (__itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
738  KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL &&
739  team->t.t_active_level ==
740  1) { // Only report metadata by master of active team at level 1
741  __kmp_itt_metadata_single(id_ref);
742  }
743 #endif /* USE_ITT_BUILD */
744  }
745 
746  if (__kmp_env_consistency_check) {
747  if (status && push_ws) {
748  __kmp_push_workshare(gtid, ct_psingle, id_ref);
749  } else {
750  __kmp_check_workshare(gtid, ct_psingle, id_ref);
751  }
752  }
753 #if USE_ITT_BUILD
754  if (status) {
755  __kmp_itt_single_start(gtid);
756  }
757 #endif /* USE_ITT_BUILD */
758  return status;
759 }
760 
761 void __kmp_exit_single(int gtid) {
762 #if USE_ITT_BUILD
763  __kmp_itt_single_end(gtid);
764 #endif /* USE_ITT_BUILD */
765  if (__kmp_env_consistency_check)
766  __kmp_pop_workshare(gtid, ct_psingle, NULL);
767 }
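/* Note: __kmp_enter_single picks the winner of a SINGLE construct with one
   atomic compare-and-store on team->t.t_construct -- each thread advances its
   private this_construct counter, and the thread that successfully installs
   the new value executes the block (status == 1); the others skip it. */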
768 
769 /* Determine whether we can go parallel or must use a serialized parallel
770  * region, and how many threads we can use.
771  * set_nthreads is the number of threads requested for the team.
772  * Returns 1 if we should serialize or only use one thread,
773  * otherwise the number of threads to use.
774  * The forkjoin lock is held by the caller. */
775 static int __kmp_reserve_threads(kmp_root_t *root, kmp_team_t *parent_team,
776  int master_tid, int set_nthreads,
777  int enter_teams) {
778  int capacity;
779  int new_nthreads;
780  KMP_DEBUG_ASSERT(__kmp_init_serial);
781  KMP_DEBUG_ASSERT(root && parent_team);
782  kmp_info_t *this_thr = parent_team->t.t_threads[master_tid];
783 
784  // If dyn-var is set, dynamically adjust the number of desired threads,
785  // according to the method specified by dynamic_mode.
786  new_nthreads = set_nthreads;
787  if (!get__dynamic_2(parent_team, master_tid)) {
788  ;
789  }
790 #ifdef USE_LOAD_BALANCE
791  else if (__kmp_global.g.g_dynamic_mode == dynamic_load_balance) {
792  new_nthreads = __kmp_load_balance_nproc(root, set_nthreads);
793  if (new_nthreads == 1) {
794  KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
795  "reservation to 1 thread\n",
796  master_tid));
797  return 1;
798  }
799  if (new_nthreads < set_nthreads) {
800  KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
801  "reservation to %d threads\n",
802  master_tid, new_nthreads));
803  }
804  }
805 #endif /* USE_LOAD_BALANCE */
806  else if (__kmp_global.g.g_dynamic_mode == dynamic_thread_limit) {
807  new_nthreads = __kmp_avail_proc - __kmp_nth +
808  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
809  if (new_nthreads <= 1) {
810  KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
811  "reservation to 1 thread\n",
812  master_tid));
813  return 1;
814  }
815  if (new_nthreads < set_nthreads) {
816  KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
817  "reservation to %d threads\n",
818  master_tid, new_nthreads));
819  } else {
820  new_nthreads = set_nthreads;
821  }
822  } else if (__kmp_global.g.g_dynamic_mode == dynamic_random) {
823  if (set_nthreads > 2) {
824  new_nthreads = __kmp_get_random(parent_team->t.t_threads[master_tid]);
825  new_nthreads = (new_nthreads % set_nthreads) + 1;
826  if (new_nthreads == 1) {
827  KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
828  "reservation to 1 thread\n",
829  master_tid));
830  return 1;
831  }
832  if (new_nthreads < set_nthreads) {
833  KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
834  "reservation to %d threads\n",
835  master_tid, new_nthreads));
836  }
837  }
838  } else {
839  KMP_ASSERT(0);
840  }
841 
842  // Respect KMP_ALL_THREADS/KMP_DEVICE_THREAD_LIMIT.
843  if (__kmp_nth + new_nthreads -
844  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
845  __kmp_max_nth) {
846  int tl_nthreads = __kmp_max_nth - __kmp_nth +
847  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
848  if (tl_nthreads <= 0) {
849  tl_nthreads = 1;
850  }
851 
852  // If dyn-var is false, emit a 1-time warning.
853  if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
854  __kmp_reserve_warn = 1;
855  __kmp_msg(kmp_ms_warning,
856  KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads),
857  KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
858  }
859  if (tl_nthreads == 1) {
860  KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT "
861  "reduced reservation to 1 thread\n",
862  master_tid));
863  return 1;
864  }
865  KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT reduced "
866  "reservation to %d threads\n",
867  master_tid, tl_nthreads));
868  new_nthreads = tl_nthreads;
869  }
870 
871  // Respect OMP_THREAD_LIMIT
872  int cg_nthreads = this_thr->th.th_cg_roots->cg_nthreads;
873  int max_cg_threads = this_thr->th.th_cg_roots->cg_thread_limit;
874  if (cg_nthreads + new_nthreads -
875  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
876  max_cg_threads) {
877  int tl_nthreads = max_cg_threads - cg_nthreads +
878  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
879  if (tl_nthreads <= 0) {
880  tl_nthreads = 1;
881  }
882 
883  // If dyn-var is false, emit a 1-time warning.
884  if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
885  __kmp_reserve_warn = 1;
886  __kmp_msg(kmp_ms_warning,
887  KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads),
888  KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
889  }
890  if (tl_nthreads == 1) {
891  KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT "
892  "reduced reservation to 1 thread\n",
893  master_tid));
894  return 1;
895  }
896  KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT reduced "
897  "reservation to %d threads\n",
898  master_tid, tl_nthreads));
899  new_nthreads = tl_nthreads;
900  }
901 
902  // Check if the threads array is large enough, or needs expanding.
903  // See comment in __kmp_register_root() about the adjustment if
904  // __kmp_threads[0] == NULL.
905  capacity = __kmp_threads_capacity;
906  if (TCR_PTR(__kmp_threads[0]) == NULL) {
907  --capacity;
908  }
909  if (__kmp_nth + new_nthreads -
910  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
911  capacity) {
912  // Expand the threads array.
913  int slotsRequired = __kmp_nth + new_nthreads -
914  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) -
915  capacity;
916  int slotsAdded = __kmp_expand_threads(slotsRequired);
917  if (slotsAdded < slotsRequired) {
918  // The threads array was not expanded enough.
919  new_nthreads -= (slotsRequired - slotsAdded);
920  KMP_ASSERT(new_nthreads >= 1);
921 
922  // If dyn-var is false, emit a 1-time warning.
923  if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
924  __kmp_reserve_warn = 1;
925  if (__kmp_tp_cached) {
926  __kmp_msg(kmp_ms_warning,
927  KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
928  KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity),
929  KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null);
930  } else {
931  __kmp_msg(kmp_ms_warning,
932  KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
933  KMP_HNT(SystemLimitOnThreads), __kmp_msg_null);
934  }
935  }
936  }
937  }
938 
939 #ifdef KMP_DEBUG
940  if (new_nthreads == 1) {
941  KC_TRACE(10,
942  ("__kmp_reserve_threads: T#%d serializing team after reclaiming "
943  "dead roots and rechecking; requested %d threads\n",
944  __kmp_get_gtid(), set_nthreads));
945  } else {
946  KC_TRACE(10, ("__kmp_reserve_threads: T#%d allocating %d threads; requested"
947  " %d threads\n",
948  __kmp_get_gtid(), new_nthreads, set_nthreads));
949  }
950 #endif // KMP_DEBUG
951  return new_nthreads;
952 }
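/* Summary of the clamping above: the request is first adjusted by the active
   dynamic mode (load balance, thread limit, or random), then capped by
   KMP_DEVICE_THREAD_LIMIT (__kmp_max_nth), then by OMP_THREAD_LIMIT (the
   contention group's cg_thread_limit), and finally by the capacity of
   __kmp_threads[], expanding the array when possible. Each cap counts threads
   already running minus those the team will re-use (the master only, or the
   whole hot team when the root is not active), e.g. roughly:

       excess = __kmp_nth + new_nthreads - reused - __kmp_max_nth;
*/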
953 
954 /* Allocate threads from the thread pool and assign them to the new team. We
955  are assured that there are enough threads available, because we checked on
956  that earlier within the forkjoin critical section. */
957 static void __kmp_fork_team_threads(kmp_root_t *root, kmp_team_t *team,
958  kmp_info_t *master_th, int master_gtid) {
959  int i;
960  int use_hot_team;
961 
962  KA_TRACE(10, ("__kmp_fork_team_threads: new_nprocs = %d\n", team->t.t_nproc));
963  KMP_DEBUG_ASSERT(master_gtid == __kmp_get_gtid());
964  KMP_MB();
965 
966  /* first, let's setup the master thread */
967  master_th->th.th_info.ds.ds_tid = 0;
968  master_th->th.th_team = team;
969  master_th->th.th_team_nproc = team->t.t_nproc;
970  master_th->th.th_team_master = master_th;
971  master_th->th.th_team_serialized = FALSE;
972  master_th->th.th_dispatch = &team->t.t_dispatch[0];
973 
974 /* make sure we are not the optimized hot team */
975 #if KMP_NESTED_HOT_TEAMS
976  use_hot_team = 0;
977  kmp_hot_team_ptr_t *hot_teams = master_th->th.th_hot_teams;
978  if (hot_teams) { // hot teams array is not allocated if
979  // KMP_HOT_TEAMS_MAX_LEVEL=0
980  int level = team->t.t_active_level - 1; // index in array of hot teams
981  if (master_th->th.th_teams_microtask) { // are we inside the teams?
982  if (master_th->th.th_teams_size.nteams > 1) {
983  ++level; // level was not increased in teams construct for
984  // team_of_masters
985  }
986  if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
987  master_th->th.th_teams_level == team->t.t_level) {
988  ++level; // level was not increased in teams construct for
989  // team_of_workers before the parallel
990  } // team->t.t_level will be increased inside parallel
991  }
992  if (level < __kmp_hot_teams_max_level) {
993  if (hot_teams[level].hot_team) {
994  // hot team has already been allocated for given level
995  KMP_DEBUG_ASSERT(hot_teams[level].hot_team == team);
996  use_hot_team = 1; // the team is ready to use
997  } else {
998  use_hot_team = 0; // AC: threads are not allocated yet
999  hot_teams[level].hot_team = team; // remember new hot team
1000  hot_teams[level].hot_team_nth = team->t.t_nproc;
1001  }
1002  } else {
1003  use_hot_team = 0;
1004  }
1005  }
1006 #else
1007  use_hot_team = team == root->r.r_hot_team;
1008 #endif
1009  if (!use_hot_team) {
1010 
1011  /* install the master thread */
1012  team->t.t_threads[0] = master_th;
1013  __kmp_initialize_info(master_th, team, 0, master_gtid);
1014 
1015  /* now, install the worker threads */
1016  for (i = 1; i < team->t.t_nproc; i++) {
1017 
1018  /* fork or reallocate a new thread and install it in team */
1019  kmp_info_t *thr = __kmp_allocate_thread(root, team, i);
1020  team->t.t_threads[i] = thr;
1021  KMP_DEBUG_ASSERT(thr);
1022  KMP_DEBUG_ASSERT(thr->th.th_team == team);
1023  /* align team and thread arrived states */
1024  KA_TRACE(20, ("__kmp_fork_team_threads: T#%d(%d:%d) init arrived "
1025  "T#%d(%d:%d) join =%llu, plain=%llu\n",
1026  __kmp_gtid_from_tid(0, team), team->t.t_id, 0,
1027  __kmp_gtid_from_tid(i, team), team->t.t_id, i,
1028  team->t.t_bar[bs_forkjoin_barrier].b_arrived,
1029  team->t.t_bar[bs_plain_barrier].b_arrived));
1030  thr->th.th_teams_microtask = master_th->th.th_teams_microtask;
1031  thr->th.th_teams_level = master_th->th.th_teams_level;
1032  thr->th.th_teams_size = master_th->th.th_teams_size;
1033  { // Initialize threads' barrier data.
1034  int b;
1035  kmp_balign_t *balign = team->t.t_threads[i]->th.th_bar;
1036  for (b = 0; b < bs_last_barrier; ++b) {
1037  balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
1038  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
1039 #if USE_DEBUGGER
1040  balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
1041 #endif
1042  }
1043  }
1044  }
1045 
1046 #if KMP_AFFINITY_SUPPORTED
1047  __kmp_partition_places(team);
1048 #endif
1049  }
1050 
1051  if (__kmp_display_affinity && team->t.t_display_affinity != 1) {
1052  for (i = 0; i < team->t.t_nproc; i++) {
1053  kmp_info_t *thr = team->t.t_threads[i];
1054  if (thr->th.th_prev_num_threads != team->t.t_nproc ||
1055  thr->th.th_prev_level != team->t.t_level) {
1056  team->t.t_display_affinity = 1;
1057  break;
1058  }
1059  }
1060  }
1061 
1062  KMP_MB();
1063 }
1064 
1065 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
1066 // Propagate any changes to the floating point control registers out to the team
1067 // We try to avoid unnecessary writes to the relevant cache line in the team
1068 // structure, so we don't make changes unless they are needed.
1069 inline static void propagateFPControl(kmp_team_t *team) {
1070  if (__kmp_inherit_fp_control) {
1071  kmp_int16 x87_fpu_control_word;
1072  kmp_uint32 mxcsr;
1073 
1074  // Get master values of FPU control flags (both X87 and vector)
1075  __kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
1076  __kmp_store_mxcsr(&mxcsr);
1077  mxcsr &= KMP_X86_MXCSR_MASK;
1078 
1079  // There is no point looking at t_fp_control_saved here.
1080  // If it is TRUE, we still have to update the values if they are different
1081  // from those we now have. If it is FALSE we didn't save anything yet, but
1082  // our objective is the same. We have to ensure that the values in the team
1083  // are the same as those we have.
1084  // So, this code achieves what we need whether or not t_fp_control_saved is
1085  // true. By checking whether the value needs updating we avoid unnecessary
1086  // writes that would put the cache-line into a written state, causing all
1087  // threads in the team to have to read it again.
1088  KMP_CHECK_UPDATE(team->t.t_x87_fpu_control_word, x87_fpu_control_word);
1089  KMP_CHECK_UPDATE(team->t.t_mxcsr, mxcsr);
1090  // Although we don't use this value, other code in the runtime wants to know
1091  // whether it should restore them. So we must ensure it is correct.
1092  KMP_CHECK_UPDATE(team->t.t_fp_control_saved, TRUE);
1093  } else {
1094  // Similarly here. Don't write to this cache-line in the team structure
1095  // unless we have to.
1096  KMP_CHECK_UPDATE(team->t.t_fp_control_saved, FALSE);
1097  }
1098 }
1099 
1100 // Do the opposite, setting the hardware registers to the updated values from
1101 // the team.
1102 inline static void updateHWFPControl(kmp_team_t *team) {
1103  if (__kmp_inherit_fp_control && team->t.t_fp_control_saved) {
1104  // Only reset the fp control regs if they have been changed in the team
1105  // by the parallel region that we are exiting.
1106  kmp_int16 x87_fpu_control_word;
1107  kmp_uint32 mxcsr;
1108  __kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
1109  __kmp_store_mxcsr(&mxcsr);
1110  mxcsr &= KMP_X86_MXCSR_MASK;
1111 
1112  if (team->t.t_x87_fpu_control_word != x87_fpu_control_word) {
1113  __kmp_clear_x87_fpu_status_word();
1114  __kmp_load_x87_fpu_control_word(&team->t.t_x87_fpu_control_word);
1115  }
1116 
1117  if (team->t.t_mxcsr != mxcsr) {
1118  __kmp_load_mxcsr(&team->t.t_mxcsr);
1119  }
1120  }
1121 }
1122 #else
1123 #define propagateFPControl(x) ((void)0)
1124 #define updateHWFPControl(x) ((void)0)
1125 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
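// In short: propagateFPControl() snapshots the master's x87 control word and
// MXCSR into the team before the region (KMP_CHECK_UPDATE avoids dirtying the
// cache line when nothing changed), and updateHWFPControl() restores them to
// the hardware on exit only if the team's copies differ from the current
// register values. On other architectures both are compiled to no-ops.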
1126 
1127 static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team,
1128  int realloc); // forward declaration
1129 
1130 /* Run a parallel region that has been serialized, so it runs only in a team
1131  of the single master thread. */
1132 void __kmp_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
1133  kmp_info_t *this_thr;
1134  kmp_team_t *serial_team;
1135 
1136  KC_TRACE(10, ("__kmpc_serialized_parallel: called by T#%d\n", global_tid));
1137 
1138  /* Skip all this code for autopar serialized loops since it results in
1139  unacceptable overhead */
1140  if (loc != NULL && (loc->flags & KMP_IDENT_AUTOPAR))
1141  return;
1142 
1143  if (!TCR_4(__kmp_init_parallel))
1144  __kmp_parallel_initialize();
1145  __kmp_resume_if_soft_paused();
1146 
1147  this_thr = __kmp_threads[global_tid];
1148  serial_team = this_thr->th.th_serial_team;
1149 
1150  /* utilize the serialized team held by this thread */
1151  KMP_DEBUG_ASSERT(serial_team);
1152  KMP_MB();
1153 
1154  if (__kmp_tasking_mode != tskm_immediate_exec) {
1155  KMP_DEBUG_ASSERT(
1156  this_thr->th.th_task_team ==
1157  this_thr->th.th_team->t.t_task_team[this_thr->th.th_task_state]);
1158  KMP_DEBUG_ASSERT(serial_team->t.t_task_team[this_thr->th.th_task_state] ==
1159  NULL);
1160  KA_TRACE(20, ("__kmpc_serialized_parallel: T#%d pushing task_team %p / "
1161  "team %p, new task_team = NULL\n",
1162  global_tid, this_thr->th.th_task_team, this_thr->th.th_team));
1163  this_thr->th.th_task_team = NULL;
1164  }
1165 
1166  kmp_proc_bind_t proc_bind = this_thr->th.th_set_proc_bind;
1167  if (this_thr->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
1168  proc_bind = proc_bind_false;
1169  } else if (proc_bind == proc_bind_default) {
1170  // No proc_bind clause was specified, so use the current value
1171  // of proc-bind-var for this parallel region.
1172  proc_bind = this_thr->th.th_current_task->td_icvs.proc_bind;
1173  }
1174  // Reset for next parallel region
1175  this_thr->th.th_set_proc_bind = proc_bind_default;
1176 
1177 #if OMPT_SUPPORT
1178  ompt_data_t ompt_parallel_data = ompt_data_none;
1179  ompt_data_t *implicit_task_data;
1180  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(global_tid);
1181  if (ompt_enabled.enabled &&
1182  this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
1183 
1184  ompt_task_info_t *parent_task_info;
1185  parent_task_info = OMPT_CUR_TASK_INFO(this_thr);
1186 
1187  parent_task_info->frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1188  if (ompt_enabled.ompt_callback_parallel_begin) {
1189  int team_size = 1;
1190 
1191  ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)(
1192  &(parent_task_info->task_data), &(parent_task_info->frame),
1193  &ompt_parallel_data, team_size,
1194  ompt_parallel_invoker_program | ompt_parallel_team, codeptr);
1195  }
1196  }
1197 #endif // OMPT_SUPPORT
1198 
1199  if (this_thr->th.th_team != serial_team) {
1200  // Nested level will be an index in the nested nthreads array
1201  int level = this_thr->th.th_team->t.t_level;
1202 
1203  if (serial_team->t.t_serialized) {
1204  /* this serial team was already used
1205  TODO: increase performance by making these locks more specific */
1206  kmp_team_t *new_team;
1207 
1208  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
1209 
1210  new_team =
1211  __kmp_allocate_team(this_thr->th.th_root, 1, 1,
1212 #if OMPT_SUPPORT
1213  ompt_parallel_data,
1214 #endif
1215  proc_bind, &this_thr->th.th_current_task->td_icvs,
1216  0 USE_NESTED_HOT_ARG(NULL));
1217  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
1218  KMP_ASSERT(new_team);
1219 
1220  /* setup new serialized team and install it */
1221  new_team->t.t_threads[0] = this_thr;
1222  new_team->t.t_parent = this_thr->th.th_team;
1223  serial_team = new_team;
1224  this_thr->th.th_serial_team = serial_team;
1225 
1226  KF_TRACE(
1227  10,
1228  ("__kmpc_serialized_parallel: T#%d allocated new serial team %p\n",
1229  global_tid, serial_team));
1230 
1231  /* TODO the above breaks the requirement that if we run out of resources,
1232  then we can still guarantee that serialized teams are ok, since we may
1233  need to allocate a new one */
1234  } else {
1235  KF_TRACE(
1236  10,
1237  ("__kmpc_serialized_parallel: T#%d reusing cached serial team %p\n",
1238  global_tid, serial_team));
1239  }
1240 
1241  /* we have to initialize this serial team */
1242  KMP_DEBUG_ASSERT(serial_team->t.t_threads);
1243  KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
1244  KMP_DEBUG_ASSERT(this_thr->th.th_team != serial_team);
1245  serial_team->t.t_ident = loc;
1246  serial_team->t.t_serialized = 1;
1247  serial_team->t.t_nproc = 1;
1248  serial_team->t.t_parent = this_thr->th.th_team;
1249  serial_team->t.t_sched.sched = this_thr->th.th_team->t.t_sched.sched;
1250  this_thr->th.th_team = serial_team;
1251  serial_team->t.t_master_tid = this_thr->th.th_info.ds.ds_tid;
1252 
1253  KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d curtask=%p\n", global_tid,
1254  this_thr->th.th_current_task));
1255  KMP_ASSERT(this_thr->th.th_current_task->td_flags.executing == 1);
1256  this_thr->th.th_current_task->td_flags.executing = 0;
1257 
1258  __kmp_push_current_task_to_thread(this_thr, serial_team, 0);
1259 
1260  /* TODO: GEH: do ICVs work for nested serialized teams? Don't we need an
1261  implicit task for each serialized task represented by
1262  team->t.t_serialized? */
1263  copy_icvs(&this_thr->th.th_current_task->td_icvs,
1264  &this_thr->th.th_current_task->td_parent->td_icvs);
1265 
1266  // Thread value exists in the nested nthreads array for the next nested
1267  // level
1268  if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) {
1269  this_thr->th.th_current_task->td_icvs.nproc =
1270  __kmp_nested_nth.nth[level + 1];
1271  }
1272 
1273  if (__kmp_nested_proc_bind.used &&
1274  (level + 1 < __kmp_nested_proc_bind.used)) {
1275  this_thr->th.th_current_task->td_icvs.proc_bind =
1276  __kmp_nested_proc_bind.bind_types[level + 1];
1277  }
1278 
1279 #if USE_DEBUGGER
1280  serial_team->t.t_pkfn = (microtask_t)(~0); // For the debugger.
1281 #endif
1282  this_thr->th.th_info.ds.ds_tid = 0;
1283 
1284  /* set thread cache values */
1285  this_thr->th.th_team_nproc = 1;
1286  this_thr->th.th_team_master = this_thr;
1287  this_thr->th.th_team_serialized = 1;
1288 
1289  serial_team->t.t_level = serial_team->t.t_parent->t.t_level + 1;
1290  serial_team->t.t_active_level = serial_team->t.t_parent->t.t_active_level;
1291  serial_team->t.t_def_allocator = this_thr->th.th_def_allocator; // save
1292 
1293  propagateFPControl(serial_team);
1294 
1295  /* check if we need to allocate dispatch buffers stack */
1296  KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
1297  if (!serial_team->t.t_dispatch->th_disp_buffer) {
1298  serial_team->t.t_dispatch->th_disp_buffer =
1299  (dispatch_private_info_t *)__kmp_allocate(
1300  sizeof(dispatch_private_info_t));
1301  }
1302  this_thr->th.th_dispatch = serial_team->t.t_dispatch;
1303 
1304  KMP_MB();
1305 
1306  } else {
1307  /* this serialized team is already being used,
1308  * that's fine, just add another nested level */
1309  KMP_DEBUG_ASSERT(this_thr->th.th_team == serial_team);
1310  KMP_DEBUG_ASSERT(serial_team->t.t_threads);
1311  KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
1312  ++serial_team->t.t_serialized;
1313  this_thr->th.th_team_serialized = serial_team->t.t_serialized;
1314 
1315  // Nested level will be an index in the nested nthreads array
1316  int level = this_thr->th.th_team->t.t_level;
1317  // Thread value exists in the nested nthreads array for the next nested
1318  // level
1319  if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) {
1320  this_thr->th.th_current_task->td_icvs.nproc =
1321  __kmp_nested_nth.nth[level + 1];
1322  }
1323  serial_team->t.t_level++;
1324  KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d increasing nesting level "
1325  "of serial team %p to %d\n",
1326  global_tid, serial_team, serial_team->t.t_level));
1327 
1328  /* allocate/push dispatch buffers stack */
1329  KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
1330  {
1331  dispatch_private_info_t *disp_buffer =
1332  (dispatch_private_info_t *)__kmp_allocate(
1333  sizeof(dispatch_private_info_t));
1334  disp_buffer->next = serial_team->t.t_dispatch->th_disp_buffer;
1335  serial_team->t.t_dispatch->th_disp_buffer = disp_buffer;
1336  }
1337  this_thr->th.th_dispatch = serial_team->t.t_dispatch;
1338 
1339  KMP_MB();
1340  }
1341  KMP_CHECK_UPDATE(serial_team->t.t_cancel_request, cancel_noreq);
1342 
1343  // Perform the display affinity functionality for
1344  // serialized parallel regions
1345  if (__kmp_display_affinity) {
1346  if (this_thr->th.th_prev_level != serial_team->t.t_level ||
1347  this_thr->th.th_prev_num_threads != 1) {
1348  // NULL means use the affinity-format-var ICV
1349  __kmp_aux_display_affinity(global_tid, NULL);
1350  this_thr->th.th_prev_level = serial_team->t.t_level;
1351  this_thr->th.th_prev_num_threads = 1;
1352  }
1353  }
1354 
1355  if (__kmp_env_consistency_check)
1356  __kmp_push_parallel(global_tid, NULL);
1357 #if OMPT_SUPPORT
1358  serial_team->t.ompt_team_info.master_return_address = codeptr;
1359  if (ompt_enabled.enabled &&
1360  this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
1361  OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1362 
1363  ompt_lw_taskteam_t lw_taskteam;
1364  __ompt_lw_taskteam_init(&lw_taskteam, this_thr, global_tid,
1365  &ompt_parallel_data, codeptr);
1366 
1367  __ompt_lw_taskteam_link(&lw_taskteam, this_thr, 1);
1368  // don't use lw_taskteam after linking. Its content was swapped.
1369 
1370  /* OMPT implicit task begin */
1371  implicit_task_data = OMPT_CUR_TASK_DATA(this_thr);
1372  if (ompt_enabled.ompt_callback_implicit_task) {
1373  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1374  ompt_scope_begin, OMPT_CUR_TEAM_DATA(this_thr),
1375  OMPT_CUR_TASK_DATA(this_thr), 1, __kmp_tid_from_gtid(global_tid), ompt_task_implicit); // TODO: Can this be ompt_task_initial?
1376  OMPT_CUR_TASK_INFO(this_thr)
1377  ->thread_num = __kmp_tid_from_gtid(global_tid);
1378  }
1379 
1380  /* OMPT state */
1381  this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
1382  OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1383  }
1384 #endif
1385 }
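/* Note: a serialized "team" is a real kmp_team_t with t_nproc == 1; entering
   the construct again while already serialized only increments t_serialized
   and pushes another dispatch buffer, so the matching end-of-serialized-
   parallel path can unwind one nesting level at a time. */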
1386 
1387 /* most of the work for a fork */
1388 /* return true if we really went parallel, false if serialized */
1389 int __kmp_fork_call(ident_t *loc, int gtid,
1390  enum fork_context_e call_context, // Intel, GNU, ...
1391  kmp_int32 argc, microtask_t microtask, launch_t invoker,
1392  kmp_va_list ap) {
1393  void **argv;
1394  int i;
1395  int master_tid;
1396  int master_this_cons;
1397  kmp_team_t *team;
1398  kmp_team_t *parent_team;
1399  kmp_info_t *master_th;
1400  kmp_root_t *root;
1401  int nthreads;
1402  int master_active;
1403  int master_set_numthreads;
1404  int level;
1405  int active_level;
1406  int teams_level;
1407 #if KMP_NESTED_HOT_TEAMS
1408  kmp_hot_team_ptr_t **p_hot_teams;
1409 #endif
1410  { // KMP_TIME_BLOCK
1411  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_fork_call);
1412  KMP_COUNT_VALUE(OMP_PARALLEL_args, argc);
1413 
1414  KA_TRACE(20, ("__kmp_fork_call: enter T#%d\n", gtid));
1415  if (__kmp_stkpadding > 0 && __kmp_root[gtid] != NULL) {
1416  /* Some systems prefer the stack for the root thread(s) to start with */
1417  /* some gap from the parent stack to prevent false sharing. */
1418  void *dummy = KMP_ALLOCA(__kmp_stkpadding);
1419  /* These 2 lines below are so this does not get optimized out */
1420  if (__kmp_stkpadding > KMP_MAX_STKPADDING)
1421  __kmp_stkpadding += (short)((kmp_int64)dummy);
1422  }
1423 
1424  /* initialize if needed */
1425  KMP_DEBUG_ASSERT(
1426  __kmp_init_serial); // AC: potentially unsafe, not in sync with shutdown
1427  if (!TCR_4(__kmp_init_parallel))
1428  __kmp_parallel_initialize();
1429  __kmp_resume_if_soft_paused();
1430 
1431  /* setup current data */
1432  master_th = __kmp_threads[gtid]; // AC: potentially unsafe, not in sync with
1433  // shutdown
1434  parent_team = master_th->th.th_team;
1435  master_tid = master_th->th.th_info.ds.ds_tid;
1436  master_this_cons = master_th->th.th_local.this_construct;
1437  root = master_th->th.th_root;
1438  master_active = root->r.r_active;
1439  master_set_numthreads = master_th->th.th_set_nproc;
1440 
1441 #if OMPT_SUPPORT
1442  ompt_data_t ompt_parallel_data = ompt_data_none;
1443  ompt_data_t *parent_task_data;
1444  ompt_frame_t *ompt_frame;
1445  ompt_data_t *implicit_task_data;
1446  void *return_address = NULL;
1447 
1448  if (ompt_enabled.enabled) {
1449  __ompt_get_task_info_internal(0, NULL, &parent_task_data, &ompt_frame,
1450  NULL, NULL);
1451  return_address = OMPT_LOAD_RETURN_ADDRESS(gtid);
1452  }
1453 #endif
1454 
1455  // Nested level will be an index in the nested nthreads array
1456  level = parent_team->t.t_level;
1457  // used to launch non-serial teams even if nesting is not allowed
1458  active_level = parent_team->t.t_active_level;
1459  // needed to check nesting inside the teams
1460  teams_level = master_th->th.th_teams_level;
1461 #if KMP_NESTED_HOT_TEAMS
1462  p_hot_teams = &master_th->th.th_hot_teams;
1463  if (*p_hot_teams == NULL && __kmp_hot_teams_max_level > 0) {
1464  *p_hot_teams = (kmp_hot_team_ptr_t *)__kmp_allocate(
1465  sizeof(kmp_hot_team_ptr_t) * __kmp_hot_teams_max_level);
1466  (*p_hot_teams)[0].hot_team = root->r.r_hot_team;
1467  // it is either the actual hot team or not needed (when active_level > 0)
1468  (*p_hot_teams)[0].hot_team_nth = 1;
1469  }
1470 #endif
1471 
1472 #if OMPT_SUPPORT
1473  if (ompt_enabled.enabled) {
1474  if (ompt_enabled.ompt_callback_parallel_begin) {
1475  int team_size = master_set_numthreads
1476  ? master_set_numthreads
1477  : get__nproc_2(parent_team, master_tid);
1478  int flags = OMPT_INVOKER(call_context) |
1479  ((microtask == (microtask_t)__kmp_teams_master)
1480  ? ompt_parallel_league
1481  : ompt_parallel_team);
1482  ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)(
1483  parent_task_data, ompt_frame, &ompt_parallel_data, team_size, flags,
1484  return_address);
1485  }
1486  master_th->th.ompt_thread_info.state = ompt_state_overhead;
1487  }
1488 #endif
1489 
1490  master_th->th.th_ident = loc;
1491 
1492  if (master_th->th.th_teams_microtask && ap &&
1493  microtask != (microtask_t)__kmp_teams_master && level == teams_level) {
1494  // AC: This is the start of a parallel region nested inside a teams construct.
1495  // The team is actual (hot); all workers are ready at the fork barrier.
1496  // No lock is needed to initialize the team a bit, then free the workers.
1497  parent_team->t.t_ident = loc;
1498  __kmp_alloc_argv_entries(argc, parent_team, TRUE);
1499  parent_team->t.t_argc = argc;
1500  argv = (void **)parent_team->t.t_argv;
1501  for (i = argc - 1; i >= 0; --i)
1502  *argv++ = va_arg(kmp_va_deref(ap), void *);
1503  // Increment our nested depth levels, but do not increase the serialization
1504  if (parent_team == master_th->th.th_serial_team) {
1505  // AC: we are in serialized parallel
1506  __kmpc_serialized_parallel(loc, gtid);
1507  KMP_DEBUG_ASSERT(parent_team->t.t_serialized > 1);
1508 
1509  if (call_context == fork_context_gnu) {
1510  // AC: need to decrement t_serialized for enquiry functions to work
1511  // correctly, will restore at join time
1512  parent_team->t.t_serialized--;
1513  return TRUE;
1514  }
1515 
1516 #if OMPT_SUPPORT
1517  void *dummy;
1518  void **exit_frame_p;
1519 
1520  ompt_lw_taskteam_t lw_taskteam;
1521 
1522  if (ompt_enabled.enabled) {
1523  __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1524  &ompt_parallel_data, return_address);
1525  exit_frame_p = &(lw_taskteam.ompt_task_info.frame.exit_frame.ptr);
1526 
1527  __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1528  // don't use lw_taskteam after linking. Its content was swapped.
1529 
1530  /* OMPT implicit task begin */
1531  implicit_task_data = OMPT_CUR_TASK_DATA(master_th);
1532  if (ompt_enabled.ompt_callback_implicit_task) {
1533  OMPT_CUR_TASK_INFO(master_th)
1534  ->thread_num = __kmp_tid_from_gtid(gtid);
1535  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1536  ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1537  implicit_task_data, 1,
1538  OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1539  }
1540 
1541  /* OMPT state */
1542  master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1543  } else {
1544  exit_frame_p = &dummy;
1545  }
1546 #endif
1547  // AC: need to decrement t_serialized for enquiry functions to work
1548  // correctly, will restore at join time
1549  parent_team->t.t_serialized--;
1550 
1551  {
1552  KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1553  KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1554  __kmp_invoke_microtask(microtask, gtid, 0, argc, parent_team->t.t_argv
1555 #if OMPT_SUPPORT
1556  ,
1557  exit_frame_p
1558 #endif
1559  );
1560  }
1561 
1562 #if OMPT_SUPPORT
1563  if (ompt_enabled.enabled) {
1564  *exit_frame_p = NULL;
1565  OMPT_CUR_TASK_INFO(master_th)->frame.exit_frame = ompt_data_none;
1566  if (ompt_enabled.ompt_callback_implicit_task) {
1567  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1568  ompt_scope_end, NULL, implicit_task_data, 1,
1569  OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1570  }
1571  ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1572  __ompt_lw_taskteam_unlink(master_th);
1573  if (ompt_enabled.ompt_callback_parallel_end) {
1574  ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1575  &ompt_parallel_data, OMPT_CUR_TASK_DATA(master_th),
1576  OMPT_INVOKER(call_context) | ompt_parallel_team,
1577  return_address);
1578  }
1579  master_th->th.ompt_thread_info.state = ompt_state_overhead;
1580  }
1581 #endif
1582  return TRUE;
1583  }
1584 
1585  parent_team->t.t_pkfn = microtask;
1586  parent_team->t.t_invoke = invoker;
1587  KMP_ATOMIC_INC(&root->r.r_in_parallel);
1588  parent_team->t.t_active_level++;
1589  parent_team->t.t_level++;
1590  parent_team->t.t_def_allocator = master_th->th.th_def_allocator; // save
1591 
1592 #if OMPT_SUPPORT
1593  if (ompt_enabled.enabled) {
1594  ompt_lw_taskteam_t lw_taskteam;
1595  __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1596  &ompt_parallel_data, return_address);
1597  __ompt_lw_taskteam_link(&lw_taskteam, master_th, 1, true);
1598  }
1599 #endif
1600 
1601  /* Change number of threads in the team if requested */
1602  if (master_set_numthreads) { // The parallel has num_threads clause
1603  if (master_set_numthreads < master_th->th.th_teams_size.nth) {
1604  // AC: can only reduce the number of threads dynamically, cannot increase
1605  kmp_info_t **other_threads = parent_team->t.t_threads;
1606  parent_team->t.t_nproc = master_set_numthreads;
1607  for (i = 0; i < master_set_numthreads; ++i) {
1608  other_threads[i]->th.th_team_nproc = master_set_numthreads;
1609  }
1610  // Keep extra threads hot in the team for possible next parallels
1611  }
1612  master_th->th.th_set_nproc = 0;
1613  }
1614 
1615 #if USE_DEBUGGER
1616  if (__kmp_debugging) { // Let debugger override number of threads.
1617  int nth = __kmp_omp_num_threads(loc);
1618  if (nth > 0) { // 0 means debugger doesn't want to change num threads
1619  master_set_numthreads = nth;
1620  }
1621  }
1622 #endif
1623 
1624 #if USE_ITT_BUILD
1625  if (((__itt_frame_submit_v3_ptr && __itt_get_timestamp_ptr) ||
1626  KMP_ITT_DEBUG) &&
1627  __kmp_forkjoin_frames_mode == 3 &&
1628  parent_team->t.t_active_level == 1 // only report frames at level 1
1629  && master_th->th.th_teams_size.nteams == 1) {
1630  kmp_uint64 tmp_time = __itt_get_timestamp();
1631  master_th->th.th_frame_time = tmp_time;
1632  parent_team->t.t_region_time = tmp_time;
1633  }
1634  if (__itt_stack_caller_create_ptr) {
1635  // create new stack stitching id before entering fork barrier
1636  parent_team->t.t_stack_id = __kmp_itt_stack_caller_create();
1637  }
1638 #endif /* USE_ITT_BUILD */
1639 
1640  KF_TRACE(10, ("__kmp_fork_call: before internal fork: root=%p, team=%p, "
1641  "master_th=%p, gtid=%d\n",
1642  root, parent_team, master_th, gtid));
1643  __kmp_internal_fork(loc, gtid, parent_team);
1644  KF_TRACE(10, ("__kmp_fork_call: after internal fork: root=%p, team=%p, "
1645  "master_th=%p, gtid=%d\n",
1646  root, parent_team, master_th, gtid));
1647 
1648  if (call_context == fork_context_gnu)
1649  return TRUE;
1650 
1651  /* Invoke microtask for MASTER thread */
1652  KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid,
1653  parent_team->t.t_id, parent_team->t.t_pkfn));
1654 
1655  if (!parent_team->t.t_invoke(gtid)) {
1656  KMP_ASSERT2(0, "cannot invoke microtask for MASTER thread");
1657  }
1658  KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid,
1659  parent_team->t.t_id, parent_team->t.t_pkfn));
1660  KMP_MB(); /* Flush all pending memory write invalidates. */
1661 
1662  KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
1663 
1664  return TRUE;
1665  } // Parallel closely nested in teams construct
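/* Illustrative sketch of user code that reaches the branch above: a parallel
   region immediately nested inside a teams construct, where the existing (hot)
   parent team is reused and only its size and nesting levels are adjusted.
   work() is a placeholder.

     #pragma omp teams num_teams(2) thread_limit(8)
     {
       #pragma omp parallel num_threads(4)   // level == teams_level here
       { work(); }
     }
*/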
1666 
1667 #if KMP_DEBUG
1668  if (__kmp_tasking_mode != tskm_immediate_exec) {
1669  KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
1670  parent_team->t.t_task_team[master_th->th.th_task_state]);
1671  }
1672 #endif
1673 
1674  if (parent_team->t.t_active_level >=
1675  master_th->th.th_current_task->td_icvs.max_active_levels) {
1676  nthreads = 1;
1677  } else {
1678  int enter_teams = ((ap == NULL && active_level == 0) ||
1679  (ap && teams_level > 0 && teams_level == level));
1680  nthreads =
1681  master_set_numthreads
1682  ? master_set_numthreads
1683  : get__nproc_2(
1684  parent_team,
1685  master_tid); // TODO: get nproc directly from current task
1686 
1687  // Check if we need to take the forkjoin lock (no need for a serialized
1688  // parallel outside of a teams construct). This code was moved here from
1689  // __kmp_reserve_threads() to speed up nested serialized parallels.
1690  if (nthreads > 1) {
1691  if ((get__max_active_levels(master_th) == 1 &&
1692  (root->r.r_in_parallel && !enter_teams)) ||
1693  (__kmp_library == library_serial)) {
1694  KC_TRACE(10, ("__kmp_fork_call: T#%d serializing team; requested %d"
1695  " threads\n",
1696  gtid, nthreads));
1697  nthreads = 1;
1698  }
1699  }
1700  if (nthreads > 1) {
1701  /* determine how many new threads we can use */
1702  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
1703  /* AC: If we execute teams from parallel region (on host), then teams
1704  should be created but each can only have 1 thread if nesting is
1705  disabled. If teams called from serial region, then teams and their
1706  threads should be created regardless of the nesting setting. */
1707  nthreads = __kmp_reserve_threads(root, parent_team, master_tid,
1708  nthreads, enter_teams);
1709  if (nthreads == 1) {
1710  // Free lock for single thread execution here; for multi-thread
1711  // execution it will be freed later after team of threads created
1712  // and initialized
1713  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
1714  }
1715  }
1716  }
1717  KMP_DEBUG_ASSERT(nthreads > 0);
1718 
1719  // If we temporarily changed the set number of threads then restore it now
1720  master_th->th.th_set_nproc = 0;
1721 
1722  /* create a serialized parallel region? */
1723  if (nthreads == 1) {
1724 /* josh todo: hypothetical question: what do we do for OS X*? */
1725 #if KMP_OS_LINUX && \
1726  (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
1727  void *args[argc];
1728 #else
1729  void **args = (void **)KMP_ALLOCA(argc * sizeof(void *));
1730 #endif /* KMP_OS_LINUX && ( KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || \
1731  KMP_ARCH_AARCH64) */
1732 
1733  KA_TRACE(20,
1734  ("__kmp_fork_call: T#%d serializing parallel region\n", gtid));
1735 
1736  __kmpc_serialized_parallel(loc, gtid);
1737 
1738  if (call_context == fork_context_intel) {
1739  /* TODO this sucks, use the compiler itself to pass args! :) */
1740  master_th->th.th_serial_team->t.t_ident = loc;
1741  if (!ap) {
1742  // revert change made in __kmpc_serialized_parallel()
1743  master_th->th.th_serial_team->t.t_level--;
1744 // Get args from parent team for teams construct
1745 
1746 #if OMPT_SUPPORT
1747  void *dummy;
1748  void **exit_frame_p;
1749  ompt_task_info_t *task_info;
1750 
1751  ompt_lw_taskteam_t lw_taskteam;
1752 
1753  if (ompt_enabled.enabled) {
1754  __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1755  &ompt_parallel_data, return_address);
1756 
1757  __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1758  // don't use lw_taskteam after linking. content was swapped
1759 
1760  task_info = OMPT_CUR_TASK_INFO(master_th);
1761  exit_frame_p = &(task_info->frame.exit_frame.ptr);
1762  if (ompt_enabled.ompt_callback_implicit_task) {
1763  OMPT_CUR_TASK_INFO(master_th)
1764  ->thread_num = __kmp_tid_from_gtid(gtid);
1765  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1766  ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1767  &(task_info->task_data), 1,
1768  OMPT_CUR_TASK_INFO(master_th)->thread_num,
1769  ompt_task_implicit);
1770  }
1771 
1772  /* OMPT state */
1773  master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1774  } else {
1775  exit_frame_p = &dummy;
1776  }
1777 #endif
1778 
1779  {
1780  KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1781  KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1782  __kmp_invoke_microtask(microtask, gtid, 0, argc,
1783  parent_team->t.t_argv
1784 #if OMPT_SUPPORT
1785  ,
1786  exit_frame_p
1787 #endif
1788  );
1789  }
1790 
1791 #if OMPT_SUPPORT
1792  if (ompt_enabled.enabled) {
1793  *exit_frame_p = NULL;
1794  if (ompt_enabled.ompt_callback_implicit_task) {
1795  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1796  ompt_scope_end, NULL, &(task_info->task_data), 1,
1797  OMPT_CUR_TASK_INFO(master_th)->thread_num,
1798  ompt_task_implicit);
1799  }
1800  ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1801  __ompt_lw_taskteam_unlink(master_th);
1802  if (ompt_enabled.ompt_callback_parallel_end) {
1803  ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1804  &ompt_parallel_data, parent_task_data,
1805  OMPT_INVOKER(call_context) | ompt_parallel_team,
1806  return_address);
1807  }
1808  master_th->th.ompt_thread_info.state = ompt_state_overhead;
1809  }
1810 #endif
1811  } else if (microtask == (microtask_t)__kmp_teams_master) {
1812  KMP_DEBUG_ASSERT(master_th->th.th_team ==
1813  master_th->th.th_serial_team);
1814  team = master_th->th.th_team;
1815  // team->t.t_pkfn = microtask;
1816  team->t.t_invoke = invoker;
1817  __kmp_alloc_argv_entries(argc, team, TRUE);
1818  team->t.t_argc = argc;
1819  argv = (void **)team->t.t_argv;
1820  if (ap) {
1821  for (i = argc - 1; i >= 0; --i)
1822  *argv++ = va_arg(kmp_va_deref(ap), void *);
1823  } else {
1824  for (i = 0; i < argc; ++i)
1825  // Get args from parent team for teams construct
1826  argv[i] = parent_team->t.t_argv[i];
1827  }
1828  // AC: revert change made in __kmpc_serialized_parallel()
1829  // because initial code in teams should have level=0
1830  team->t.t_level--;
1831  // AC: call special invoker for outer "parallel" of teams construct
1832  invoker(gtid);
1833 #if OMPT_SUPPORT
1834  if (ompt_enabled.enabled) {
1835  ompt_task_info_t *task_info = OMPT_CUR_TASK_INFO(master_th);
1836  if (ompt_enabled.ompt_callback_implicit_task) {
1837  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1838  ompt_scope_end, NULL, &(task_info->task_data), 0,
1839  OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_initial);
1840  }
1841  if (ompt_enabled.ompt_callback_parallel_end) {
1842  ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1843  &ompt_parallel_data, parent_task_data,
1844  OMPT_INVOKER(call_context) | ompt_parallel_league,
1845  return_address);
1846  }
1847  master_th->th.ompt_thread_info.state = ompt_state_overhead;
1848  }
1849 #endif
1850  } else {
1851  argv = args;
1852  for (i = argc - 1; i >= 0; --i)
1853  *argv++ = va_arg(kmp_va_deref(ap), void *);
1854  KMP_MB();
1855 
1856 #if OMPT_SUPPORT
1857  void *dummy;
1858  void **exit_frame_p;
1859  ompt_task_info_t *task_info;
1860 
1861  ompt_lw_taskteam_t lw_taskteam;
1862 
1863  if (ompt_enabled.enabled) {
1864  __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1865  &ompt_parallel_data, return_address);
1866  __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1867  // don't use lw_taskteam after linking. content was swapped
1868  task_info = OMPT_CUR_TASK_INFO(master_th);
1869  exit_frame_p = &(task_info->frame.exit_frame.ptr);
1870 
1871  /* OMPT implicit task begin */
1872  implicit_task_data = OMPT_CUR_TASK_DATA(master_th);
1873  if (ompt_enabled.ompt_callback_implicit_task) {
1874  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1875  ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1876  implicit_task_data, 1, __kmp_tid_from_gtid(gtid),
1877  ompt_task_implicit);
1878  OMPT_CUR_TASK_INFO(master_th)
1879  ->thread_num = __kmp_tid_from_gtid(gtid);
1880  }
1881 
1882  /* OMPT state */
1883  master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1884  } else {
1885  exit_frame_p = &dummy;
1886  }
1887 #endif
1888 
1889  {
1890  KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1891  KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1892  __kmp_invoke_microtask(microtask, gtid, 0, argc, args
1893 #if OMPT_SUPPORT
1894  ,
1895  exit_frame_p
1896 #endif
1897  );
1898  }
1899 
1900 #if OMPT_SUPPORT
1901  if (ompt_enabled.enabled) {
1902  *exit_frame_p = NULL;
1903  if (ompt_enabled.ompt_callback_implicit_task) {
1904  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1905  ompt_scope_end, NULL, &(task_info->task_data), 1,
1906  OMPT_CUR_TASK_INFO(master_th)->thread_num,
1907  ompt_task_implicit);
1908  }
1909 
1910  ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1911  __ompt_lw_taskteam_unlink(master_th);
1912  if (ompt_enabled.ompt_callback_parallel_end) {
1913  ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1914  &ompt_parallel_data, parent_task_data,
1915  OMPT_INVOKER(call_context) | ompt_parallel_team,
1916  return_address);
1917  }
1918  master_th->th.ompt_thread_info.state = ompt_state_overhead;
1919  }
1920 #endif
1921  }
1922  } else if (call_context == fork_context_gnu) {
1923 #if OMPT_SUPPORT
1924  ompt_lw_taskteam_t lwt;
1925  __ompt_lw_taskteam_init(&lwt, master_th, gtid, &ompt_parallel_data,
1926  return_address);
1927 
1928  lwt.ompt_task_info.frame.exit_frame = ompt_data_none;
1929  __ompt_lw_taskteam_link(&lwt, master_th, 1);
1930 // don't use lw_taskteam after linking. content was swapped
1931 #endif
1932 
1933  // we were called from GNU native code
1934  KA_TRACE(20, ("__kmp_fork_call: T#%d serial exit\n", gtid));
1935  return FALSE;
1936  } else {
1937  KMP_ASSERT2(call_context < fork_context_last,
1938  "__kmp_fork_call: unknown fork_context parameter");
1939  }
1940 
1941  KA_TRACE(20, ("__kmp_fork_call: T#%d serial exit\n", gtid));
1942  KMP_MB();
1943  return FALSE;
1944  } // if (nthreads == 1)
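/* Illustrative sketch of regions that typically take the serialized
   (nthreads == 1) path above; not exhaustive, work() is a placeholder.

     #pragma omp parallel num_threads(1)   // explicit request for one thread
     { work(); }

     omp_set_max_active_levels(1);
     #pragma omp parallel                  // outer region is active
     #pragma omp parallel                  // inner region serialized by the
     { work(); }                           //   max_active_levels check above
*/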
1945 
1946  // GEH: only modify the executing flag in the case when not serialized
1947  // serialized case is handled in kmpc_serialized_parallel
1948  KF_TRACE(10, ("__kmp_fork_call: parent_team_aclevel=%d, master_th=%p, "
1949  "curtask=%p, curtask_max_aclevel=%d\n",
1950  parent_team->t.t_active_level, master_th,
1951  master_th->th.th_current_task,
1952  master_th->th.th_current_task->td_icvs.max_active_levels));
1953  // TODO: GEH - cannot do this assertion because root thread not set up as
1954  // executing
1955  // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 1 );
1956  master_th->th.th_current_task->td_flags.executing = 0;
1957 
1958  if (!master_th->th.th_teams_microtask || level > teams_level) {
1959  /* Increment our nested depth level */
1960  KMP_ATOMIC_INC(&root->r.r_in_parallel);
1961  }
1962 
1963  // See if we need to make a copy of the ICVs.
1964  int nthreads_icv = master_th->th.th_current_task->td_icvs.nproc;
1965  if ((level + 1 < __kmp_nested_nth.used) &&
1966  (__kmp_nested_nth.nth[level + 1] != nthreads_icv)) {
1967  nthreads_icv = __kmp_nested_nth.nth[level + 1];
1968  } else {
1969  nthreads_icv = 0; // don't update
1970  }
1971 
1972  // Figure out the proc_bind_policy for the new team.
1973  kmp_proc_bind_t proc_bind = master_th->th.th_set_proc_bind;
1974  kmp_proc_bind_t proc_bind_icv =
1975  proc_bind_default; // proc_bind_default means don't update
1976  if (master_th->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
1977  proc_bind = proc_bind_false;
1978  } else {
1979  if (proc_bind == proc_bind_default) {
1980  // No proc_bind clause specified; use current proc-bind-var for this
1981  // parallel region
1982  proc_bind = master_th->th.th_current_task->td_icvs.proc_bind;
1983  }
1984  /* else: The proc_bind policy was specified explicitly on parallel clause.
1985  This overrides proc-bind-var for this parallel region, but does not
1986  change proc-bind-var. */
1987  // Figure the value of proc-bind-var for the child threads.
1988  if ((level + 1 < __kmp_nested_proc_bind.used) &&
1989  (__kmp_nested_proc_bind.bind_types[level + 1] !=
1990  master_th->th.th_current_task->td_icvs.proc_bind)) {
1991  proc_bind_icv = __kmp_nested_proc_bind.bind_types[level + 1];
1992  }
1993  }
1994 
1995  // Reset for next parallel region
1996  master_th->th.th_set_proc_bind = proc_bind_default;
1997 
1998  if ((nthreads_icv > 0) || (proc_bind_icv != proc_bind_default)) {
1999  kmp_internal_control_t new_icvs;
2000  copy_icvs(&new_icvs, &master_th->th.th_current_task->td_icvs);
2001  new_icvs.next = NULL;
2002  if (nthreads_icv > 0) {
2003  new_icvs.nproc = nthreads_icv;
2004  }
2005  if (proc_bind_icv != proc_bind_default) {
2006  new_icvs.proc_bind = proc_bind_icv;
2007  }
2008 
2009  /* allocate a new parallel team */
2010  KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n"));
2011  team = __kmp_allocate_team(root, nthreads, nthreads,
2012 #if OMPT_SUPPORT
2013  ompt_parallel_data,
2014 #endif
2015  proc_bind, &new_icvs,
2016  argc USE_NESTED_HOT_ARG(master_th));
2017  } else {
2018  /* allocate a new parallel team */
2019  KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n"));
2020  team = __kmp_allocate_team(root, nthreads, nthreads,
2021 #if OMPT_SUPPORT
2022  ompt_parallel_data,
2023 #endif
2024  proc_bind,
2025  &master_th->th.th_current_task->td_icvs,
2026  argc USE_NESTED_HOT_ARG(master_th));
2027  }
2028  KF_TRACE(
2029  10, ("__kmp_fork_call: after __kmp_allocate_team - team = %p\n", team));
2030 
2031  /* setup the new team */
2032  KMP_CHECK_UPDATE(team->t.t_master_tid, master_tid);
2033  KMP_CHECK_UPDATE(team->t.t_master_this_cons, master_this_cons);
2034  KMP_CHECK_UPDATE(team->t.t_ident, loc);
2035  KMP_CHECK_UPDATE(team->t.t_parent, parent_team);
2036  KMP_CHECK_UPDATE_SYNC(team->t.t_pkfn, microtask);
2037 #if OMPT_SUPPORT
2038  KMP_CHECK_UPDATE_SYNC(team->t.ompt_team_info.master_return_address,
2039  return_address);
2040 #endif
2041  KMP_CHECK_UPDATE(team->t.t_invoke, invoker); // TODO move to root, maybe
2042  // TODO: parent_team->t.t_level == INT_MAX ???
2043  if (!master_th->th.th_teams_microtask || level > teams_level) {
2044  int new_level = parent_team->t.t_level + 1;
2045  KMP_CHECK_UPDATE(team->t.t_level, new_level);
2046  new_level = parent_team->t.t_active_level + 1;
2047  KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
2048  } else {
2049  // AC: Do not increase parallel level at start of the teams construct
2050  int new_level = parent_team->t.t_level;
2051  KMP_CHECK_UPDATE(team->t.t_level, new_level);
2052  new_level = parent_team->t.t_active_level;
2053  KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
2054  }
2055  kmp_r_sched_t new_sched = get__sched_2(parent_team, master_tid);
2056  // set master's schedule as new run-time schedule
2057  KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched);
2058 
2059  KMP_CHECK_UPDATE(team->t.t_cancel_request, cancel_noreq);
2060  KMP_CHECK_UPDATE(team->t.t_def_allocator, master_th->th.th_def_allocator);
2061 
2062  // Update the floating point rounding in the team if required.
2063  propagateFPControl(team);
2064 
2065  if (__kmp_tasking_mode != tskm_immediate_exec) {
2066  // Set master's task team to team's task team. Unless this is hot team, it
2067  // should be NULL.
2068  KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
2069  parent_team->t.t_task_team[master_th->th.th_task_state]);
2070  KA_TRACE(20, ("__kmp_fork_call: Master T#%d pushing task_team %p / team "
2071  "%p, new task_team %p / team %p\n",
2072  __kmp_gtid_from_thread(master_th),
2073  master_th->th.th_task_team, parent_team,
2074  team->t.t_task_team[master_th->th.th_task_state], team));
2075 
2076  if (active_level || master_th->th.th_task_team) {
2077  // Take a memo of master's task_state
2078  KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack);
2079  if (master_th->th.th_task_state_top >=
2080  master_th->th.th_task_state_stack_sz) { // increase size
2081  kmp_uint32 new_size = 2 * master_th->th.th_task_state_stack_sz;
2082  kmp_uint8 *old_stack, *new_stack;
2083  kmp_uint32 i;
2084  new_stack = (kmp_uint8 *)__kmp_allocate(new_size);
2085  for (i = 0; i < master_th->th.th_task_state_stack_sz; ++i) {
2086  new_stack[i] = master_th->th.th_task_state_memo_stack[i];
2087  }
2088  for (i = master_th->th.th_task_state_stack_sz; i < new_size;
2089  ++i) { // zero-init rest of stack
2090  new_stack[i] = 0;
2091  }
2092  old_stack = master_th->th.th_task_state_memo_stack;
2093  master_th->th.th_task_state_memo_stack = new_stack;
2094  master_th->th.th_task_state_stack_sz = new_size;
2095  __kmp_free(old_stack);
2096  }
2097  // Store master's task_state on stack
2098  master_th->th
2099  .th_task_state_memo_stack[master_th->th.th_task_state_top] =
2100  master_th->th.th_task_state;
2101  master_th->th.th_task_state_top++;
2102 #if KMP_NESTED_HOT_TEAMS
2103  if (master_th->th.th_hot_teams &&
2104  active_level < __kmp_hot_teams_max_level &&
2105  team == master_th->th.th_hot_teams[active_level].hot_team) {
2106  // Restore master's nested state if nested hot team
2107  master_th->th.th_task_state =
2108  master_th->th
2109  .th_task_state_memo_stack[master_th->th.th_task_state_top];
2110  } else {
2111 #endif
2112  master_th->th.th_task_state = 0;
2113 #if KMP_NESTED_HOT_TEAMS
2114  }
2115 #endif
2116  }
2117 #if !KMP_NESTED_HOT_TEAMS
2118  KMP_DEBUG_ASSERT((master_th->th.th_task_team == NULL) ||
2119  (team == root->r.r_hot_team));
2120 #endif
2121  }
2122 
2123  KA_TRACE(
2124  20,
2125  ("__kmp_fork_call: T#%d(%d:%d)->(%d:0) created a team of %d threads\n",
2126  gtid, parent_team->t.t_id, team->t.t_master_tid, team->t.t_id,
2127  team->t.t_nproc));
2128  KMP_DEBUG_ASSERT(team != root->r.r_hot_team ||
2129  (team->t.t_master_tid == 0 &&
2130  (team->t.t_parent == root->r.r_root_team ||
2131  team->t.t_parent->t.t_serialized)));
2132  KMP_MB();
2133 
2134  /* now, setup the arguments */
2135  argv = (void **)team->t.t_argv;
2136  if (ap) {
2137  for (i = argc - 1; i >= 0; --i) {
2138  void *new_argv = va_arg(kmp_va_deref(ap), void *);
2139  KMP_CHECK_UPDATE(*argv, new_argv);
2140  argv++;
2141  }
2142  } else {
2143  for (i = 0; i < argc; ++i) {
2144  // Get args from parent team for teams construct
2145  KMP_CHECK_UPDATE(argv[i], team->t.t_parent->t.t_argv[i]);
2146  }
2147  }
2148 
2149  /* now actually fork the threads */
2150  KMP_CHECK_UPDATE(team->t.t_master_active, master_active);
2151  if (!root->r.r_active) // Only do assignment if it prevents cache ping-pong
2152  root->r.r_active = TRUE;
2153 
2154  __kmp_fork_team_threads(root, team, master_th, gtid);
2155  __kmp_setup_icv_copy(team, nthreads,
2156  &master_th->th.th_current_task->td_icvs, loc);
2157 
2158 #if OMPT_SUPPORT
2159  master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
2160 #endif
2161 
2162  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2163 
2164 #if USE_ITT_BUILD
2165  if (team->t.t_active_level == 1 // only report frames at level 1
2166  && !master_th->th.th_teams_microtask) { // not in teams construct
2167 #if USE_ITT_NOTIFY
2168  if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
2169  (__kmp_forkjoin_frames_mode == 3 ||
2170  __kmp_forkjoin_frames_mode == 1)) {
2171  kmp_uint64 tmp_time = 0;
2172  if (__itt_get_timestamp_ptr)
2173  tmp_time = __itt_get_timestamp();
2174  // Internal fork - report frame begin
2175  master_th->th.th_frame_time = tmp_time;
2176  if (__kmp_forkjoin_frames_mode == 3)
2177  team->t.t_region_time = tmp_time;
2178  } else
2179 // only one notification scheme (either "submit" or "forking/joined", not both)
2180 #endif /* USE_ITT_NOTIFY */
2181  if ((__itt_frame_begin_v3_ptr || KMP_ITT_DEBUG) &&
2182  __kmp_forkjoin_frames && !__kmp_forkjoin_frames_mode) {
2183  // Mark start of "parallel" region for Intel(R) VTune(TM) analyzer.
2184  __kmp_itt_region_forking(gtid, team->t.t_nproc, 0);
2185  }
2186  }
2187 #endif /* USE_ITT_BUILD */
2188 
2189  /* now go on and do the work */
2190  KMP_DEBUG_ASSERT(team == __kmp_threads[gtid]->th.th_team);
2191  KMP_MB();
2192  KF_TRACE(10,
2193  ("__kmp_internal_fork : root=%p, team=%p, master_th=%p, gtid=%d\n",
2194  root, team, master_th, gtid));
2195 
2196 #if USE_ITT_BUILD
2197  if (__itt_stack_caller_create_ptr) {
2198  team->t.t_stack_id =
2199  __kmp_itt_stack_caller_create(); // create new stack stitching id
2200  // before entering fork barrier
2201  }
2202 #endif /* USE_ITT_BUILD */
2203 
2204  // AC: skip __kmp_internal_fork at teams construct, let only master
2205  // threads execute
2206  if (ap) {
2207  __kmp_internal_fork(loc, gtid, team);
2208  KF_TRACE(10, ("__kmp_internal_fork : after : root=%p, team=%p, "
2209  "master_th=%p, gtid=%d\n",
2210  root, team, master_th, gtid));
2211  }
2212 
2213  if (call_context == fork_context_gnu) {
2214  KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
2215  return TRUE;
2216  }
2217 
2218  /* Invoke microtask for MASTER thread */
2219  KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid,
2220  team->t.t_id, team->t.t_pkfn));
2221  } // END of timer KMP_fork_call block
2222 
2223 #if KMP_STATS_ENABLED
2224  // If beginning a teams construct, then change thread state
2225  stats_state_e previous_state = KMP_GET_THREAD_STATE();
2226  if (!ap) {
2227  KMP_SET_THREAD_STATE(stats_state_e::TEAMS_REGION);
2228  }
2229 #endif
2230 
2231  if (!team->t.t_invoke(gtid)) {
2232  KMP_ASSERT2(0, "cannot invoke microtask for MASTER thread");
2233  }
2234 
2235 #if KMP_STATS_ENABLED
2236  // If was beginning of a teams construct, then reset thread state
2237  if (!ap) {
2238  KMP_SET_THREAD_STATE(previous_state);
2239  }
2240 #endif
2241 
2242  KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid,
2243  team->t.t_id, team->t.t_pkfn));
2244  KMP_MB(); /* Flush all pending memory write invalidates. */
2245 
2246  KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
2247 
2248 #if OMPT_SUPPORT
2249  if (ompt_enabled.enabled) {
2250  master_th->th.ompt_thread_info.state = ompt_state_overhead;
2251  }
2252 #endif
2253 
2254  return TRUE;
2255 }
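/* Illustrative sketch of how this routine is reached: the compiler outlines
   the body of a parallel region into a microtask and calls
   __kmpc_fork_call(), which forwards to __kmp_fork_call().  For

     #pragma omp parallel shared(a)
     { a[omp_get_thread_num()] = 1; }

   the lowering is roughly

     __kmpc_fork_call(&loc, 1, (kmpc_micro)outlined_body, a);

   where "loc" and "outlined_body" are compiler-generated names used here only
   for illustration. */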
2256 
2257 #if OMPT_SUPPORT
2258 static inline void __kmp_join_restore_state(kmp_info_t *thread,
2259  kmp_team_t *team) {
2260  // restore state outside the region
2261  thread->th.ompt_thread_info.state =
2262  ((team->t.t_serialized) ? ompt_state_work_serial
2263  : ompt_state_work_parallel);
2264 }
2265 
2266 static inline void __kmp_join_ompt(int gtid, kmp_info_t *thread,
2267  kmp_team_t *team, ompt_data_t *parallel_data,
2268  int flags, void *codeptr) {
2269  ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2270  if (ompt_enabled.ompt_callback_parallel_end) {
2271  ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
2272  parallel_data, &(task_info->task_data), flags, codeptr);
2273  }
2274 
2275  task_info->frame.enter_frame = ompt_data_none;
2276  __kmp_join_restore_state(thread, team);
2277 }
2278 #endif
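/* Illustrative sketch of the tool side: a tool that registered
   ompt_callback_parallel_end receives the four arguments passed by
   __kmp_join_ompt() above.  A minimal callback might look like

     static void on_parallel_end(ompt_data_t *parallel_data,
                                 ompt_data_t *encountering_task_data,
                                 int flags, const void *codeptr_ra) {
       // e.g. stop a timer keyed by parallel_data->value
     }

   with registration done via ompt_set_callback() during tool initialization;
   the callback name here is hypothetical. */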
2279 
2280 void __kmp_join_call(ident_t *loc, int gtid
2281 #if OMPT_SUPPORT
2282  ,
2283  enum fork_context_e fork_context
2284 #endif
2285  ,
2286  int exit_teams) {
2287  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_join_call);
2288  kmp_team_t *team;
2289  kmp_team_t *parent_team;
2290  kmp_info_t *master_th;
2291  kmp_root_t *root;
2292  int master_active;
2293 
2294  KA_TRACE(20, ("__kmp_join_call: enter T#%d\n", gtid));
2295 
2296  /* setup current data */
2297  master_th = __kmp_threads[gtid];
2298  root = master_th->th.th_root;
2299  team = master_th->th.th_team;
2300  parent_team = team->t.t_parent;
2301 
2302  master_th->th.th_ident = loc;
2303 
2304 #if OMPT_SUPPORT
2305  void *team_microtask = (void *)team->t.t_pkfn;
2306  // For GOMP interface with serialized parallel, need the
2307  // __kmpc_end_serialized_parallel to call hooks for OMPT end-implicit-task
2308  // and end-parallel events.
2309  if (ompt_enabled.enabled &&
2310  !(team->t.t_serialized && fork_context == fork_context_gnu)) {
2311  master_th->th.ompt_thread_info.state = ompt_state_overhead;
2312  }
2313 #endif
2314 
2315 #if KMP_DEBUG
2316  if (__kmp_tasking_mode != tskm_immediate_exec && !exit_teams) {
2317  KA_TRACE(20, ("__kmp_join_call: T#%d, old team = %p old task_team = %p, "
2318  "th_task_team = %p\n",
2319  __kmp_gtid_from_thread(master_th), team,
2320  team->t.t_task_team[master_th->th.th_task_state],
2321  master_th->th.th_task_team));
2322  KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
2323  team->t.t_task_team[master_th->th.th_task_state]);
2324  }
2325 #endif
2326 
2327  if (team->t.t_serialized) {
2328  if (master_th->th.th_teams_microtask) {
2329  // We are in teams construct
2330  int level = team->t.t_level;
2331  int tlevel = master_th->th.th_teams_level;
2332  if (level == tlevel) {
2333  // AC: we haven't incremented it earlier at start of teams construct,
2334  // so do it here - at the end of teams construct
2335  team->t.t_level++;
2336  } else if (level == tlevel + 1) {
2337  // AC: we are exiting parallel inside teams, need to increment
2338  // serialization in order to restore it in the next call to
2339  // __kmpc_end_serialized_parallel
2340  team->t.t_serialized++;
2341  }
2342  }
2343  __kmpc_end_serialized_parallel(loc, gtid);
2344 
2345 #if OMPT_SUPPORT
2346  if (ompt_enabled.enabled) {
2347  __kmp_join_restore_state(master_th, parent_team);
2348  }
2349 #endif
2350 
2351  return;
2352  }
2353 
2354  master_active = team->t.t_master_active;
2355 
2356  if (!exit_teams) {
2357  // AC: No barrier for internal teams at exit from teams construct.
2358  // But there is barrier for external team (league).
2359  __kmp_internal_join(loc, gtid, team);
2360  } else {
2361  master_th->th.th_task_state =
2362  0; // AC: no tasking in teams (out of any parallel)
2363  }
2364 
2365  KMP_MB();
2366 
2367 #if OMPT_SUPPORT
2368  ompt_data_t *parallel_data = &(team->t.ompt_team_info.parallel_data);
2369  void *codeptr = team->t.ompt_team_info.master_return_address;
2370 #endif
2371 
2372 #if USE_ITT_BUILD
2373  if (__itt_stack_caller_create_ptr) {
2374  // destroy the stack stitching id after join barrier
2375  __kmp_itt_stack_caller_destroy((__itt_caller)team->t.t_stack_id);
2376  }
2377  // Mark end of "parallel" region for Intel(R) VTune(TM) analyzer.
2378  if (team->t.t_active_level == 1 &&
2379  (!master_th->th.th_teams_microtask || /* not in teams construct */
2380  master_th->th.th_teams_size.nteams == 1)) {
2381  master_th->th.th_ident = loc;
2382  // only one notification scheme (either "submit" or "forking/joined", not
2383  // both)
2384  if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
2385  __kmp_forkjoin_frames_mode == 3)
2386  __kmp_itt_frame_submit(gtid, team->t.t_region_time,
2387  master_th->th.th_frame_time, 0, loc,
2388  master_th->th.th_team_nproc, 1);
2389  else if ((__itt_frame_end_v3_ptr || KMP_ITT_DEBUG) &&
2390  !__kmp_forkjoin_frames_mode && __kmp_forkjoin_frames)
2391  __kmp_itt_region_joined(gtid);
2392  } // active_level == 1
2393 #endif /* USE_ITT_BUILD */
2394 
2395  if (master_th->th.th_teams_microtask && !exit_teams &&
2396  team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
2397  team->t.t_level == master_th->th.th_teams_level + 1) {
2398 // AC: We need to leave the team structure intact at the end of parallel
2399 // inside the teams construct, so that at the next parallel same (hot) team
2400 // works, only adjust nesting levels
2401 #if OMPT_SUPPORT
2402  ompt_data_t ompt_parallel_data = ompt_data_none;
2403  if (ompt_enabled.enabled) {
2404  ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2405  if (ompt_enabled.ompt_callback_implicit_task) {
2406  int ompt_team_size = team->t.t_nproc;
2407  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
2408  ompt_scope_end, NULL, &(task_info->task_data), ompt_team_size,
2409  OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
2410  }
2411  task_info->frame.exit_frame = ompt_data_none;
2412  task_info->task_data = ompt_data_none;
2413  ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
2414  __ompt_lw_taskteam_unlink(master_th);
2415  }
2416 #endif
2417  /* Decrement our nested depth level */
2418  team->t.t_level--;
2419  team->t.t_active_level--;
2420  KMP_ATOMIC_DEC(&root->r.r_in_parallel);
2421 
2422  // Restore number of threads in the team if needed. This code relies on
2423  // the proper adjustment of th_teams_size.nth after the fork in
2424  // __kmp_teams_master on each teams master in the case that
2425  // __kmp_reserve_threads reduced it.
2426  if (master_th->th.th_team_nproc < master_th->th.th_teams_size.nth) {
2427  int old_num = master_th->th.th_team_nproc;
2428  int new_num = master_th->th.th_teams_size.nth;
2429  kmp_info_t **other_threads = team->t.t_threads;
2430  team->t.t_nproc = new_num;
2431  for (int i = 0; i < old_num; ++i) {
2432  other_threads[i]->th.th_team_nproc = new_num;
2433  }
2434  // Adjust states of non-used threads of the team
2435  for (int i = old_num; i < new_num; ++i) {
2436  // Re-initialize thread's barrier data.
2437  KMP_DEBUG_ASSERT(other_threads[i]);
2438  kmp_balign_t *balign = other_threads[i]->th.th_bar;
2439  for (int b = 0; b < bs_last_barrier; ++b) {
2440  balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
2441  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
2442 #if USE_DEBUGGER
2443  balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
2444 #endif
2445  }
2446  if (__kmp_tasking_mode != tskm_immediate_exec) {
2447  // Synchronize thread's task state
2448  other_threads[i]->th.th_task_state = master_th->th.th_task_state;
2449  }
2450  }
2451  }
2452 
2453 #if OMPT_SUPPORT
2454  if (ompt_enabled.enabled) {
2455  __kmp_join_ompt(gtid, master_th, parent_team, &ompt_parallel_data,
2456  OMPT_INVOKER(fork_context) | ompt_parallel_team, codeptr);
2457  }
2458 #endif
2459 
2460  return;
2461  }
2462 
2463  /* do cleanup and restore the parent team */
2464  master_th->th.th_info.ds.ds_tid = team->t.t_master_tid;
2465  master_th->th.th_local.this_construct = team->t.t_master_this_cons;
2466 
2467  master_th->th.th_dispatch = &parent_team->t.t_dispatch[team->t.t_master_tid];
2468 
2469  /* jc: The following lock has instructions with REL and ACQ semantics,
2470  separating the parallel user code called in this parallel region
2471  from the serial user code called after this function returns. */
2472  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
2473 
2474  if (!master_th->th.th_teams_microtask ||
2475  team->t.t_level > master_th->th.th_teams_level) {
2476  /* Decrement our nested depth level */
2477  KMP_ATOMIC_DEC(&root->r.r_in_parallel);
2478  }
2479  KMP_DEBUG_ASSERT(root->r.r_in_parallel >= 0);
2480 
2481 #if OMPT_SUPPORT
2482  if (ompt_enabled.enabled) {
2483  ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2484  if (ompt_enabled.ompt_callback_implicit_task) {
2485  int flags = (team_microtask == (void *)__kmp_teams_master)
2486  ? ompt_task_initial
2487  : ompt_task_implicit;
2488  int ompt_team_size = (flags == ompt_task_initial) ? 0 : team->t.t_nproc;
2489  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
2490  ompt_scope_end, NULL, &(task_info->task_data), ompt_team_size,
2491  OMPT_CUR_TASK_INFO(master_th)->thread_num, flags);
2492  }
2493  task_info->frame.exit_frame = ompt_data_none;
2494  task_info->task_data = ompt_data_none;
2495  }
2496 #endif
2497 
2498  KF_TRACE(10, ("__kmp_join_call1: T#%d, this_thread=%p team=%p\n", 0,
2499  master_th, team));
2500  __kmp_pop_current_task_from_thread(master_th);
2501 
2502 #if KMP_AFFINITY_SUPPORTED
2503  // Restore master thread's partition.
2504  master_th->th.th_first_place = team->t.t_first_place;
2505  master_th->th.th_last_place = team->t.t_last_place;
2506 #endif // KMP_AFFINITY_SUPPORTED
2507  master_th->th.th_def_allocator = team->t.t_def_allocator;
2508 
2509  updateHWFPControl(team);
2510 
2511  if (root->r.r_active != master_active)
2512  root->r.r_active = master_active;
2513 
2514  __kmp_free_team(root, team USE_NESTED_HOT_ARG(
2515  master_th)); // this will free worker threads
2516 
2517  /* this race was fun to find. make sure the following is in the critical
2518  region otherwise assertions may fail occasionally since the old team may be
2519  reallocated and the hierarchy appears inconsistent. it is actually safe to
2520  run and won't cause any bugs, but will cause those assertion failures. it's
2521  only one deref&assign so might as well put this in the critical region */
2522  master_th->th.th_team = parent_team;
2523  master_th->th.th_team_nproc = parent_team->t.t_nproc;
2524  master_th->th.th_team_master = parent_team->t.t_threads[0];
2525  master_th->th.th_team_serialized = parent_team->t.t_serialized;
2526 
2527  /* restore serialized team, if need be */
2528  if (parent_team->t.t_serialized &&
2529  parent_team != master_th->th.th_serial_team &&
2530  parent_team != root->r.r_root_team) {
2531  __kmp_free_team(root,
2532  master_th->th.th_serial_team USE_NESTED_HOT_ARG(NULL));
2533  master_th->th.th_serial_team = parent_team;
2534  }
2535 
2536  if (__kmp_tasking_mode != tskm_immediate_exec) {
2537  if (master_th->th.th_task_state_top >
2538  0) { // Restore task state from memo stack
2539  KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack);
2540  // Remember master's state if we re-use this nested hot team
2541  master_th->th.th_task_state_memo_stack[master_th->th.th_task_state_top] =
2542  master_th->th.th_task_state;
2543  --master_th->th.th_task_state_top; // pop
2544  // Now restore state at this level
2545  master_th->th.th_task_state =
2546  master_th->th
2547  .th_task_state_memo_stack[master_th->th.th_task_state_top];
2548  }
2549  // Copy the task team from the parent team to the master thread
2550  master_th->th.th_task_team =
2551  parent_team->t.t_task_team[master_th->th.th_task_state];
2552  KA_TRACE(20,
2553  ("__kmp_join_call: Master T#%d restoring task_team %p / team %p\n",
2554  __kmp_gtid_from_thread(master_th), master_th->th.th_task_team,
2555  parent_team));
2556  }
2557 
2558  // TODO: GEH - cannot do this assertion because root thread not set up as
2559  // executing
2560  // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 0 );
2561  master_th->th.th_current_task->td_flags.executing = 1;
2562 
2563  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2564 
2565 #if OMPT_SUPPORT
2566  int flags =
2567  OMPT_INVOKER(fork_context) |
2568  ((team_microtask == (void *)__kmp_teams_master) ? ompt_parallel_league
2569  : ompt_parallel_team);
2570  if (ompt_enabled.enabled) {
2571  __kmp_join_ompt(gtid, master_th, parent_team, parallel_data, flags,
2572  codeptr);
2573  }
2574 #endif
2575 
2576  KMP_MB();
2577  KA_TRACE(20, ("__kmp_join_call: exit T#%d\n", gtid));
2578 }
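/* Pairing sketch (illustrative): the entry point that forked the region (e.g.
   __kmpc_fork_call() for fork_context_intel) issues this join after the master
   returns from the microtask.  If the region ended up serialized, the
   team->t.t_serialized branch above delegates the cleanup to
   __kmpc_end_serialized_parallel() instead of performing a join barrier. */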
2579 
2580 /* Check whether we should push an internal control record onto the
2581  serial team stack. If so, do it. */
2582 void __kmp_save_internal_controls(kmp_info_t *thread) {
2583 
2584  if (thread->th.th_team != thread->th.th_serial_team) {
2585  return;
2586  }
2587  if (thread->th.th_team->t.t_serialized > 1) {
2588  int push = 0;
2589 
2590  if (thread->th.th_team->t.t_control_stack_top == NULL) {
2591  push = 1;
2592  } else {
2593  if (thread->th.th_team->t.t_control_stack_top->serial_nesting_level !=
2594  thread->th.th_team->t.t_serialized) {
2595  push = 1;
2596  }
2597  }
2598  if (push) { /* push a record on the serial team's stack */
2599  kmp_internal_control_t *control =
2600  (kmp_internal_control_t *)__kmp_allocate(
2601  sizeof(kmp_internal_control_t));
2602 
2603  copy_icvs(control, &thread->th.th_current_task->td_icvs);
2604 
2605  control->serial_nesting_level = thread->th.th_team->t.t_serialized;
2606 
2607  control->next = thread->th.th_team->t.t_control_stack_top;
2608  thread->th.th_team->t.t_control_stack_top = control;
2609  }
2610  }
2611 }
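/* Illustrative sketch of when a control record gets pushed: an ICV change made
   inside a doubly serialized region, so the previous values can be restored
   when that serialization level ends.

     #pragma omp parallel num_threads(1)   // first serialized level
     #pragma omp parallel num_threads(1)   // second level: t_serialized > 1
     {
       omp_set_num_threads(3);             // __kmp_set_num_threads() calls
     }                                     //   __kmp_save_internal_controls()
*/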
2612 
2613 /* Changes set_nproc */
2614 void __kmp_set_num_threads(int new_nth, int gtid) {
2615  kmp_info_t *thread;
2616  kmp_root_t *root;
2617 
2618  KF_TRACE(10, ("__kmp_set_num_threads: new __kmp_nth = %d\n", new_nth));
2619  KMP_DEBUG_ASSERT(__kmp_init_serial);
2620 
2621  if (new_nth < 1)
2622  new_nth = 1;
2623  else if (new_nth > __kmp_max_nth)
2624  new_nth = __kmp_max_nth;
2625 
2626  KMP_COUNT_VALUE(OMP_set_numthreads, new_nth);
2627  thread = __kmp_threads[gtid];
2628  if (thread->th.th_current_task->td_icvs.nproc == new_nth)
2629  return; // nothing to do
2630 
2631  __kmp_save_internal_controls(thread);
2632 
2633  set__nproc(thread, new_nth);
2634 
2635  // If this omp_set_num_threads() call will cause the hot team size to be
2636  // reduced (in the absence of a num_threads clause), then reduce it now,
2637  // rather than waiting for the next parallel region.
2638  root = thread->th.th_root;
2639  if (__kmp_init_parallel && (!root->r.r_active) &&
2640  (root->r.r_hot_team->t.t_nproc > new_nth)
2641 #if KMP_NESTED_HOT_TEAMS
2642  && __kmp_hot_teams_max_level && !__kmp_hot_teams_mode
2643 #endif
2644  ) {
2645  kmp_team_t *hot_team = root->r.r_hot_team;
2646  int f;
2647 
2648  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
2649 
2650  // Release the extra threads we don't need any more.
2651  for (f = new_nth; f < hot_team->t.t_nproc; f++) {
2652  KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL);
2653  if (__kmp_tasking_mode != tskm_immediate_exec) {
2654  // When decreasing team size, threads no longer in the team should unref
2655  // task team.
2656  hot_team->t.t_threads[f]->th.th_task_team = NULL;
2657  }
2658  __kmp_free_thread(hot_team->t.t_threads[f]);
2659  hot_team->t.t_threads[f] = NULL;
2660  }
2661  hot_team->t.t_nproc = new_nth;
2662 #if KMP_NESTED_HOT_TEAMS
2663  if (thread->th.th_hot_teams) {
2664  KMP_DEBUG_ASSERT(hot_team == thread->th.th_hot_teams[0].hot_team);
2665  thread->th.th_hot_teams[0].hot_team_nth = new_nth;
2666  }
2667 #endif
2668 
2669  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2670 
2671  // Update the t_nproc field in the threads that are still active.
2672  for (f = 0; f < new_nth; f++) {
2673  KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL);
2674  hot_team->t.t_threads[f]->th.th_team_nproc = new_nth;
2675  }
2676  // Special flag to indicate an omp_set_num_threads() call
2677  hot_team->t.t_size_changed = -1;
2678  }
2679 }
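/* Usage sketch (illustrative): omp_set_num_threads() lands here.  When the new
   value is below the current hot-team size (and hot teams are not being kept
   at full size), the extra workers are released immediately rather than at the
   next fork.  work() is a placeholder.

     #pragma omp parallel num_threads(8)
     { work(); }                  // hot team now holds 8 threads
     omp_set_num_threads(2);      // hot team trimmed to 2 right here
     #pragma omp parallel
     { work(); }                  // forks the already-trimmed team
*/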
2680 
2681 /* Changes max_active_levels */
2682 void __kmp_set_max_active_levels(int gtid, int max_active_levels) {
2683  kmp_info_t *thread;
2684 
2685  KF_TRACE(10, ("__kmp_set_max_active_levels: new max_active_levels for thread "
2686  "%d = (%d)\n",
2687  gtid, max_active_levels));
2688  KMP_DEBUG_ASSERT(__kmp_init_serial);
2689 
2690  // validate max_active_levels
2691  if (max_active_levels < 0) {
2692  KMP_WARNING(ActiveLevelsNegative, max_active_levels);
2693  // We ignore this call if the user has specified a negative value.
2694  // The current setting won't be changed. The last valid setting will be
2695  // used. A warning will be issued (if warnings are allowed as controlled by
2696  // the KMP_WARNINGS env var).
2697  KF_TRACE(10, ("__kmp_set_max_active_levels: the call is ignored: new "
2698  "max_active_levels for thread %d = (%d)\n",
2699  gtid, max_active_levels));
2700  return;
2701  }
2702  if (max_active_levels <= KMP_MAX_ACTIVE_LEVELS_LIMIT) {
2703  // it's OK, the max_active_levels is within the valid range: [ 0;
2704  // KMP_MAX_ACTIVE_LEVELS_LIMIT ]
2705  // We allow a zero value. (implementation defined behavior)
2706  } else {
2707  KMP_WARNING(ActiveLevelsExceedLimit, max_active_levels,
2708  KMP_MAX_ACTIVE_LEVELS_LIMIT);
2709  max_active_levels = KMP_MAX_ACTIVE_LEVELS_LIMIT;
2710  // Current upper limit is MAX_INT. (implementation defined behavior)
2711  // If the input exceeds the upper limit, we correct the input to be the
2712  // upper limit. (implementation defined behavior)
2713  // In practice, control should never reach here while the limit is INT_MAX.
2714  }
2715  KF_TRACE(10, ("__kmp_set_max_active_levels: after validation: new "
2716  "max_active_levels for thread %d = (%d)\n",
2717  gtid, max_active_levels));
2718 
2719  thread = __kmp_threads[gtid];
2720 
2721  __kmp_save_internal_controls(thread);
2722 
2723  set__max_active_levels(thread, max_active_levels);
2724 }
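/* Usage sketch (illustrative): omp_set_max_active_levels() lands here.
   work() is a placeholder.

     omp_set_max_active_levels(2);   // allow two nested active levels
     #pragma omp parallel            // level 1: active
     #pragma omp parallel            // level 2: active
     #pragma omp parallel            // level 3: serialized by the
     { work(); }                     //   max_active_levels check in fork
*/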
2725 
2726 /* Gets max_active_levels */
2727 int __kmp_get_max_active_levels(int gtid) {
2728  kmp_info_t *thread;
2729 
2730  KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d\n", gtid));
2731  KMP_DEBUG_ASSERT(__kmp_init_serial);
2732 
2733  thread = __kmp_threads[gtid];
2734  KMP_DEBUG_ASSERT(thread->th.th_current_task);
2735  KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d, curtask=%p, "
2736  "curtask_maxaclevel=%d\n",
2737  gtid, thread->th.th_current_task,
2738  thread->th.th_current_task->td_icvs.max_active_levels));
2739  return thread->th.th_current_task->td_icvs.max_active_levels;
2740 }
2741 
2742 KMP_BUILD_ASSERT(sizeof(kmp_sched_t) == sizeof(int));
2743 KMP_BUILD_ASSERT(sizeof(enum sched_type) == sizeof(int));
2744 
2745 /* Changes def_sched_var ICV values (run-time schedule kind and chunk) */
2746 void __kmp_set_schedule(int gtid, kmp_sched_t kind, int chunk) {
2747  kmp_info_t *thread;
2748  kmp_sched_t orig_kind;
2749  // kmp_team_t *team;
2750 
2751  KF_TRACE(10, ("__kmp_set_schedule: new schedule for thread %d = (%d, %d)\n",
2752  gtid, (int)kind, chunk));
2753  KMP_DEBUG_ASSERT(__kmp_init_serial);
2754 
2755  // Check if the kind parameter is valid, correct if needed.
2756  // Valid parameters should fit in one of two intervals - standard or extended:
2757  // <lower>, <valid>, <upper_std>, <lower_ext>, <valid>, <upper>
2758  // 2008-01-25: 0, 1 - 4, 5, 100, 101 - 102, 103
2759  orig_kind = kind;
2760  kind = __kmp_sched_without_mods(kind);
2761 
2762  if (kind <= kmp_sched_lower || kind >= kmp_sched_upper ||
2763  (kind <= kmp_sched_lower_ext && kind >= kmp_sched_upper_std)) {
2764  // TODO: Hint needs attention in case we change the default schedule.
2765  __kmp_msg(kmp_ms_warning, KMP_MSG(ScheduleKindOutOfRange, kind),
2766  KMP_HNT(DefaultScheduleKindUsed, "static, no chunk"),
2767  __kmp_msg_null);
2768  kind = kmp_sched_default;
2769  chunk = 0; // ignore chunk value in case of bad kind
2770  }
2771 
2772  thread = __kmp_threads[gtid];
2773 
2774  __kmp_save_internal_controls(thread);
2775 
2776  if (kind < kmp_sched_upper_std) {
2777  if (kind == kmp_sched_static && chunk < KMP_DEFAULT_CHUNK) {
2778  // differentiate static chunked vs. unchunked: chunk should be invalid to
2779  // indicate an unchunked schedule (which is the default)
2780  thread->th.th_current_task->td_icvs.sched.r_sched_type = kmp_sch_static;
2781  } else {
2782  thread->th.th_current_task->td_icvs.sched.r_sched_type =
2783  __kmp_sch_map[kind - kmp_sched_lower - 1];
2784  }
2785  } else {
2786  // __kmp_sch_map[ kind - kmp_sched_lower_ext + kmp_sched_upper_std -
2787  // kmp_sched_lower - 2 ];
2788  thread->th.th_current_task->td_icvs.sched.r_sched_type =
2789  __kmp_sch_map[kind - kmp_sched_lower_ext + kmp_sched_upper_std -
2790  kmp_sched_lower - 2];
2791  }
2792  __kmp_sched_apply_mods_intkind(
2793  orig_kind, &(thread->th.th_current_task->td_icvs.sched.r_sched_type));
2794  if (kind == kmp_sched_auto || chunk < 1) {
2795  // ignore parameter chunk for schedule auto
2796  thread->th.th_current_task->td_icvs.sched.chunk = KMP_DEFAULT_CHUNK;
2797  } else {
2798  thread->th.th_current_task->td_icvs.sched.chunk = chunk;
2799  }
2800 }
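/* Usage sketch (illustrative): omp_set_schedule() maps its omp_sched_t kind to
   kmp_sched_t and arrives here; internally the request below is stored as
   kmp_sch_dynamic_chunked with chunk == 4.  n and work() are placeholders.

     omp_set_schedule(omp_sched_dynamic, 4);
     #pragma omp parallel for schedule(runtime)
     for (int i = 0; i < n; ++i)
       work(i);                      // iterations handed out in chunks of 4
*/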
2801 
2802 /* Gets def_sched_var ICV values */
2803 void __kmp_get_schedule(int gtid, kmp_sched_t *kind, int *chunk) {
2804  kmp_info_t *thread;
2805  enum sched_type th_type;
2806 
2807  KF_TRACE(10, ("__kmp_get_schedule: thread %d\n", gtid));
2808  KMP_DEBUG_ASSERT(__kmp_init_serial);
2809 
2810  thread = __kmp_threads[gtid];
2811 
2812  th_type = thread->th.th_current_task->td_icvs.sched.r_sched_type;
2813  switch (SCHEDULE_WITHOUT_MODIFIERS(th_type)) {
2814  case kmp_sch_static:
2815  case kmp_sch_static_greedy:
2816  case kmp_sch_static_balanced:
2817  *kind = kmp_sched_static;
2818  __kmp_sched_apply_mods_stdkind(kind, th_type);
2819  *chunk = 0; // chunk was not set, try to show this fact via zero value
2820  return;
2821  case kmp_sch_static_chunked:
2822  *kind = kmp_sched_static;
2823  break;
2824  case kmp_sch_dynamic_chunked:
2825  *kind = kmp_sched_dynamic;
2826  break;
2827  case kmp_sch_guided_chunked:
2828  case kmp_sch_guided_iterative_chunked:
2829  case kmp_sch_guided_analytical_chunked:
2830  *kind = kmp_sched_guided;
2831  break;
2832  case kmp_sch_auto:
2833  *kind = kmp_sched_auto;
2834  break;
2835  case kmp_sch_trapezoidal:
2836  *kind = kmp_sched_trapezoidal;
2837  break;
2838 #if KMP_STATIC_STEAL_ENABLED
2839  case kmp_sch_static_steal:
2840  *kind = kmp_sched_static_steal;
2841  break;
2842 #endif
2843  default:
2844  KMP_FATAL(UnknownSchedulingType, th_type);
2845  }
2846 
2847  __kmp_sched_apply_mods_stdkind(kind, th_type);
2848  *chunk = thread->th.th_current_task->td_icvs.sched.chunk;
2849 }
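/* Usage sketch (illustrative): omp_get_schedule() ends up here; note the zero
   chunk reported above for unchunked static schedules.

     omp_sched_t kind;
     int chunk;
     omp_get_schedule(&kind, &chunk);  // e.g. kind == omp_sched_static,
                                       //      chunk == 0 for plain static
*/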
2850 
2851 int __kmp_get_ancestor_thread_num(int gtid, int level) {
2852 
2853  int ii, dd;
2854  kmp_team_t *team;
2855  kmp_info_t *thr;
2856 
2857  KF_TRACE(10, ("__kmp_get_ancestor_thread_num: thread %d %d\n", gtid, level));
2858  KMP_DEBUG_ASSERT(__kmp_init_serial);
2859 
2860  // validate level
2861  if (level == 0)
2862  return 0;
2863  if (level < 0)
2864  return -1;
2865  thr = __kmp_threads[gtid];
2866  team = thr->th.th_team;
2867  ii = team->t.t_level;
2868  if (level > ii)
2869  return -1;
2870 
2871  if (thr->th.th_teams_microtask) {
2872  // AC: we are in teams region where multiple nested teams have same level
2873  int tlevel = thr->th.th_teams_level; // the level of the teams construct
2874  if (level <=
2875  tlevel) { // otherwise usual algorithm works (will not touch the teams)
2876  KMP_DEBUG_ASSERT(ii >= tlevel);
2877  // AC: Since we need to pass through the teams league, we artificially
2878  // increase ii
2879  if (ii == tlevel) {
2880  ii += 2; // three teams have same level
2881  } else {
2882  ii++; // two teams have same level
2883  }
2884  }
2885  }
2886 
2887  if (ii == level)
2888  return __kmp_tid_from_gtid(gtid);
2889 
2890  dd = team->t.t_serialized;
2891  level++;
2892  while (ii > level) {
2893  for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) {
2894  }
2895  if ((team->t.t_serialized) && (!dd)) {
2896  team = team->t.t_parent;
2897  continue;
2898  }
2899  if (ii > level) {
2900  team = team->t.t_parent;
2901  dd = team->t.t_serialized;
2902  ii--;
2903  }
2904  }
2905 
2906  return (dd > 1) ? (0) : (team->t.t_master_tid);
2907 }
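/* Usage sketch (illustrative): this routine, together with
   __kmp_get_team_size() below, backs omp_get_ancestor_thread_num() and
   omp_get_team_size().

     #pragma omp parallel num_threads(4)
     #pragma omp parallel num_threads(2)
     {
       int outer_tid  = omp_get_ancestor_thread_num(1);  // 0..3
       int outer_size = omp_get_team_size(1);            // 4, if not reduced
     }
*/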
2908 
2909 int __kmp_get_team_size(int gtid, int level) {
2910 
2911  int ii, dd;
2912  kmp_team_t *team;
2913  kmp_info_t *thr;
2914 
2915  KF_TRACE(10, ("__kmp_get_team_size: thread %d %d\n", gtid, level));
2916  KMP_DEBUG_ASSERT(__kmp_init_serial);
2917 
2918  // validate level
2919  if (level == 0)
2920  return 1;
2921  if (level < 0)
2922  return -1;
2923  thr = __kmp_threads[gtid];
2924  team = thr->th.th_team;
2925  ii = team->t.t_level;
2926  if (level > ii)
2927  return -1;
2928 
2929  if (thr->th.th_teams_microtask) {
2930  // AC: we are in teams region where multiple nested teams have same level
2931  int tlevel = thr->th.th_teams_level; // the level of the teams construct
2932  if (level <=
2933  tlevel) { // otherwise usual algorithm works (will not touch the teams)
2934  KMP_DEBUG_ASSERT(ii >= tlevel);
2935  // AC: Since we need to pass through the teams league, we artificially
2936  // increase ii
2937  if (ii == tlevel) {
2938  ii += 2; // three teams have same level
2939  } else {
2940  ii++; // two teams have same level
2941  }
2942  }
2943  }
2944 
2945  while (ii > level) {
2946  for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) {
2947  }
2948  if (team->t.t_serialized && (!dd)) {
2949  team = team->t.t_parent;
2950  continue;
2951  }
2952  if (ii > level) {
2953  team = team->t.t_parent;
2954  ii--;
2955  }
2956  }
2957 
2958  return team->t.t_nproc;
2959 }
2960 
2961 kmp_r_sched_t __kmp_get_schedule_global() {
2962  // This routine was created because the pairs (__kmp_sched, __kmp_chunk) and
2963  // (__kmp_static, __kmp_guided) may be changed by kmp_set_defaults
2964  // independently, so the updated schedule can be obtained here.
2965 
2966  kmp_r_sched_t r_sched;
2967 
2968  // create schedule from 4 globals: __kmp_sched, __kmp_chunk, __kmp_static,
2969  // __kmp_guided. __kmp_sched should keep original value, so that user can set
2970  // KMP_SCHEDULE multiple times, and thus have different run-time schedules in
2971  // different roots (even in OMP 2.5)
2972  enum sched_type s = SCHEDULE_WITHOUT_MODIFIERS(__kmp_sched);
2973  enum sched_type sched_modifiers = SCHEDULE_GET_MODIFIERS(__kmp_sched);
2974  if (s == kmp_sch_static) {
2975  // replace STATIC with more detailed schedule (balanced or greedy)
2976  r_sched.r_sched_type = __kmp_static;
2977  } else if (s == kmp_sch_guided_chunked) {
2978  // replace GUIDED with more detailed schedule (iterative or analytical)
2979  r_sched.r_sched_type = __kmp_guided;
2980  } else { // (STATIC_CHUNKED), or (DYNAMIC_CHUNKED), or other
2981  r_sched.r_sched_type = __kmp_sched;
2982  }
2983  SCHEDULE_SET_MODIFIERS(r_sched.r_sched_type, sched_modifiers);
2984 
2985  if (__kmp_chunk < KMP_DEFAULT_CHUNK) {
2986  // __kmp_chunk may be wrong here (if it was not ever set)
2987  r_sched.chunk = KMP_DEFAULT_CHUNK;
2988  } else {
2989  r_sched.chunk = __kmp_chunk;
2990  }
2991 
2992  return r_sched;
2993 }
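/* Illustrative note: the globals combined here are typically set from the
   environment, e.g. OMP_SCHEDULE="guided,8" yields __kmp_sched ==
   kmp_sch_guided_chunked and __kmp_chunk == 8; this routine then substitutes
   the concrete guided variant selected in __kmp_guided. */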
2994 
2995 /* Allocate (realloc == FALSE) or reallocate (realloc == TRUE)
2996  at least argc number of *t_argv entries for the requested team. */
2997 static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team, int realloc) {
2998 
2999  KMP_DEBUG_ASSERT(team);
3000  if (!realloc || argc > team->t.t_max_argc) {
3001 
3002  KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: needed entries=%d, "
3003  "current entries=%d\n",
3004  team->t.t_id, argc, (realloc) ? team->t.t_max_argc : 0));
3005  /* if previously allocated heap space for args, free them */
3006  if (realloc && team->t.t_argv != &team->t.t_inline_argv[0])
3007  __kmp_free((void *)team->t.t_argv);
3008 
3009  if (argc <= KMP_INLINE_ARGV_ENTRIES) {
3010  /* use unused space in the cache line for arguments */
3011  team->t.t_max_argc = KMP_INLINE_ARGV_ENTRIES;
3012  KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: inline allocate %d "
3013  "argv entries\n",
3014  team->t.t_id, team->t.t_max_argc));
3015  team->t.t_argv = &team->t.t_inline_argv[0];
3016  if (__kmp_storage_map) {
3017  __kmp_print_storage_map_gtid(
3018  -1, &team->t.t_inline_argv[0],
3019  &team->t.t_inline_argv[KMP_INLINE_ARGV_ENTRIES],
3020  (sizeof(void *) * KMP_INLINE_ARGV_ENTRIES), "team_%d.t_inline_argv",
3021  team->t.t_id);
3022  }
3023  } else {
3024  /* allocate space for arguments in the heap */
3025  team->t.t_max_argc = (argc <= (KMP_MIN_MALLOC_ARGV_ENTRIES >> 1))
3026  ? KMP_MIN_MALLOC_ARGV_ENTRIES
3027  : 2 * argc;
3028  KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: dynamic allocate %d "
3029  "argv entries\n",
3030  team->t.t_id, team->t.t_max_argc));
3031  team->t.t_argv =
3032  (void **)__kmp_page_allocate(sizeof(void *) * team->t.t_max_argc);
3033  if (__kmp_storage_map) {
3034  __kmp_print_storage_map_gtid(-1, &team->t.t_argv[0],
3035  &team->t.t_argv[team->t.t_max_argc],
3036  sizeof(void *) * team->t.t_max_argc,
3037  "team_%d.t_argv", team->t.t_id);
3038  }
3039  }
3040  }
3041 }
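/* Illustrative note: argc here is the number of argument pointers the compiler
   passed to the fork entry point (e.g. __kmpc_fork_call()).  With a typical
   lowering, a region capturing two variables

     #pragma omp parallel shared(a, b)   // roughly argc == 2
     { ... }

   fits in the inline t_inline_argv storage; a heap allocation is only made
   once argc exceeds KMP_INLINE_ARGV_ENTRIES. */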
3042 
3043 static void __kmp_allocate_team_arrays(kmp_team_t *team, int max_nth) {
3044  int i;
3045  int num_disp_buff = max_nth > 1 ? __kmp_dispatch_num_buffers : 2;
3046  team->t.t_threads =
3047  (kmp_info_t **)__kmp_allocate(sizeof(kmp_info_t *) * max_nth);
3048  team->t.t_disp_buffer = (dispatch_shared_info_t *)__kmp_allocate(
3049  sizeof(dispatch_shared_info_t) * num_disp_buff);
3050  team->t.t_dispatch =
3051  (kmp_disp_t *)__kmp_allocate(sizeof(kmp_disp_t) * max_nth);
3052  team->t.t_implicit_task_taskdata =
3053  (kmp_taskdata_t *)__kmp_allocate(sizeof(kmp_taskdata_t) * max_nth);
3054  team->t.t_max_nproc = max_nth;
3055 
3056  /* setup dispatch buffers */
3057  for (i = 0; i < num_disp_buff; ++i) {
3058  team->t.t_disp_buffer[i].buffer_index = i;
3059  team->t.t_disp_buffer[i].doacross_buf_idx = i;
3060  }
3061 }
3062 
3063 static void __kmp_free_team_arrays(kmp_team_t *team) {
3064  /* Note: this does not free the threads in t_threads (__kmp_free_threads) */
3065  int i;
3066  for (i = 0; i < team->t.t_max_nproc; ++i) {
3067  if (team->t.t_dispatch[i].th_disp_buffer != NULL) {
3068  __kmp_free(team->t.t_dispatch[i].th_disp_buffer);
3069  team->t.t_dispatch[i].th_disp_buffer = NULL;
3070  }
3071  }
3072 #if KMP_USE_HIER_SCHED
3073  __kmp_dispatch_free_hierarchies(team);
3074 #endif
3075  __kmp_free(team->t.t_threads);
3076  __kmp_free(team->t.t_disp_buffer);
3077  __kmp_free(team->t.t_dispatch);
3078  __kmp_free(team->t.t_implicit_task_taskdata);
3079  team->t.t_threads = NULL;
3080  team->t.t_disp_buffer = NULL;
3081  team->t.t_dispatch = NULL;
3082  team->t.t_implicit_task_taskdata = 0;
3083 }
3084 
3085 static void __kmp_reallocate_team_arrays(kmp_team_t *team, int max_nth) {
3086  kmp_info_t **oldThreads = team->t.t_threads;
3087 
3088  __kmp_free(team->t.t_disp_buffer);
3089  __kmp_free(team->t.t_dispatch);
3090  __kmp_free(team->t.t_implicit_task_taskdata);
3091  __kmp_allocate_team_arrays(team, max_nth);
3092 
3093  KMP_MEMCPY(team->t.t_threads, oldThreads,
3094  team->t.t_nproc * sizeof(kmp_info_t *));
3095 
3096  __kmp_free(oldThreads);
3097 }
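// Note: only the first t_nproc entries of t_threads survive the reallocation
// above; t_disp_buffer, t_dispatch and t_implicit_task_taskdata are freed and
// re-created empty by __kmp_allocate_team_arrays, so callers are expected to
// reinitialize per-thread dispatch/task data afterwards.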
3098 
3099 static kmp_internal_control_t __kmp_get_global_icvs(void) {
3100 
3101  kmp_r_sched_t r_sched =
3102  __kmp_get_schedule_global(); // get current state of scheduling globals
3103 
3104  KMP_DEBUG_ASSERT(__kmp_nested_proc_bind.used > 0);
3105 
3106  kmp_internal_control_t g_icvs = {
3107  0, // int serial_nesting_level; //corresponds to value of th_team_serialized
3108  (kmp_int8)__kmp_global.g.g_dynamic, // internal control for dynamic
3109  // adjustment of threads (per thread)
3110  (kmp_int8)__kmp_env_blocktime, // int bt_set; //internal control for
3111  // whether blocktime is explicitly set
3112  __kmp_dflt_blocktime, // int blocktime; //internal control for blocktime
3113 #if KMP_USE_MONITOR
3114  __kmp_bt_intervals, // int bt_intervals; //internal control for blocktime
3115 // intervals
3116 #endif
3117  __kmp_dflt_team_nth, // int nproc; //internal control for # of threads for
3118  // next parallel region (per thread)
3119  // (use a max upper bound on the value if __kmp_parallel_initialize was not called yet)
3120  __kmp_cg_max_nth, // int thread_limit;
3121  __kmp_dflt_max_active_levels, // int max_active_levels; //internal control
3122  // for max_active_levels
3123  r_sched, // kmp_r_sched_t sched; //internal control for runtime schedule
3124  // {sched,chunk} pair
3125  __kmp_nested_proc_bind.bind_types[0],
3126  __kmp_default_device,
3127  NULL // struct kmp_internal_control *next;
3128  };
3129 
3130  return g_icvs;
3131 }
3132 
3133 static kmp_internal_control_t __kmp_get_x_global_icvs(const kmp_team_t *team) {
3134 
3135  kmp_internal_control_t gx_icvs;
3136  gx_icvs.serial_nesting_level =
3137  0; // probably =team->t.t_serial like in save_inter_controls
3138  copy_icvs(&gx_icvs, &team->t.t_threads[0]->th.th_current_task->td_icvs);
3139  gx_icvs.next = NULL;
3140 
3141  return gx_icvs;
3142 }
3143 
3144 static void __kmp_initialize_root(kmp_root_t *root) {
3145  int f;
3146  kmp_team_t *root_team;
3147  kmp_team_t *hot_team;
3148  int hot_team_max_nth;
3149  kmp_r_sched_t r_sched =
3150  __kmp_get_schedule_global(); // get current state of scheduling globals
3151  kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
3152  KMP_DEBUG_ASSERT(root);
3153  KMP_ASSERT(!root->r.r_begin);
3154 
3155  /* setup the root state structure */
3156  __kmp_init_lock(&root->r.r_begin_lock);
3157  root->r.r_begin = FALSE;
3158  root->r.r_active = FALSE;
3159  root->r.r_in_parallel = 0;
3160  root->r.r_blocktime = __kmp_dflt_blocktime;
3161 
3162  /* setup the root team for this task */
3163  /* allocate the root team structure */
3164  KF_TRACE(10, ("__kmp_initialize_root: before root_team\n"));
3165 
3166  root_team =
3167  __kmp_allocate_team(root,
3168  1, // new_nproc
3169  1, // max_nproc
3170 #if OMPT_SUPPORT
3171  ompt_data_none, // root parallel id
3172 #endif
3173  __kmp_nested_proc_bind.bind_types[0], &r_icvs,
3174  0 // argc
3175  USE_NESTED_HOT_ARG(NULL) // master thread is unknown
3176  );
3177 #if USE_DEBUGGER
3178  // Non-NULL value should be assigned to make the debugger display the root
3179  // team.
3180  TCW_SYNC_PTR(root_team->t.t_pkfn, (microtask_t)(~0));
3181 #endif
3182 
3183  KF_TRACE(10, ("__kmp_initialize_root: after root_team = %p\n", root_team));
3184 
3185  root->r.r_root_team = root_team;
3186  root_team->t.t_control_stack_top = NULL;
3187 
3188  /* initialize root team */
3189  root_team->t.t_threads[0] = NULL;
3190  root_team->t.t_nproc = 1;
3191  root_team->t.t_serialized = 1;
3192  // TODO???: root_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
3193  root_team->t.t_sched.sched = r_sched.sched;
3194  KA_TRACE(
3195  20,
3196  ("__kmp_initialize_root: init root team %d arrived: join=%u, plain=%u\n",
3197  root_team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
3198 
3199  /* setup the hot team for this task */
3200  /* allocate the hot team structure */
3201  KF_TRACE(10, ("__kmp_initialize_root: before hot_team\n"));
3202 
3203  hot_team =
3204  __kmp_allocate_team(root,
3205  1, // new_nproc
3206  __kmp_dflt_team_nth_ub * 2, // max_nproc
3207 #if OMPT_SUPPORT
3208  ompt_data_none, // root parallel id
3209 #endif
3210  __kmp_nested_proc_bind.bind_types[0], &r_icvs,
3211  0 // argc
3212  USE_NESTED_HOT_ARG(NULL) // master thread is unknown
3213  );
3214  KF_TRACE(10, ("__kmp_initialize_root: after hot_team = %p\n", hot_team));
3215 
3216  root->r.r_hot_team = hot_team;
3217  root_team->t.t_control_stack_top = NULL;
3218 
3219  /* first-time initialization */
3220  hot_team->t.t_parent = root_team;
3221 
3222  /* initialize hot team */
3223  hot_team_max_nth = hot_team->t.t_max_nproc;
3224  for (f = 0; f < hot_team_max_nth; ++f) {
3225  hot_team->t.t_threads[f] = NULL;
3226  }
3227  hot_team->t.t_nproc = 1;
3228  // TODO???: hot_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
3229  hot_team->t.t_sched.sched = r_sched.sched;
3230  hot_team->t.t_size_changed = 0;
3231 }
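// Root setup recap (descriptive only): each root gets two teams -- the root
// team, a serialized team of size 1 that represents the initial/serial state,
// and the hot team, which is kept around and reused for the outermost parallel
// region (hence its larger max_nproc of __kmp_dflt_team_nth_ub * 2 above).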
3232 
3233 #ifdef KMP_DEBUG
3234 
3235 typedef struct kmp_team_list_item {
3236  kmp_team_p const *entry;
3237  struct kmp_team_list_item *next;
3238 } kmp_team_list_item_t;
3239 typedef kmp_team_list_item_t *kmp_team_list_t;
3240 
3241 static void __kmp_print_structure_team_accum( // Add team to list of teams.
3242  kmp_team_list_t list, // List of teams.
3243  kmp_team_p const *team // Team to add.
3244  ) {
3245 
3246  // List must terminate with item where both entry and next are NULL.
3247  // Team is added to the list only once.
3248  // List is sorted in ascending order by team id.
3249  // Team id is *not* a key.
3250 
3251  kmp_team_list_t l;
3252 
3253  KMP_DEBUG_ASSERT(list != NULL);
3254  if (team == NULL) {
3255  return;
3256  }
3257 
3258  __kmp_print_structure_team_accum(list, team->t.t_parent);
3259  __kmp_print_structure_team_accum(list, team->t.t_next_pool);
3260 
3261  // Search list for the team.
3262  l = list;
3263  while (l->next != NULL && l->entry != team) {
3264  l = l->next;
3265  }
3266  if (l->next != NULL) {
3267  return; // Team has been added before, exit.
3268  }
3269 
3270  // Team is not found. Search list again for insertion point.
3271  l = list;
3272  while (l->next != NULL && l->entry->t.t_id <= team->t.t_id) {
3273  l = l->next;
3274  }
3275 
3276  // Insert team.
3277  {
3278  kmp_team_list_item_t *item = (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(
3279  sizeof(kmp_team_list_item_t));
3280  *item = *l;
3281  l->entry = team;
3282  l->next = item;
3283  }
3284 }
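// The insertion block above adds the new team *before* node l without keeping
// a previous pointer: the new item receives a copy of l, then l itself is
// overwritten with the team. Illustrative trace: list [5]->[9]->[end], insert
// team 7 with l == [9]: item = copy of [9]; l becomes [7]; l->next = item;
// result [5]->[7]->[9]->[end].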
3285 
3286 static void __kmp_print_structure_team(char const *title,
3287                                        kmp_team_p const *team) {
3289  __kmp_printf("%s", title);
3290  if (team != NULL) {
3291  __kmp_printf("%2x %p\n", team->t.t_id, team);
3292  } else {
3293  __kmp_printf(" - (nil)\n");
3294  }
3295 }
3296 
3297 static void __kmp_print_structure_thread(char const *title,
3298  kmp_info_p const *thread) {
3299  __kmp_printf("%s", title);
3300  if (thread != NULL) {
3301  __kmp_printf("%2d %p\n", thread->th.th_info.ds.ds_gtid, thread);
3302  } else {
3303  __kmp_printf(" - (nil)\n");
3304  }
3305 }
3306 
3307 void __kmp_print_structure(void) {
3308 
3309  kmp_team_list_t list;
3310 
3311  // Initialize list of teams.
3312  list =
3313  (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(sizeof(kmp_team_list_item_t));
3314  list->entry = NULL;
3315  list->next = NULL;
3316 
3317  __kmp_printf("\n------------------------------\nGlobal Thread "
3318  "Table\n------------------------------\n");
3319  {
3320  int gtid;
3321  for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3322  __kmp_printf("%2d", gtid);
3323  if (__kmp_threads != NULL) {
3324  __kmp_printf(" %p", __kmp_threads[gtid]);
3325  }
3326  if (__kmp_root != NULL) {
3327  __kmp_printf(" %p", __kmp_root[gtid]);
3328  }
3329  __kmp_printf("\n");
3330  }
3331  }
3332 
3333  // Print out __kmp_threads array.
3334  __kmp_printf("\n------------------------------\nThreads\n--------------------"
3335  "----------\n");
3336  if (__kmp_threads != NULL) {
3337  int gtid;
3338  for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3339  kmp_info_t const *thread = __kmp_threads[gtid];
3340  if (thread != NULL) {
3341  __kmp_printf("GTID %2d %p:\n", gtid, thread);
3342  __kmp_printf(" Our Root: %p\n", thread->th.th_root);
3343  __kmp_print_structure_team(" Our Team: ", thread->th.th_team);
3344  __kmp_print_structure_team(" Serial Team: ",
3345  thread->th.th_serial_team);
3346  __kmp_printf(" Threads: %2d\n", thread->th.th_team_nproc);
3347  __kmp_print_structure_thread(" Master: ",
3348  thread->th.th_team_master);
3349  __kmp_printf(" Serialized?: %2d\n", thread->th.th_team_serialized);
3350  __kmp_printf(" Set NProc: %2d\n", thread->th.th_set_nproc);
3351  __kmp_printf(" Set Proc Bind: %2d\n", thread->th.th_set_proc_bind);
3352  __kmp_print_structure_thread(" Next in pool: ",
3353  thread->th.th_next_pool);
3354  __kmp_printf("\n");
3355  __kmp_print_structure_team_accum(list, thread->th.th_team);
3356  __kmp_print_structure_team_accum(list, thread->th.th_serial_team);
3357  }
3358  }
3359  } else {
3360  __kmp_printf("Threads array is not allocated.\n");
3361  }
3362 
3363  // Print out __kmp_root array.
3364  __kmp_printf("\n------------------------------\nUbers\n----------------------"
3365  "--------\n");
3366  if (__kmp_root != NULL) {
3367  int gtid;
3368  for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3369  kmp_root_t const *root = __kmp_root[gtid];
3370  if (root != NULL) {
3371  __kmp_printf("GTID %2d %p:\n", gtid, root);
3372  __kmp_print_structure_team(" Root Team: ", root->r.r_root_team);
3373  __kmp_print_structure_team(" Hot Team: ", root->r.r_hot_team);
3374  __kmp_print_structure_thread(" Uber Thread: ",
3375  root->r.r_uber_thread);
3376  __kmp_printf(" Active?: %2d\n", root->r.r_active);
3377  __kmp_printf(" In Parallel: %2d\n",
3378  KMP_ATOMIC_LD_RLX(&root->r.r_in_parallel));
3379  __kmp_printf("\n");
3380  __kmp_print_structure_team_accum(list, root->r.r_root_team);
3381  __kmp_print_structure_team_accum(list, root->r.r_hot_team);
3382  }
3383  }
3384  } else {
3385  __kmp_printf("Ubers array is not allocated.\n");
3386  }
3387 
3388  __kmp_printf("\n------------------------------\nTeams\n----------------------"
3389  "--------\n");
3390  while (list->next != NULL) {
3391  kmp_team_p const *team = list->entry;
3392  int i;
3393  __kmp_printf("Team %2x %p:\n", team->t.t_id, team);
3394  __kmp_print_structure_team(" Parent Team: ", team->t.t_parent);
3395  __kmp_printf(" Master TID: %2d\n", team->t.t_master_tid);
3396  __kmp_printf(" Max threads: %2d\n", team->t.t_max_nproc);
3397  __kmp_printf(" Levels of serial: %2d\n", team->t.t_serialized);
3398  __kmp_printf(" Number threads: %2d\n", team->t.t_nproc);
3399  for (i = 0; i < team->t.t_nproc; ++i) {
3400  __kmp_printf(" Thread %2d: ", i);
3401  __kmp_print_structure_thread("", team->t.t_threads[i]);
3402  }
3403  __kmp_print_structure_team(" Next in pool: ", team->t.t_next_pool);
3404  __kmp_printf("\n");
3405  list = list->next;
3406  }
3407 
3408  // Print out __kmp_thread_pool and __kmp_team_pool.
3409  __kmp_printf("\n------------------------------\nPools\n----------------------"
3410  "--------\n");
3411  __kmp_print_structure_thread("Thread pool: ",
3412  CCAST(kmp_info_t *, __kmp_thread_pool));
3413  __kmp_print_structure_team("Team pool: ",
3414  CCAST(kmp_team_t *, __kmp_team_pool));
3415  __kmp_printf("\n");
3416 
3417  // Free team list.
3418  while (list != NULL) {
3419  kmp_team_list_item_t *item = list;
3420  list = list->next;
3421  KMP_INTERNAL_FREE(item);
3422  }
3423 }
3424 
3425 #endif
3426 
3427 //---------------------------------------------------------------------------
3428 // Stuff for per-thread fast random number generator
3429 // Table of primes
3430 static const unsigned __kmp_primes[] = {
3431  0x9e3779b1, 0xffe6cc59, 0x2109f6dd, 0x43977ab5, 0xba5703f5, 0xb495a877,
3432  0xe1626741, 0x79695e6b, 0xbc98c09f, 0xd5bee2b3, 0x287488f9, 0x3af18231,
3433  0x9677cd4d, 0xbe3a6929, 0xadc6a877, 0xdcf0674b, 0xbe4d6fe9, 0x5f15e201,
3434  0x99afc3fd, 0xf3f16801, 0xe222cfff, 0x24ba5fdb, 0x0620452d, 0x79f149e3,
3435  0xc8b93f49, 0x972702cd, 0xb07dd827, 0x6c97d5ed, 0x085a3d61, 0x46eb5ea7,
3436  0x3d9910ed, 0x2e687b5b, 0x29609227, 0x6eb081f1, 0x0954c4e1, 0x9d114db9,
3437  0x542acfa9, 0xb3e6bd7b, 0x0742d917, 0xe9f3ffa7, 0x54581edb, 0xf2480f45,
3438  0x0bb9288f, 0xef1affc7, 0x85fa0ca7, 0x3ccc14db, 0xe6baf34b, 0x343377f7,
3439  0x5ca19031, 0xe6d9293b, 0xf0a9f391, 0x5d2e980b, 0xfc411073, 0xc3749363,
3440  0xb892d829, 0x3549366b, 0x629750ad, 0xb98294e5, 0x892d9483, 0xc235baf3,
3441  0x3d2402a3, 0x6bdef3c9, 0xbec333cd, 0x40c9520f};
3442 
3443 //---------------------------------------------------------------------------
3444 // __kmp_get_random: Get a random number using a linear congruential method.
3445 unsigned short __kmp_get_random(kmp_info_t *thread) {
3446  unsigned x = thread->th.th_x;
3447  unsigned short r = x >> 16;
3448 
3449  thread->th.th_x = x * thread->th.th_a + 1;
3450 
3451  KA_TRACE(30, ("__kmp_get_random: THREAD: %d, RETURN: %u\n",
3452  thread->th.th_info.ds.ds_tid, r));
3453 
3454  return r;
3455 }
3456 //--------------------------------------------------------
3457 // __kmp_init_random: Initialize a random number generator
3458 void __kmp_init_random(kmp_info_t *thread) {
3459  unsigned seed = thread->th.th_info.ds.ds_tid;
3460 
3461  thread->th.th_a =
3462  __kmp_primes[seed % (sizeof(__kmp_primes) / sizeof(__kmp_primes[0]))];
3463  thread->th.th_x = (seed + 1) * thread->th.th_a + 1;
3464  KA_TRACE(30,
3465  ("__kmp_init_random: THREAD: %u; A: %u\n", seed, thread->th.th_a));
3466 }
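// PRNG sketch (assuming 32-bit unsigned): each thread runs an independent
// linear congruential generator
//   x_{n+1} = a * x_n + 1   (mod 2^32, via unsigned wraparound)
// with the per-thread multiplier a taken from __kmp_primes, indexed by tid.
// __kmp_get_random returns only the high 16 bits because the low-order bits
// of a power-of-two-modulus LCG have much shorter periods.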
3467 
3468 #if KMP_OS_WINDOWS
3469 /* reclaim array entries for root threads that are already dead; returns the
3470  * number reclaimed */
3471 static int __kmp_reclaim_dead_roots(void) {
3472  int i, r = 0;
3473 
3474  for (i = 0; i < __kmp_threads_capacity; ++i) {
3475  if (KMP_UBER_GTID(i) &&
3476  !__kmp_still_running((kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[i])) &&
3477  !__kmp_root[i]
3478  ->r.r_active) { // AC: reclaim only roots died in non-active state
3479  r += __kmp_unregister_root_other_thread(i);
3480  }
3481  }
3482  return r;
3483 }
3484 #endif
3485 
3486 /* This function attempts to create free entries in __kmp_threads and
3487  __kmp_root, and returns the number of free entries generated.
3488 
3489  For Windows* OS static library, the first mechanism used is to reclaim array
3490  entries for root threads that are already dead.
3491 
3492  On all platforms, expansion is attempted on the arrays __kmp_threads and
3493  __kmp_root, with appropriate update to __kmp_threads_capacity. Array
3494  capacity is increased by doubling with clipping to __kmp_tp_capacity, if
3495  threadprivate cache array has been created. Synchronization with
3496  __kmpc_threadprivate_cached is done using __kmp_tp_cached_lock.
3497 
3498  After any dead root reclamation, if the clipping value allows array expansion
3499  to yield a total of at least nNeed free slots, the function performs that
3500  expansion. If not, nothing is done beyond the possible initial root thread
3501  reclamation.
3502 
3503  If any argument is negative, the behavior is undefined. */
3504 static int __kmp_expand_threads(int nNeed) {
3505  int added = 0;
3506  int minimumRequiredCapacity;
3507  int newCapacity;
3508  kmp_info_t **newThreads;
3509  kmp_root_t **newRoot;
3510 
3511 // All calls to __kmp_expand_threads should be under __kmp_forkjoin_lock, so
3512 // resizing __kmp_threads does not need additional protection if foreign
3513 // threads are present
3514 
3515 #if KMP_OS_WINDOWS && !KMP_DYNAMIC_LIB
3516  /* only for Windows static library */
3517  /* reclaim array entries for root threads that are already dead */
3518  added = __kmp_reclaim_dead_roots();
3519 
3520  if (nNeed) {
3521  nNeed -= added;
3522  if (nNeed < 0)
3523  nNeed = 0;
3524  }
3525 #endif
3526  if (nNeed <= 0)
3527  return added;
3528 
3529  // Note that __kmp_threads_capacity is not bounded by __kmp_max_nth. If
3530  // __kmp_max_nth is set to some value less than __kmp_sys_max_nth by the
3531  // user via KMP_DEVICE_THREAD_LIMIT, then __kmp_threads_capacity may become
3532  // > __kmp_max_nth in one of two ways:
3533  //
3534  // 1) The initialization thread (gtid = 0) exits. __kmp_threads[0]
3535  // may not be reused by another thread, so we may need to increase
3536  // __kmp_threads_capacity to __kmp_max_nth + 1.
3537  //
3538  // 2) New foreign root(s) are encountered. We always register new foreign
3539  // roots. This may cause a smaller # of threads to be allocated at
3540  // subsequent parallel regions, but the worker threads hang around (and
3541  // eventually go to sleep) and need slots in the __kmp_threads[] array.
3542  //
3543  // Anyway, that is the reason for moving the check to see if
3544  // __kmp_max_nth was exceeded into __kmp_reserve_threads()
3545  // instead of having it performed here. -BB
3546 
3547  KMP_DEBUG_ASSERT(__kmp_sys_max_nth >= __kmp_threads_capacity);
3548 
3549  /* compute expansion headroom to check if we can expand */
3550  if (__kmp_sys_max_nth - __kmp_threads_capacity < nNeed) {
3551  /* possible expansion too small -- give up */
3552  return added;
3553  }
3554  minimumRequiredCapacity = __kmp_threads_capacity + nNeed;
3555 
3556  newCapacity = __kmp_threads_capacity;
3557  do {
3558  newCapacity = newCapacity <= (__kmp_sys_max_nth >> 1) ? (newCapacity << 1)
3559  : __kmp_sys_max_nth;
3560  } while (newCapacity < minimumRequiredCapacity);
3561  newThreads = (kmp_info_t **)__kmp_allocate(
3562  (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * newCapacity + CACHE_LINE);
3563  newRoot =
3564  (kmp_root_t **)((char *)newThreads + sizeof(kmp_info_t *) * newCapacity);
3565  KMP_MEMCPY(newThreads, __kmp_threads,
3566  __kmp_threads_capacity * sizeof(kmp_info_t *));
3567  KMP_MEMCPY(newRoot, __kmp_root,
3568  __kmp_threads_capacity * sizeof(kmp_root_t *));
3569 
3570  kmp_info_t **temp_threads = __kmp_threads;
3571  *(kmp_info_t * *volatile *)&__kmp_threads = newThreads;
3572  *(kmp_root_t * *volatile *)&__kmp_root = newRoot;
3573  __kmp_free(temp_threads);
3574  added += newCapacity - __kmp_threads_capacity;
3575  *(volatile int *)&__kmp_threads_capacity = newCapacity;
3576 
3577  if (newCapacity > __kmp_tp_capacity) {
3578  __kmp_acquire_bootstrap_lock(&__kmp_tp_cached_lock);
3579  if (__kmp_tp_cached && newCapacity > __kmp_tp_capacity) {
3580  __kmp_threadprivate_resize_cache(newCapacity);
3581  } else { // increase __kmp_tp_capacity to correspond with kmp_threads size
3582  *(volatile int *)&__kmp_tp_capacity = newCapacity;
3583  }
3584  __kmp_release_bootstrap_lock(&__kmp_tp_cached_lock);
3585  }
3586 
3587  return added;
3588 }
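// Growth example (illustrative numbers only): with __kmp_threads_capacity ==
// 64 and nNeed == 70, minimumRequiredCapacity is 134 and newCapacity doubles
// 64 -> 128 -> 256, clipped at __kmp_sys_max_nth if that is reached sooner.
// The new __kmp_threads and __kmp_root arrays share one allocation (newRoot
// points into the newThreads block), which matches the fact that only the old
// threads pointer is freed above.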
3589 
3590 /* Register the current thread as a root thread and obtain our gtid. We must
3591  have the __kmp_initz_lock held at this point. The argument is TRUE only if we
3592  are the thread that calls from __kmp_do_serial_initialize() */
3593 int __kmp_register_root(int initial_thread) {
3594  kmp_info_t *root_thread;
3595  kmp_root_t *root;
3596  int gtid;
3597  int capacity;
3598  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
3599  KA_TRACE(20, ("__kmp_register_root: entered\n"));
3600  KMP_MB();
3601 
3602  /* 2007-03-02:
3603  If the initial thread did not invoke the OpenMP RTL yet, and this thread is
3604  not an initial one, the "__kmp_all_nth >= __kmp_threads_capacity" condition
3605  does not work as expected -- it may return false (meaning there is at least
3606  one empty slot in the __kmp_threads array), but it is possible that the only
3607  free slot is #0, which is reserved for the initial thread and so cannot be
3608  used for this one. The following code works around this bug.
3609 
3610  However, the right solution seems to be not to reserve slot #0 for the
3611  initial thread because:
3612  (1) there is no magic in slot #0,
3613  (2) we cannot reliably detect the initial thread (the first thread that does
3614  serial initialization may not be a real initial thread).
3615  */
3616  capacity = __kmp_threads_capacity;
3617  if (!initial_thread && TCR_PTR(__kmp_threads[0]) == NULL) {
3618  --capacity;
3619  }
3620 
3621  /* see if there are too many threads */
3622  if (__kmp_all_nth >= capacity && !__kmp_expand_threads(1)) {
3623  if (__kmp_tp_cached) {
3624  __kmp_fatal(KMP_MSG(CantRegisterNewThread),
3625  KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity),
3626  KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null);
3627  } else {
3628  __kmp_fatal(KMP_MSG(CantRegisterNewThread), KMP_HNT(SystemLimitOnThreads),
3629  __kmp_msg_null);
3630  }
3631  }
3632 
3633  /* find an available thread slot */
3634  /* Don't reassign the zero slot since we need that to only be used by initial
3635  thread */
3636  for (gtid = (initial_thread ? 0 : 1); TCR_PTR(__kmp_threads[gtid]) != NULL;
3637  gtid++)
3638  ;
3639  KA_TRACE(1,
3640  ("__kmp_register_root: found slot in threads array: T#%d\n", gtid));
3641  KMP_ASSERT(gtid < __kmp_threads_capacity);
3642 
3643  /* update global accounting */
3644  __kmp_all_nth++;
3645  TCW_4(__kmp_nth, __kmp_nth + 1);
3646 
3647  // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low
3648  // numbers of procs, and method #2 (keyed API call) for higher numbers.
3649  if (__kmp_adjust_gtid_mode) {
3650  if (__kmp_all_nth >= __kmp_tls_gtid_min) {
3651  if (TCR_4(__kmp_gtid_mode) != 2) {
3652  TCW_4(__kmp_gtid_mode, 2);
3653  }
3654  } else {
3655  if (TCR_4(__kmp_gtid_mode) != 1) {
3656  TCW_4(__kmp_gtid_mode, 1);
3657  }
3658  }
3659  }
3660 
3661 #ifdef KMP_ADJUST_BLOCKTIME
3662  /* Adjust blocktime to zero if necessary */
3663  /* Middle initialization might not have occurred yet */
3664  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
3665  if (__kmp_nth > __kmp_avail_proc) {
3666  __kmp_zero_bt = TRUE;
3667  }
3668  }
3669 #endif /* KMP_ADJUST_BLOCKTIME */
3670 
3671  /* setup this new hierarchy */
3672  if (!(root = __kmp_root[gtid])) {
3673  root = __kmp_root[gtid] = (kmp_root_t *)__kmp_allocate(sizeof(kmp_root_t));
3674  KMP_DEBUG_ASSERT(!root->r.r_root_team);
3675  }
3676 
3677 #if KMP_STATS_ENABLED
3678  // Initialize stats as soon as possible (right after gtid assignment).
3679  __kmp_stats_thread_ptr = __kmp_stats_list->push_back(gtid);
3680  __kmp_stats_thread_ptr->startLife();
3681  KMP_SET_THREAD_STATE(SERIAL_REGION);
3682  KMP_INIT_PARTITIONED_TIMERS(OMP_serial);
3683 #endif
3684  __kmp_initialize_root(root);
3685 
3686  /* setup new root thread structure */
3687  if (root->r.r_uber_thread) {
3688  root_thread = root->r.r_uber_thread;
3689  } else {
3690  root_thread = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t));
3691  if (__kmp_storage_map) {
3692  __kmp_print_thread_storage_map(root_thread, gtid);
3693  }
3694  root_thread->th.th_info.ds.ds_gtid = gtid;
3695 #if OMPT_SUPPORT
3696  root_thread->th.ompt_thread_info.thread_data = ompt_data_none;
3697 #endif
3698  root_thread->th.th_root = root;
3699  if (__kmp_env_consistency_check) {
3700  root_thread->th.th_cons = __kmp_allocate_cons_stack(gtid);
3701  }
3702 #if USE_FAST_MEMORY
3703  __kmp_initialize_fast_memory(root_thread);
3704 #endif /* USE_FAST_MEMORY */
3705 
3706 #if KMP_USE_BGET
3707  KMP_DEBUG_ASSERT(root_thread->th.th_local.bget_data == NULL);
3708  __kmp_initialize_bget(root_thread);
3709 #endif
3710  __kmp_init_random(root_thread); // Initialize random number generator
3711  }
3712 
3713  /* setup the serial team held in reserve by the root thread */
3714  if (!root_thread->th.th_serial_team) {
3715  kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
3716  KF_TRACE(10, ("__kmp_register_root: before serial_team\n"));
3717  root_thread->th.th_serial_team = __kmp_allocate_team(
3718  root, 1, 1,
3719 #if OMPT_SUPPORT
3720  ompt_data_none, // root parallel id
3721 #endif
3722  proc_bind_default, &r_icvs, 0 USE_NESTED_HOT_ARG(NULL));
3723  }
3724  KMP_ASSERT(root_thread->th.th_serial_team);
3725  KF_TRACE(10, ("__kmp_register_root: after serial_team = %p\n",
3726  root_thread->th.th_serial_team));
3727 
3728  /* drop root_thread into place */
3729  TCW_SYNC_PTR(__kmp_threads[gtid], root_thread);
3730 
3731  root->r.r_root_team->t.t_threads[0] = root_thread;
3732  root->r.r_hot_team->t.t_threads[0] = root_thread;
3733  root_thread->th.th_serial_team->t.t_threads[0] = root_thread;
3734  // AC: the team is created in reserve, not for execution (it is unused for now).
3735  root_thread->th.th_serial_team->t.t_serialized = 0;
3736  root->r.r_uber_thread = root_thread;
3737 
3738  /* initialize the thread, get it ready to go */
3739  __kmp_initialize_info(root_thread, root->r.r_root_team, 0, gtid);
3740  TCW_4(__kmp_init_gtid, TRUE);
3741 
3742  /* prepare the master thread for get_gtid() */
3743  __kmp_gtid_set_specific(gtid);
3744 
3745 #if USE_ITT_BUILD
3746  __kmp_itt_thread_name(gtid);
3747 #endif /* USE_ITT_BUILD */
3748 
3749 #ifdef KMP_TDATA_GTID
3750  __kmp_gtid = gtid;
3751 #endif
3752  __kmp_create_worker(gtid, root_thread, __kmp_stksize);
3753  KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == gtid);
3754 
3755  KA_TRACE(20, ("__kmp_register_root: T#%d init T#%d(%d:%d) arrived: join=%u, "
3756  "plain=%u\n",
3757  gtid, __kmp_gtid_from_tid(0, root->r.r_hot_team),
3758  root->r.r_hot_team->t.t_id, 0, KMP_INIT_BARRIER_STATE,
3759  KMP_INIT_BARRIER_STATE));
3760  { // Initialize barrier data.
3761  int b;
3762  for (b = 0; b < bs_last_barrier; ++b) {
3763  root_thread->th.th_bar[b].bb.b_arrived = KMP_INIT_BARRIER_STATE;
3764 #if USE_DEBUGGER
3765  root_thread->th.th_bar[b].bb.b_worker_arrived = 0;
3766 #endif
3767  }
3768  }
3769  KMP_DEBUG_ASSERT(root->r.r_hot_team->t.t_bar[bs_forkjoin_barrier].b_arrived ==
3770  KMP_INIT_BARRIER_STATE);
3771 
3772 #if KMP_AFFINITY_SUPPORTED
3773  root_thread->th.th_current_place = KMP_PLACE_UNDEFINED;
3774  root_thread->th.th_new_place = KMP_PLACE_UNDEFINED;
3775  root_thread->th.th_first_place = KMP_PLACE_UNDEFINED;
3776  root_thread->th.th_last_place = KMP_PLACE_UNDEFINED;
3777  if (TCR_4(__kmp_init_middle)) {
3778  __kmp_affinity_set_init_mask(gtid, TRUE);
3779  }
3780 #endif /* KMP_AFFINITY_SUPPORTED */
3781  root_thread->th.th_def_allocator = __kmp_def_allocator;
3782  root_thread->th.th_prev_level = 0;
3783  root_thread->th.th_prev_num_threads = 1;
3784 
3785  kmp_cg_root_t *tmp = (kmp_cg_root_t *)__kmp_allocate(sizeof(kmp_cg_root_t));
3786  tmp->cg_root = root_thread;
3787  tmp->cg_thread_limit = __kmp_cg_max_nth;
3788  tmp->cg_nthreads = 1;
3789  KA_TRACE(100, ("__kmp_register_root: Thread %p created node %p with"
3790  " cg_nthreads init to 1\n",
3791  root_thread, tmp));
3792  tmp->up = NULL;
3793  root_thread->th.th_cg_roots = tmp;
3794 
3795  __kmp_root_counter++;
3796 
3797 #if OMPT_SUPPORT
3798  if (!initial_thread && ompt_enabled.enabled) {
3799 
3800  kmp_info_t *root_thread = ompt_get_thread();
3801 
3802  ompt_set_thread_state(root_thread, ompt_state_overhead);
3803 
3804  if (ompt_enabled.ompt_callback_thread_begin) {
3805  ompt_callbacks.ompt_callback(ompt_callback_thread_begin)(
3806  ompt_thread_initial, __ompt_get_thread_data_internal());
3807  }
3808  ompt_data_t *task_data;
3809  ompt_data_t *parallel_data;
3810  __ompt_get_task_info_internal(0, NULL, &task_data, NULL, &parallel_data, NULL);
3811  if (ompt_enabled.ompt_callback_implicit_task) {
3812  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
3813  ompt_scope_begin, parallel_data, task_data, 1, 1, ompt_task_initial);
3814  }
3815 
3816  ompt_set_thread_state(root_thread, ompt_state_work_serial);
3817  }
3818 #endif
3819 
3820  KMP_MB();
3821  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
3822 
3823  return gtid;
3824 }
3825 
3826 #if KMP_NESTED_HOT_TEAMS
3827 static int __kmp_free_hot_teams(kmp_root_t *root, kmp_info_t *thr, int level,
3828  const int max_level) {
3829  int i, n, nth;
3830  kmp_hot_team_ptr_t *hot_teams = thr->th.th_hot_teams;
3831  if (!hot_teams || !hot_teams[level].hot_team) {
3832  return 0;
3833  }
3834  KMP_DEBUG_ASSERT(level < max_level);
3835  kmp_team_t *team = hot_teams[level].hot_team;
3836  nth = hot_teams[level].hot_team_nth;
3837  n = nth - 1; // master is not freed
3838  if (level < max_level - 1) {
3839  for (i = 0; i < nth; ++i) {
3840  kmp_info_t *th = team->t.t_threads[i];
3841  n += __kmp_free_hot_teams(root, th, level + 1, max_level);
3842  if (i > 0 && th->th.th_hot_teams) {
3843  __kmp_free(th->th.th_hot_teams);
3844  th->th.th_hot_teams = NULL;
3845  }
3846  }
3847  }
3848  __kmp_free_team(root, team, NULL);
3849  return n;
3850 }
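// Descriptive note: the recursion above tears nested hot teams down
// depth-first. At each level the workers are counted (the master is excluded,
// hence nth - 1), each worker's own hot_teams array is released, and finally
// the team itself is handed back via __kmp_free_team; the return value
// accumulates the worker count so __kmp_reset_root can report how many
// __kmp_threads entries were freed.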
3851 #endif
3852 
3853 // Resets a root thread and clears its root and hot teams.
3854 // Returns the number of __kmp_threads entries directly and indirectly freed.
3855 static int __kmp_reset_root(int gtid, kmp_root_t *root) {
3856  kmp_team_t *root_team = root->r.r_root_team;
3857  kmp_team_t *hot_team = root->r.r_hot_team;
3858  int n = hot_team->t.t_nproc;
3859  int i;
3860 
3861  KMP_DEBUG_ASSERT(!root->r.r_active);
3862 
3863  root->r.r_root_team = NULL;
3864  root->r.r_hot_team = NULL;
3865  // __kmp_free_team() does not free hot teams, so we have to clear r_hot_team
3866  // before call to __kmp_free_team().
3867  __kmp_free_team(root, root_team USE_NESTED_HOT_ARG(NULL));
3868 #if KMP_NESTED_HOT_TEAMS
3869  if (__kmp_hot_teams_max_level >
3870  0) { // need to free nested hot teams and their threads if any
3871  for (i = 0; i < hot_team->t.t_nproc; ++i) {
3872  kmp_info_t *th = hot_team->t.t_threads[i];
3873  if (__kmp_hot_teams_max_level > 1) {
3874  n += __kmp_free_hot_teams(root, th, 1, __kmp_hot_teams_max_level);
3875  }
3876  if (th->th.th_hot_teams) {
3877  __kmp_free(th->th.th_hot_teams);
3878  th->th.th_hot_teams = NULL;
3879  }
3880  }
3881  }
3882 #endif
3883  __kmp_free_team(root, hot_team USE_NESTED_HOT_ARG(NULL));
3884 
3885  // Before we can reap the thread, we need to make certain that all other
3886  // threads in the teams that had this root as ancestor have stopped trying to
3887  // steal tasks.
3888  if (__kmp_tasking_mode != tskm_immediate_exec) {
3889  __kmp_wait_to_unref_task_teams();
3890  }
3891 
3892 #if KMP_OS_WINDOWS
3893  /* Close Handle of root duplicated in __kmp_create_worker (tr #62919) */
3894  KA_TRACE(
3895  10, ("__kmp_reset_root: free handle, th = %p, handle = %" KMP_UINTPTR_SPEC
3896  "\n",
3897  (LPVOID) & (root->r.r_uber_thread->th),
3898  root->r.r_uber_thread->th.th_info.ds.ds_thread));
3899  __kmp_free_handle(root->r.r_uber_thread->th.th_info.ds.ds_thread);
3900 #endif /* KMP_OS_WINDOWS */
3901 
3902 #if OMPT_SUPPORT
3903  ompt_data_t *task_data;
3904  ompt_data_t *parallel_data;
3905  __ompt_get_task_info_internal(0, NULL, &task_data, NULL, &parallel_data, NULL);
3906  if (ompt_enabled.ompt_callback_implicit_task) {
3907  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
3908  ompt_scope_end, parallel_data, task_data, 0, 1, ompt_task_initial);
3909  }
3910  if (ompt_enabled.ompt_callback_thread_end) {
3911  ompt_callbacks.ompt_callback(ompt_callback_thread_end)(
3912  &(root->r.r_uber_thread->th.ompt_thread_info.thread_data));
3913  }
3914 #endif
3915 
3916  TCW_4(__kmp_nth,
3917  __kmp_nth - 1); // __kmp_reap_thread will decrement __kmp_all_nth.
3918  i = root->r.r_uber_thread->th.th_cg_roots->cg_nthreads--;
3919  KA_TRACE(100, ("__kmp_reset_root: Thread %p decrement cg_nthreads on node %p"
3920  " to %d\n",
3921  root->r.r_uber_thread, root->r.r_uber_thread->th.th_cg_roots,
3922  root->r.r_uber_thread->th.th_cg_roots->cg_nthreads));
3923  if (i == 1) {
3924  // need to free contention group structure
3925  KMP_DEBUG_ASSERT(root->r.r_uber_thread ==
3926  root->r.r_uber_thread->th.th_cg_roots->cg_root);
3927  KMP_DEBUG_ASSERT(root->r.r_uber_thread->th.th_cg_roots->up == NULL);
3928  __kmp_free(root->r.r_uber_thread->th.th_cg_roots);
3929  root->r.r_uber_thread->th.th_cg_roots = NULL;
3930  }
3931  __kmp_reap_thread(root->r.r_uber_thread, 1);
3932 
3933  // We cannot put the root thread into __kmp_thread_pool, so we have to reap
3934  // it instead of freeing it.
3935  root->r.r_uber_thread = NULL;
3936  /* mark root as no longer in use */
3937  root->r.r_begin = FALSE;
3938 
3939  return n;
3940 }
3941 
3942 void __kmp_unregister_root_current_thread(int gtid) {
3943  KA_TRACE(1, ("__kmp_unregister_root_current_thread: enter T#%d\n", gtid));
3944  /* This lock should be OK, since unregister_root_current_thread is never
3945  called during an abort, only during a normal close. Furthermore, if you
3946  hold the forkjoin lock, you should never try to acquire the initz lock */
3947  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
3948  if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
3949  KC_TRACE(10, ("__kmp_unregister_root_current_thread: already finished, "
3950  "exiting T#%d\n",
3951  gtid));
3952  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
3953  return;
3954  }
3955  kmp_root_t *root = __kmp_root[gtid];
3956 
3957  KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
3958  KMP_ASSERT(KMP_UBER_GTID(gtid));
3959  KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root);
3960  KMP_ASSERT(root->r.r_active == FALSE);
3961 
3962  KMP_MB();
3963 
3964  kmp_info_t *thread = __kmp_threads[gtid];
3965  kmp_team_t *team = thread->th.th_team;
3966  kmp_task_team_t *task_team = thread->th.th_task_team;
3967 
3968  // we need to wait for the proxy tasks before finishing the thread
3969  if (task_team != NULL && task_team->tt.tt_found_proxy_tasks) {
3970 #if OMPT_SUPPORT
3971  // the runtime is shutting down so we won't report any events
3972  thread->th.ompt_thread_info.state = ompt_state_undefined;
3973 #endif
3974  __kmp_task_team_wait(thread, team USE_ITT_BUILD_ARG(NULL));
3975  }
3976 
3977  __kmp_reset_root(gtid, root);
3978 
3979  /* free up this thread slot */
3980  __kmp_gtid_set_specific(KMP_GTID_DNE);
3981 #ifdef KMP_TDATA_GTID
3982  __kmp_gtid = KMP_GTID_DNE;
3983 #endif
3984 
3985  KMP_MB();
3986  KC_TRACE(10,
3987  ("__kmp_unregister_root_current_thread: T#%d unregistered\n", gtid));
3988 
3989  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
3990 }
3991 
3992 #if KMP_OS_WINDOWS
3993 /* __kmp_forkjoin_lock must already be held.
3994  Unregisters a root thread that is not the current thread. Returns the number
3995  of __kmp_threads entries freed as a result. */
3996 static int __kmp_unregister_root_other_thread(int gtid) {
3997  kmp_root_t *root = __kmp_root[gtid];
3998  int r;
3999 
4000  KA_TRACE(1, ("__kmp_unregister_root_other_thread: enter T#%d\n", gtid));
4001  KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
4002  KMP_ASSERT(KMP_UBER_GTID(gtid));
4003  KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root);
4004  KMP_ASSERT(root->r.r_active == FALSE);
4005 
4006  r = __kmp_reset_root(gtid, root);
4007  KC_TRACE(10,
4008  ("__kmp_unregister_root_other_thread: T#%d unregistered\n", gtid));
4009  return r;
4010 }
4011 #endif
4012 
4013 #if KMP_DEBUG
4014 void __kmp_task_info() {
4015 
4016  kmp_int32 gtid = __kmp_entry_gtid();
4017  kmp_int32 tid = __kmp_tid_from_gtid(gtid);
4018  kmp_info_t *this_thr = __kmp_threads[gtid];
4019  kmp_team_t *steam = this_thr->th.th_serial_team;
4020  kmp_team_t *team = this_thr->th.th_team;
4021 
4022  __kmp_printf(
4023  "__kmp_task_info: gtid=%d tid=%d t_thread=%p team=%p steam=%p curtask=%p "
4024  "ptask=%p\n",
4025  gtid, tid, this_thr, team, steam, this_thr->th.th_current_task,
4026  team->t.t_implicit_task_taskdata[tid].td_parent);
4027 }
4028 #endif // KMP_DEBUG
4029 
4030 /* TODO optimize with one big memclr, take out what isn't needed, split
4031  responsibility to workers as much as possible, and delay initialization of
4032  features as much as possible */
4033 static void __kmp_initialize_info(kmp_info_t *this_thr, kmp_team_t *team,
4034  int tid, int gtid) {
4035  /* this_thr->th.th_info.ds.ds_gtid is set up in
4036  kmp_allocate_thread/create_worker.
4037  this_thr->th.th_serial_team is set up in __kmp_allocate_thread */
4038  kmp_info_t *master = team->t.t_threads[0];
4039  KMP_DEBUG_ASSERT(this_thr != NULL);
4040  KMP_DEBUG_ASSERT(this_thr->th.th_serial_team);
4041  KMP_DEBUG_ASSERT(team);
4042  KMP_DEBUG_ASSERT(team->t.t_threads);
4043  KMP_DEBUG_ASSERT(team->t.t_dispatch);
4044  KMP_DEBUG_ASSERT(master);
4045  KMP_DEBUG_ASSERT(master->th.th_root);
4046 
4047  KMP_MB();
4048 
4049  TCW_SYNC_PTR(this_thr->th.th_team, team);
4050 
4051  this_thr->th.th_info.ds.ds_tid = tid;
4052  this_thr->th.th_set_nproc = 0;
4053  if (__kmp_tasking_mode != tskm_immediate_exec)
4054  // When tasking is possible, threads are not safe to reap until they are
4055  // done tasking; this will be set when tasking code is exited in wait
4056  this_thr->th.th_reap_state = KMP_NOT_SAFE_TO_REAP;
4057  else // no tasking --> always safe to reap
4058  this_thr->th.th_reap_state = KMP_SAFE_TO_REAP;
4059  this_thr->th.th_set_proc_bind = proc_bind_default;
4060 #if KMP_AFFINITY_SUPPORTED
4061  this_thr->th.th_new_place = this_thr->th.th_current_place;
4062 #endif
4063  this_thr->th.th_root = master->th.th_root;
4064 
4065  /* setup the thread's cache of the team structure */
4066  this_thr->th.th_team_nproc = team->t.t_nproc;
4067  this_thr->th.th_team_master = master;
4068  this_thr->th.th_team_serialized = team->t.t_serialized;
4069  TCW_PTR(this_thr->th.th_sleep_loc, NULL);
4070 
4071  KMP_DEBUG_ASSERT(team->t.t_implicit_task_taskdata);
4072 
4073  KF_TRACE(10, ("__kmp_initialize_info1: T#%d:%d this_thread=%p curtask=%p\n",
4074  tid, gtid, this_thr, this_thr->th.th_current_task));
4075 
4076  __kmp_init_implicit_task(this_thr->th.th_team_master->th.th_ident, this_thr,
4077  team, tid, TRUE);
4078 
4079  KF_TRACE(10, ("__kmp_initialize_info2: T#%d:%d this_thread=%p curtask=%p\n",
4080  tid, gtid, this_thr, this_thr->th.th_current_task));
4081  // TODO: Initialize ICVs from parent; GEH - isn't that already done in
4082  // __kmp_initialize_team()?
4083 
4084  /* TODO no worksharing in speculative threads */
4085  this_thr->th.th_dispatch = &team->t.t_dispatch[tid];
4086 
4087  this_thr->th.th_local.this_construct = 0;
4088 
4089  if (!this_thr->th.th_pri_common) {
4090  this_thr->th.th_pri_common =
4091  (struct common_table *)__kmp_allocate(sizeof(struct common_table));
4092  if (__kmp_storage_map) {
4093  __kmp_print_storage_map_gtid(
4094  gtid, this_thr->th.th_pri_common, this_thr->th.th_pri_common + 1,
4095  sizeof(struct common_table), "th_%d.th_pri_common\n", gtid);
4096  }
4097  this_thr->th.th_pri_head = NULL;
4098  }
4099 
4100  if (this_thr != master && // Master's CG root is initialized elsewhere
4101  this_thr->th.th_cg_roots != master->th.th_cg_roots) { // CG root not set
4102  // Make new thread's CG root same as master's
4103  KMP_DEBUG_ASSERT(master->th.th_cg_roots);
4104  kmp_cg_root_t *tmp = this_thr->th.th_cg_roots;
4105  if (tmp) {
4106  // worker changes CG, need to check if old CG should be freed
4107  int i = tmp->cg_nthreads--;
4108  KA_TRACE(100, ("__kmp_initialize_info: Thread %p decrement cg_nthreads"
4109  " on node %p of thread %p to %d\n",
4110  this_thr, tmp, tmp->cg_root, tmp->cg_nthreads));
4111  if (i == 1) {
4112  __kmp_free(tmp); // last thread left CG --> free it
4113  }
4114  }
4115  this_thr->th.th_cg_roots = master->th.th_cg_roots;
4116  // Increment new thread's CG root's counter to add the new thread
4117  this_thr->th.th_cg_roots->cg_nthreads++;
4118  KA_TRACE(100, ("__kmp_initialize_info: Thread %p increment cg_nthreads on"
4119  " node %p of thread %p to %d\n",
4120  this_thr, this_thr->th.th_cg_roots,
4121  this_thr->th.th_cg_roots->cg_root,
4122  this_thr->th.th_cg_roots->cg_nthreads));
4123  this_thr->th.th_current_task->td_icvs.thread_limit =
4124  this_thr->th.th_cg_roots->cg_thread_limit;
4125  }
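  // Contention-group note for the block above: a pooled worker that joins a
  // team under a different CG root drops its old node (freeing it if it was
  // the last member), attaches to the master's CG root, and inherits that
  // root's cg_thread_limit as its thread_limit ICV.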
4126 
4127  /* Initialize dynamic dispatch */
4128  {
4129  volatile kmp_disp_t *dispatch = this_thr->th.th_dispatch;
4130  // Use team max_nproc since this will never change for the team.
4131  size_t disp_size =
4132  sizeof(dispatch_private_info_t) *
4133  (team->t.t_max_nproc == 1 ? 1 : __kmp_dispatch_num_buffers);
4134  KD_TRACE(10, ("__kmp_initialize_info: T#%d max_nproc: %d\n", gtid,
4135  team->t.t_max_nproc));
4136  KMP_ASSERT(dispatch);
4137  KMP_DEBUG_ASSERT(team->t.t_dispatch);
4138  KMP_DEBUG_ASSERT(dispatch == &team->t.t_dispatch[tid]);
4139 
4140  dispatch->th_disp_index = 0;
4141  dispatch->th_doacross_buf_idx = 0;
4142  if (!dispatch->th_disp_buffer) {
4143  dispatch->th_disp_buffer =
4144  (dispatch_private_info_t *)__kmp_allocate(disp_size);
4145 
4146  if (__kmp_storage_map) {
4147  __kmp_print_storage_map_gtid(
4148  gtid, &dispatch->th_disp_buffer[0],
4149  &dispatch->th_disp_buffer[team->t.t_max_nproc == 1
4150  ? 1
4151  : __kmp_dispatch_num_buffers],
4152  disp_size, "th_%d.th_dispatch.th_disp_buffer "
4153  "(team_%d.t_dispatch[%d].th_disp_buffer)",
4154  gtid, team->t.t_id, gtid);
4155  }
4156  } else {
4157  memset(&dispatch->th_disp_buffer[0], '\0', disp_size);
4158  }
4159 
4160  dispatch->th_dispatch_pr_current = 0;
4161  dispatch->th_dispatch_sh_current = 0;
4162 
4163  dispatch->th_deo_fcn = 0; /* ORDERED */
4164  dispatch->th_dxo_fcn = 0; /* END ORDERED */
4165  }
4166 
4167  this_thr->th.th_next_pool = NULL;
4168 
4169  if (!this_thr->th.th_task_state_memo_stack) {
4170  size_t i;
4171  this_thr->th.th_task_state_memo_stack =
4172  (kmp_uint8 *)__kmp_allocate(4 * sizeof(kmp_uint8));
4173  this_thr->th.th_task_state_top = 0;
4174  this_thr->th.th_task_state_stack_sz = 4;
4175  for (i = 0; i < this_thr->th.th_task_state_stack_sz;
4176  ++i) // zero init the stack
4177  this_thr->th.th_task_state_memo_stack[i] = 0;
4178  }
4179 
4180  KMP_DEBUG_ASSERT(!this_thr->th.th_spin_here);
4181  KMP_DEBUG_ASSERT(this_thr->th.th_next_waiting == 0);
4182 
4183  KMP_MB();
4184 }
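// Dispatch-buffer sizing note: a serialized team (t_max_nproc == 1) gets a
// single dispatch_private_info_t, while a real team gets
// __kmp_dispatch_num_buffers of them, presumably so that several dynamically
// scheduled loops can be in flight before the oldest buffer is reused.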
4185 
4186 /* Allocate a new thread for the requesting team. This is only called from
4187  within a forkjoin critical section. We first try to get an available thread
4188  from the thread pool; if none is available, we fork a new one, assuming we
4189  are able to create one. This should be assured, as the caller is expected to
4190  check for that first. */
4191 kmp_info_t *__kmp_allocate_thread(kmp_root_t *root, kmp_team_t *team,
4192  int new_tid) {
4193  kmp_team_t *serial_team;
4194  kmp_info_t *new_thr;
4195  int new_gtid;
4196 
4197  KA_TRACE(20, ("__kmp_allocate_thread: T#%d\n", __kmp_get_gtid()));
4198  KMP_DEBUG_ASSERT(root && team);
4199 #if !KMP_NESTED_HOT_TEAMS
4200  KMP_DEBUG_ASSERT(KMP_MASTER_GTID(__kmp_get_gtid()));
4201 #endif
4202  KMP_MB();
4203 
4204  /* first, try to get one from the thread pool */
4205  if (__kmp_thread_pool) {
4206  new_thr = CCAST(kmp_info_t *, __kmp_thread_pool);
4207  __kmp_thread_pool = (volatile kmp_info_t *)new_thr->th.th_next_pool;
4208  if (new_thr == __kmp_thread_pool_insert_pt) {
4209  __kmp_thread_pool_insert_pt = NULL;
4210  }
4211  TCW_4(new_thr->th.th_in_pool, FALSE);
4212  __kmp_suspend_initialize_thread(new_thr);
4213  __kmp_lock_suspend_mx(new_thr);
4214  if (new_thr->th.th_active_in_pool == TRUE) {
4215  KMP_DEBUG_ASSERT(new_thr->th.th_active == TRUE);
4216  KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth);
4217  new_thr->th.th_active_in_pool = FALSE;
4218  }
4219  __kmp_unlock_suspend_mx(new_thr);
4220 
4221  KA_TRACE(20, ("__kmp_allocate_thread: T#%d using thread T#%d\n",
4222  __kmp_get_gtid(), new_thr->th.th_info.ds.ds_gtid));
4223  KMP_ASSERT(!new_thr->th.th_team);
4224  KMP_DEBUG_ASSERT(__kmp_nth < __kmp_threads_capacity);
4225 
4226  /* setup the thread structure */
4227  __kmp_initialize_info(new_thr, team, new_tid,
4228  new_thr->th.th_info.ds.ds_gtid);
4229  KMP_DEBUG_ASSERT(new_thr->th.th_serial_team);
4230 
4231  TCW_4(__kmp_nth, __kmp_nth + 1);
4232 
4233  new_thr->th.th_task_state = 0;
4234  new_thr->th.th_task_state_top = 0;
4235  new_thr->th.th_task_state_stack_sz = 4;
4236 
4237 #ifdef KMP_ADJUST_BLOCKTIME
4238  /* Adjust blocktime back to zero if necessary */
4239  /* Middle initialization might not have occurred yet */
4240  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
4241  if (__kmp_nth > __kmp_avail_proc) {
4242  __kmp_zero_bt = TRUE;
4243  }
4244  }
4245 #endif /* KMP_ADJUST_BLOCKTIME */
4246 
4247 #if KMP_DEBUG
4248  // If the thread entered the pool via __kmp_free_thread, its wait_flag should
4249  // not equal KMP_BARRIER_PARENT_FLAG.
4250  int b;
4251  kmp_balign_t *balign = new_thr->th.th_bar;
4252  for (b = 0; b < bs_last_barrier; ++b)
4253  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
4254 #endif
4255 
4256  KF_TRACE(10, ("__kmp_allocate_thread: T#%d using thread %p T#%d\n",
4257  __kmp_get_gtid(), new_thr, new_thr->th.th_info.ds.ds_gtid));
4258 
4259  KMP_MB();
4260  return new_thr;
4261  }
4262 
4263  /* no, we'll fork a new one */
4264  KMP_ASSERT(__kmp_nth == __kmp_all_nth);
4265  KMP_ASSERT(__kmp_all_nth < __kmp_threads_capacity);
4266 
4267 #if KMP_USE_MONITOR
4268  // If this is the first worker thread the RTL is creating, then also
4269  // launch the monitor thread. We try to do this as early as possible.
4270  if (!TCR_4(__kmp_init_monitor)) {
4271  __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
4272  if (!TCR_4(__kmp_init_monitor)) {
4273  KF_TRACE(10, ("before __kmp_create_monitor\n"));
4274  TCW_4(__kmp_init_monitor, 1);
4275  __kmp_create_monitor(&__kmp_monitor);
4276  KF_TRACE(10, ("after __kmp_create_monitor\n"));
4277 #if KMP_OS_WINDOWS
4278  // AC: wait until monitor has started. This is a fix for CQ232808.
4279  // The reason is that if the library is loaded/unloaded in a loop with
4280  // small (parallel) work in between, then there is a high probability that
4281  // the monitor thread starts after the library has shut down. At shutdown it
4282  // is too late to cope with the problem, because when the master is in
4283  // DllMain (process detach) the monitor has no chance to start (it is
4284  // blocked), and the master has no means to inform the monitor that the
4285  // library has gone, because all the memory the monitor can access is going
4286  // to be released/reset.
4287  while (TCR_4(__kmp_init_monitor) < 2) {
4288  KMP_YIELD(TRUE);
4289  }
4290  KF_TRACE(10, ("after monitor thread has started\n"));
4291 #endif
4292  }
4293  __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
4294  }
4295 #endif
4296 
4297  KMP_MB();
4298  for (new_gtid = 1; TCR_PTR(__kmp_threads[new_gtid]) != NULL; ++new_gtid) {
4299  KMP_DEBUG_ASSERT(new_gtid < __kmp_threads_capacity);
4300  }
4301 
4302  /* allocate space for it. */
4303  new_thr = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t));
4304 
4305  TCW_SYNC_PTR(__kmp_threads[new_gtid], new_thr);
4306 
4307 #if USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG
4308  // suppress race-condition detection on synchronization flags in debug mode;
4309  // this helps to analyze library internals by eliminating false positives
4310  __itt_suppress_mark_range(
4311  __itt_suppress_range, __itt_suppress_threading_errors,
4312  &new_thr->th.th_sleep_loc, sizeof(new_thr->th.th_sleep_loc));
4313  __itt_suppress_mark_range(
4314  __itt_suppress_range, __itt_suppress_threading_errors,
4315  &new_thr->th.th_reap_state, sizeof(new_thr->th.th_reap_state));
4316 #if KMP_OS_WINDOWS
4317  __itt_suppress_mark_range(
4318  __itt_suppress_range, __itt_suppress_threading_errors,
4319  &new_thr->th.th_suspend_init, sizeof(new_thr->th.th_suspend_init));
4320 #else
4321  __itt_suppress_mark_range(__itt_suppress_range,
4322  __itt_suppress_threading_errors,
4323  &new_thr->th.th_suspend_init_count,
4324  sizeof(new_thr->th.th_suspend_init_count));
4325 #endif
4326  // TODO: check if we need to also suppress b_arrived flags
4327  __itt_suppress_mark_range(__itt_suppress_range,
4328  __itt_suppress_threading_errors,
4329  CCAST(kmp_uint64 *, &new_thr->th.th_bar[0].bb.b_go),
4330  sizeof(new_thr->th.th_bar[0].bb.b_go));
4331  __itt_suppress_mark_range(__itt_suppress_range,
4332  __itt_suppress_threading_errors,
4333  CCAST(kmp_uint64 *, &new_thr->th.th_bar[1].bb.b_go),
4334  sizeof(new_thr->th.th_bar[1].bb.b_go));
4335  __itt_suppress_mark_range(__itt_suppress_range,
4336  __itt_suppress_threading_errors,
4337  CCAST(kmp_uint64 *, &new_thr->th.th_bar[2].bb.b_go),
4338  sizeof(new_thr->th.th_bar[2].bb.b_go));
4339 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG */
4340  if (__kmp_storage_map) {
4341  __kmp_print_thread_storage_map(new_thr, new_gtid);
4342  }
4343 
4344  // add the reserve serialized team, initialized from the team's master thread
4345  {
4346  kmp_internal_control_t r_icvs = __kmp_get_x_global_icvs(team);
4347  KF_TRACE(10, ("__kmp_allocate_thread: before th_serial/serial_team\n"));
4348  new_thr->th.th_serial_team = serial_team =
4349  (kmp_team_t *)__kmp_allocate_team(root, 1, 1,
4350 #if OMPT_SUPPORT
4351  ompt_data_none, // root parallel id
4352 #endif
4353  proc_bind_default, &r_icvs,
4354  0 USE_NESTED_HOT_ARG(NULL));
4355  }
4356  KMP_ASSERT(serial_team);
4357  serial_team->t.t_serialized = 0; // AC: the team is created in reserve, not
4358  // for execution (it is unused for now).
4359  serial_team->t.t_threads[0] = new_thr;
4360  KF_TRACE(10,
4361  ("__kmp_allocate_thread: after th_serial/serial_team : new_thr=%p\n",
4362  new_thr));
4363 
4364  /* setup the thread structures */
4365  __kmp_initialize_info(new_thr, team, new_tid, new_gtid);
4366 
4367 #if USE_FAST_MEMORY
4368  __kmp_initialize_fast_memory(new_thr);
4369 #endif /* USE_FAST_MEMORY */
4370 
4371 #if KMP_USE_BGET
4372  KMP_DEBUG_ASSERT(new_thr->th.th_local.bget_data == NULL);
4373  __kmp_initialize_bget(new_thr);
4374 #endif
4375 
4376  __kmp_init_random(new_thr); // Initialize random number generator
4377 
4378  /* Initialize these only once when thread is grabbed for a team allocation */
4379  KA_TRACE(20,
4380  ("__kmp_allocate_thread: T#%d init go fork=%u, plain=%u\n",
4381  __kmp_get_gtid(), KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
4382 
4383  int b;
4384  kmp_balign_t *balign = new_thr->th.th_bar;
4385  for (b = 0; b < bs_last_barrier; ++b) {
4386  balign[b].bb.b_go = KMP_INIT_BARRIER_STATE;
4387  balign[b].bb.team = NULL;
4388  balign[b].bb.wait_flag = KMP_BARRIER_NOT_WAITING;
4389  balign[b].bb.use_oncore_barrier = 0;
4390  }
4391 
4392  new_thr->th.th_spin_here = FALSE;
4393  new_thr->th.th_next_waiting = 0;
4394 #if KMP_OS_UNIX
4395  new_thr->th.th_blocking = false;
4396 #endif
4397 
4398 #if KMP_AFFINITY_SUPPORTED
4399  new_thr->th.th_current_place = KMP_PLACE_UNDEFINED;
4400  new_thr->th.th_new_place = KMP_PLACE_UNDEFINED;
4401  new_thr->th.th_first_place = KMP_PLACE_UNDEFINED;
4402  new_thr->th.th_last_place = KMP_PLACE_UNDEFINED;
4403 #endif
4404  new_thr->th.th_def_allocator = __kmp_def_allocator;
4405  new_thr->th.th_prev_level = 0;
4406  new_thr->th.th_prev_num_threads = 1;
4407 
4408  TCW_4(new_thr->th.th_in_pool, FALSE);
4409  new_thr->th.th_active_in_pool = FALSE;
4410  TCW_4(new_thr->th.th_active, TRUE);
4411 
4412  /* adjust the global counters */
4413  __kmp_all_nth++;
4414  __kmp_nth++;
4415 
4416  // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low
4417  // numbers of procs, and method #2 (keyed API call) for higher numbers.
4418  if (__kmp_adjust_gtid_mode) {
4419  if (__kmp_all_nth >= __kmp_tls_gtid_min) {
4420  if (TCR_4(__kmp_gtid_mode) != 2) {
4421  TCW_4(__kmp_gtid_mode, 2);
4422  }
4423  } else {
4424  if (TCR_4(__kmp_gtid_mode) != 1) {
4425  TCW_4(__kmp_gtid_mode, 1);
4426  }
4427  }
4428  }
4429 
4430 #ifdef KMP_ADJUST_BLOCKTIME
4431  /* Adjust blocktime back to zero if necessary */
4432  /* Middle initialization might not have occurred yet */
4433  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
4434  if (__kmp_nth > __kmp_avail_proc) {
4435  __kmp_zero_bt = TRUE;
4436  }
4437  }
4438 #endif /* KMP_ADJUST_BLOCKTIME */
4439 
4440  /* actually fork it and create the new worker thread */
4441  KF_TRACE(
4442  10, ("__kmp_allocate_thread: before __kmp_create_worker: %p\n", new_thr));
4443  __kmp_create_worker(new_gtid, new_thr, __kmp_stksize);
4444  KF_TRACE(10,
4445  ("__kmp_allocate_thread: after __kmp_create_worker: %p\n", new_thr));
4446 
4447  KA_TRACE(20, ("__kmp_allocate_thread: T#%d forked T#%d\n", __kmp_get_gtid(),
4448  new_gtid));
4449  KMP_MB();
4450  return new_thr;
4451 }
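// Usage sketch (hypothetical call shape, not copied from the caller): the
// team-allocation code invokes this once per missing worker, roughly
//   for (f = team->t.t_nproc; f < new_nproc; ++f)
//     team->t.t_threads[f] = __kmp_allocate_thread(root, team, f);
// Reuse from __kmp_thread_pool is preferred; a brand-new worker is forked only
// when the pool is empty, and the caller is expected to have checked capacity
// beforehand (see the comment at the top of this function).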
4452 
4453 /* Reinitialize team for reuse.
4454  The hot team code calls this routine at every fork barrier, so EPCC barrier
4455  tests are extremely sensitive to changes in it, especially writes to the
4456  team struct, which cause a cache invalidation in all threads.
4457  IF YOU TOUCH THIS ROUTINE, RUN EPCC C SYNCBENCH ON A BIG-IRON MACHINE!!! */
4458 static void __kmp_reinitialize_team(kmp_team_t *team,
4459  kmp_internal_control_t *new_icvs,
4460  ident_t *loc) {
4461  KF_TRACE(10, ("__kmp_reinitialize_team: enter this_thread=%p team=%p\n",
4462  team->t.t_threads[0], team));
4463  KMP_DEBUG_ASSERT(team && new_icvs);
4464  KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc);
4465  KMP_CHECK_UPDATE(team->t.t_ident, loc);
4466 
4467  KMP_CHECK_UPDATE(team->t.t_id, KMP_GEN_TEAM_ID());
4468  // Copy ICVs to the master thread's implicit taskdata
4469  __kmp_init_implicit_task(loc, team->t.t_threads[0], team, 0, FALSE);
4470  copy_icvs(&team->t.t_implicit_task_taskdata[0].td_icvs, new_icvs);
4471 
4472  KF_TRACE(10, ("__kmp_reinitialize_team: exit this_thread=%p team=%p\n",
4473  team->t.t_threads[0], team));
4474 }
4475 
4476 /* Initialize the team data structure.
4477  This assumes the t_threads and t_max_nproc are already set.
4478  Also, we don't touch the arguments */
4479 static void __kmp_initialize_team(kmp_team_t *team, int new_nproc,
4480  kmp_internal_control_t *new_icvs,
4481  ident_t *loc) {
4482  KF_TRACE(10, ("__kmp_initialize_team: enter: team=%p\n", team));
4483 
4484  /* verify */
4485  KMP_DEBUG_ASSERT(team);
4486  KMP_DEBUG_ASSERT(new_nproc <= team->t.t_max_nproc);
4487  KMP_DEBUG_ASSERT(team->t.t_threads);
4488  KMP_MB();
4489 
4490  team->t.t_master_tid = 0; /* not needed */
4491  /* team->t.t_master_bar; not needed */
4492  team->t.t_serialized = new_nproc > 1 ? 0 : 1;
4493  team->t.t_nproc = new_nproc;
4494 
4495  /* team->t.t_parent = NULL; TODO not needed & would mess up hot team */
4496  team->t.t_next_pool = NULL;
4497  /* memset( team->t.t_threads, 0, sizeof(kmp_info_t*)*new_nproc ); would mess
4498  * up hot team */
4499 
4500  TCW_SYNC_PTR(team->t.t_pkfn, NULL); /* not needed */
4501  team->t.t_invoke = NULL; /* not needed */
4502 
4503  // TODO???: team->t.t_max_active_levels = new_max_active_levels;
4504  team->t.t_sched.sched = new_icvs->sched.sched;
4505 
4506 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
4507  team->t.t_fp_control_saved = FALSE; /* not needed */
4508  team->t.t_x87_fpu_control_word = 0; /* not needed */
4509  team->t.t_mxcsr = 0; /* not needed */
4510 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
4511 
4512  team->t.t_construct = 0;
4513 
4514  team->t.t_ordered.dt.t_value = 0;
4515  team->t.t_master_active = FALSE;
4516 
4517 #ifdef KMP_DEBUG
4518  team->t.t_copypriv_data = NULL; /* not necessary, but nice for debugging */
4519 #endif
4520 #if KMP_OS_WINDOWS
4521  team->t.t_copyin_counter = 0; /* for barrier-free copyin implementation */
4522 #endif
4523 
4524  team->t.t_control_stack_top = NULL;
4525 
4526  __kmp_reinitialize_team(team, new_icvs, loc);
4527 
4528  KMP_MB();
4529  KF_TRACE(10, ("__kmp_initialize_team: exit: team=%p\n", team));
4530 }
4531 
4532 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
4533 /* Sets the full mask for the thread and returns the old mask via old_mask; no changes to structures. */
4534 static void
4535 __kmp_set_thread_affinity_mask_full_tmp(kmp_affin_mask_t *old_mask) {
4536  if (KMP_AFFINITY_CAPABLE()) {
4537  int status;
4538  if (old_mask != NULL) {
4539  status = __kmp_get_system_affinity(old_mask, TRUE);
4540  int error = errno;
4541  if (status != 0) {
4542  __kmp_fatal(KMP_MSG(ChangeThreadAffMaskError), KMP_ERR(error),
4543  __kmp_msg_null);
4544  }
4545  }
4546  __kmp_set_system_affinity(__kmp_affin_fullMask, TRUE);
4547  }
4548 }
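/* Usage sketch (illustration only, not compiled): callers pair this helper with
   a save/restore of the original mask, as __kmp_allocate_team does below when
   growing a hot team on Linux/FreeBSD:

     kmp_affin_mask_t *old_mask;
     KMP_CPU_ALLOC(old_mask);
     __kmp_set_thread_affinity_mask_full_tmp(old_mask); // save old, set full
     // ... create worker threads; they inherit the temporary full mask ...
     __kmp_set_system_affinity(old_mask, TRUE);         // restore saved mask
     KMP_CPU_FREE(old_mask);
*/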
4549 #endif
4550 
4551 #if KMP_AFFINITY_SUPPORTED
4552 
4553 // __kmp_partition_places() is the heart of the OpenMP 4.0 affinity mechanism.
4554 // It calculates the worker + master thread's partition based upon the parent
4555 // thread's partition, and binds each worker to a place in its partition.
4556 // The master thread's partition should already include its current binding.
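// A minimal sketch (illustration only) of the place increment that the
// proc_bind_close and proc_bind_spread cases below repeat inline: advance
// round-robin through the partition [first_place, last_place], which may wrap
// around the end of the global list of __kmp_affinity_num_masks places.
//
//   static int next_place(int place, int first_place, int last_place,
//                         int num_masks) {
//     if (place == last_place)
//       return first_place; // wrap within the partition
//     if (place == num_masks - 1)
//       return 0; // wrap around the end of the place list
//     return place + 1;
//   }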
4557 static void __kmp_partition_places(kmp_team_t *team, int update_master_only) {
4558  // Copy the master thread's place partition to the team struct
4559  kmp_info_t *master_th = team->t.t_threads[0];
4560  KMP_DEBUG_ASSERT(master_th != NULL);
4561  kmp_proc_bind_t proc_bind = team->t.t_proc_bind;
4562  int first_place = master_th->th.th_first_place;
4563  int last_place = master_th->th.th_last_place;
4564  int masters_place = master_th->th.th_current_place;
4565  team->t.t_first_place = first_place;
4566  team->t.t_last_place = last_place;
4567 
4568  KA_TRACE(20, ("__kmp_partition_places: enter: proc_bind = %d T#%d(%d:0) "
4569  "bound to place %d partition = [%d,%d]\n",
4570  proc_bind, __kmp_gtid_from_thread(team->t.t_threads[0]),
4571  team->t.t_id, masters_place, first_place, last_place));
4572 
4573  switch (proc_bind) {
4574 
4575  case proc_bind_default:
4576  // serial teams might have the proc_bind policy set to proc_bind_default. It
4577  // doesn't matter, as we don't rebind the master thread for any proc_bind policy
4578  KMP_DEBUG_ASSERT(team->t.t_nproc == 1);
4579  break;
4580 
4581  case proc_bind_master: {
4582  int f;
4583  int n_th = team->t.t_nproc;
4584  for (f = 1; f < n_th; f++) {
4585  kmp_info_t *th = team->t.t_threads[f];
4586  KMP_DEBUG_ASSERT(th != NULL);
4587  th->th.th_first_place = first_place;
4588  th->th.th_last_place = last_place;
4589  th->th.th_new_place = masters_place;
4590  if (__kmp_display_affinity && masters_place != th->th.th_current_place &&
4591  team->t.t_display_affinity != 1) {
4592  team->t.t_display_affinity = 1;
4593  }
4594 
4595  KA_TRACE(100, ("__kmp_partition_places: master: T#%d(%d:%d) place %d "
4596  "partition = [%d,%d]\n",
4597  __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id,
4598  f, masters_place, first_place, last_place));
4599  }
4600  } break;
4601 
4602  case proc_bind_close: {
4603  int f;
4604  int n_th = team->t.t_nproc;
4605  int n_places;
4606  if (first_place <= last_place) {
4607  n_places = last_place - first_place + 1;
4608  } else {
4609  n_places = __kmp_affinity_num_masks - first_place + last_place + 1;
4610  }
4611  if (n_th <= n_places) {
4612  int place = masters_place;
4613  for (f = 1; f < n_th; f++) {
4614  kmp_info_t *th = team->t.t_threads[f];
4615  KMP_DEBUG_ASSERT(th != NULL);
4616 
4617  if (place == last_place) {
4618  place = first_place;
4619  } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4620  place = 0;
4621  } else {
4622  place++;
4623  }
4624  th->th.th_first_place = first_place;
4625  th->th.th_last_place = last_place;
4626  th->th.th_new_place = place;
4627  if (__kmp_display_affinity && place != th->th.th_current_place &&
4628  team->t.t_display_affinity != 1) {
4629  team->t.t_display_affinity = 1;
4630  }
4631 
4632  KA_TRACE(100, ("__kmp_partition_places: close: T#%d(%d:%d) place %d "
4633  "partition = [%d,%d]\n",
4634  __kmp_gtid_from_thread(team->t.t_threads[f]),
4635  team->t.t_id, f, place, first_place, last_place));
4636  }
4637  } else {
4638  int S, rem, gap, s_count;
4639  S = n_th / n_places;
4640  s_count = 0;
4641  rem = n_th - (S * n_places);
4642  gap = rem > 0 ? n_places / rem : n_places;
4643  int place = masters_place;
4644  int gap_ct = gap;
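      /* Worked example (illustration only): with n_th = 10 threads and
         n_places = 4 places, S = 2, rem = 2 and gap = 2, so every place gets
         S threads and every gap-th place (starting at masters_place) absorbs
         one of the rem extra threads, giving per-place counts of 3,2,3,2 and
         leaving `place` back at masters_place when the loop finishes. */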
4645  for (f = 0; f < n_th; f++) {
4646  kmp_info_t *th = team->t.t_threads[f];
4647  KMP_DEBUG_ASSERT(th != NULL);
4648 
4649  th->th.th_first_place = first_place;
4650  th->th.th_last_place = last_place;
4651  th->th.th_new_place = place;
4652  if (__kmp_display_affinity && place != th->th.th_current_place &&
4653  team->t.t_display_affinity != 1) {
4654  team->t.t_display_affinity = 1;
4655  }
4656  s_count++;
4657 
4658  if ((s_count == S) && rem && (gap_ct == gap)) {
4659  // do nothing, add an extra thread to place on next iteration
4660  } else if ((s_count == S + 1) && rem && (gap_ct == gap)) {
4661  // we added an extra thread to this place; move to next place
4662  if (place == last_place) {
4663  place = first_place;
4664  } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4665  place = 0;
4666  } else {
4667  place++;
4668  }
4669  s_count = 0;
4670  gap_ct = 1;
4671  rem--;
4672  } else if (s_count == S) { // place full; don't add extra
4673  if (place == last_place) {
4674  place = first_place;
4675  } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4676  place = 0;
4677  } else {
4678  place++;
4679  }
4680  gap_ct++;
4681  s_count = 0;
4682  }
4683 
4684  KA_TRACE(100,
4685  ("__kmp_partition_places: close: T#%d(%d:%d) place %d "
4686  "partition = [%d,%d]\n",
4687  __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id, f,
4688  th->th.th_new_place, first_place, last_place));
4689  }
4690  KMP_DEBUG_ASSERT(place == masters_place);
4691  }
4692  } break;
4693 
4694  case proc_bind_spread: {
4695  int f;
4696  int n_th = team->t.t_nproc;
4697  int n_places;
4698  int thidx;
4699  if (first_place <= last_place) {
4700  n_places = last_place - first_place + 1;
4701  } else {
4702  n_places = __kmp_affinity_num_masks - first_place + last_place + 1;
4703  }
4704  if (n_th <= n_places) {
4705  int place = -1;
4706 
4707  if (n_places != static_cast<int>(__kmp_affinity_num_masks)) {
4708  int S = n_places / n_th;
4709  int s_count, rem, gap, gap_ct;
4710 
4711  place = masters_place;
4712  rem = n_places - n_th * S;
4713  gap = rem ? n_th / rem : 1;
4714  gap_ct = gap;
4715  thidx = n_th;
4716  if (update_master_only == 1)
4717  thidx = 1;
4718  for (f = 0; f < thidx; f++) {
4719  kmp_info_t *th = team->t.t_threads[f];
4720  KMP_DEBUG_ASSERT(th != NULL);
4721 
4722  th->th.th_first_place = place;
4723  th->th.th_new_place = place;
4724  if (__kmp_display_affinity && place != th->th.th_current_place &&
4725  team->t.t_display_affinity != 1) {
4726  team->t.t_display_affinity = 1;
4727  }
4728  s_count = 1;
4729  while (s_count < S) {
4730  if (place == last_place) {
4731  place = first_place;
4732  } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4733  place = 0;
4734  } else {
4735  place++;
4736  }
4737  s_count++;
4738  }
4739  if (rem && (gap_ct == gap)) {
4740  if (place == last_place) {
4741  place = first_place;
4742  } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4743  place = 0;
4744  } else {
4745  place++;
4746  }
4747  rem--;
4748  gap_ct = 0;
4749  }
4750  th->th.th_last_place = place;
4751  gap_ct++;
4752 
4753  if (place == last_place) {
4754  place = first_place;
4755  } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4756  place = 0;
4757  } else {
4758  place++;
4759  }
4760 
4761  KA_TRACE(100,
4762  ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
4763  "partition = [%d,%d], __kmp_affinity_num_masks: %u\n",
4764  __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id,
4765  f, th->th.th_new_place, th->th.th_first_place,
4766  th->th.th_last_place, __kmp_affinity_num_masks));
4767  }
4768  } else {
4769  /* When the available computation places form a uniform space, we can
4770  create T partitions of roughly P/T places each and put each thread
4771  into the first place of its partition. */
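        /* Worked example (illustration only): with n_places = 8 (the full
           place list), n_th = 3 and masters_place = 0, spacing = 9.0/3 = 3.0,
           so the loop below assigns the partitions [0,2], [3,5] and [6,7]
           (the last one clamped to n_places - 1), one thread per partition. */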
4772  double current = static_cast<double>(masters_place);
4773  double spacing =
4774  (static_cast<double>(n_places + 1) / static_cast<double>(n_th));
4775  int first, last;
4776  kmp_info_t *th;
4777 
4778  thidx = n_th + 1;
4779  if (update_master_only == 1)
4780  thidx = 1;
4781  for (f = 0; f < thidx; f++) {
4782  first = static_cast<int>(current);
4783  last = static_cast<int>(current + spacing) - 1;
4784  KMP_DEBUG_ASSERT(last >= first);
4785  if (first >= n_places) {
4786  if (masters_place) {
4787  first -= n_places;
4788  last -= n_places;
4789  if (first == (masters_place + 1)) {
4790  KMP_DEBUG_ASSERT(f == n_th);
4791  first--;
4792  }
4793  if (last == masters_place) {
4794  KMP_DEBUG_ASSERT(f == (n_th - 1));
4795  last--;
4796  }
4797  } else {
4798  KMP_DEBUG_ASSERT(f == n_th);
4799  first = 0;
4800  last = 0;
4801  }
4802  }
4803  if (last >= n_places) {
4804  last = (n_places - 1);
4805  }
4806  place = first;
4807  current += spacing;
4808  if (f < n_th) {
4809  KMP_DEBUG_ASSERT(0 <= first);
4810  KMP_DEBUG_ASSERT(n_places > first);
4811  KMP_DEBUG_ASSERT(0 <= last);
4812  KMP_DEBUG_ASSERT(n_places > last);
4813  KMP_DEBUG_ASSERT(last_place >= first_place);
4814  th = team->t.t_threads[f];
4815  KMP_DEBUG_ASSERT(th);
4816  th->th.th_first_place = first;
4817  th->th.th_new_place = place;
4818  th->th.th_last_place = last;
4819  if (__kmp_display_affinity && place != th->th.th_current_place &&
4820  team->t.t_display_affinity != 1) {
4821  team->t.t_display_affinity = 1;
4822  }
4823  KA_TRACE(100,
4824  ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
4825  "partition = [%d,%d], spacing = %.4f\n",
4826  __kmp_gtid_from_thread(team->t.t_threads[f]),
4827  team->t.t_id, f, th->th.th_new_place,
4828  th->th.th_first_place, th->th.th_last_place, spacing));
4829  }
4830  }
4831  }
4832  KMP_DEBUG_ASSERT(update_master_only || place == masters_place);
4833  } else {
4834  int S, rem, gap, s_count;
4835  S = n_th / n_places;
4836  s_count = 0;
4837  rem = n_th - (S * n_places);
4838  gap = rem > 0 ? n_places / rem : n_places;
4839  int place = masters_place;
4840  int gap_ct = gap;
4841  thidx = n_th;
4842  if (update_master_only == 1)
4843  thidx = 1;
4844  for (f = 0; f < thidx; f++) {
4845  kmp_info_t *th = team->t.t_threads[f];
4846  KMP_DEBUG_ASSERT(th != NULL);
4847 
4848  th->th.th_first_place = place;
4849  th->th.th_last_place = place;
4850  th->th.th_new_place = place;
4851  if (__kmp_display_affinity && place != th->th.th_current_place &&
4852  team->t.t_display_affinity != 1) {
4853  team->t.t_display_affinity = 1;
4854  }
4855  s_count++;
4856 
4857  if ((s_count == S) && rem && (gap_ct == gap)) {
4858  // do nothing, add an extra thread to place on next iteration
4859  } else if ((s_count == S + 1) && rem && (gap_ct == gap)) {
4860  // we added an extra thread to this place; move on to next place
4861  if (place == last_place) {
4862  place = first_place;
4863  } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4864  place = 0;
4865  } else {
4866  place++;
4867  }
4868  s_count = 0;
4869  gap_ct = 1;
4870  rem--;
4871  } else if (s_count == S) { // place is full; don't add extra thread
4872  if (place == last_place) {
4873  place = first_place;
4874  } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4875  place = 0;
4876  } else {
4877  place++;
4878  }
4879  gap_ct++;
4880  s_count = 0;
4881  }
4882 
4883  KA_TRACE(100, ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
4884  "partition = [%d,%d]\n",
4885  __kmp_gtid_from_thread(team->t.t_threads[f]),
4886  team->t.t_id, f, th->th.th_new_place,
4887  th->th.th_first_place, th->th.th_last_place));
4888  }
4889  KMP_DEBUG_ASSERT(update_master_only || place == masters_place);
4890  }
4891  } break;
4892 
4893  default:
4894  break;
4895  }
4896 
4897  KA_TRACE(20, ("__kmp_partition_places: exit T#%d\n", team->t.t_id));
4898 }
4899 
4900 #endif // KMP_AFFINITY_SUPPORTED
4901 
4902 /* allocate a new team data structure to use. take one off of the free pool if
4903  available */
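/* In outline (illustration only; the real code below also handles nested hot
   teams, ICV propagation and OMPT ids), allocation tries three sources in
   order -- the hot team, the team pool, then a fresh allocation:

     static kmp_team_t *sketch_allocate_team(kmp_root_t *root, int new_nproc,
                                             int max_nproc) {
       if (!root->r.r_active && new_nproc > 1) // hot team free to reuse
         return root->r.r_hot_team;            // (resized in place if needed)
       for (kmp_team_t *t = CCAST(kmp_team_t *, __kmp_team_pool); t;
            t = t->t.t_next_pool)
         if (t->t.t_max_nproc >= max_nproc)    // big-enough pooled team
           return t;                           // (unlinked from the pool)
       return (kmp_team_t *)__kmp_allocate(sizeof(kmp_team_t));
     }
*/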
4904 kmp_team_t *
4905 __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc,
4906 #if OMPT_SUPPORT
4907  ompt_data_t ompt_parallel_data,
4908 #endif
4909  kmp_proc_bind_t new_proc_bind,
4910  kmp_internal_control_t *new_icvs,
4911  int argc USE_NESTED_HOT_ARG(kmp_info_t *master)) {
4912  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_allocate_team);
4913  int f;
4914  kmp_team_t *team;
4915  int use_hot_team = !root->r.r_active;
4916  int level = 0;
4917 
4918  KA_TRACE(20, ("__kmp_allocate_team: called\n"));
4919  KMP_DEBUG_ASSERT(new_nproc >= 1 && argc >= 0);
4920  KMP_DEBUG_ASSERT(max_nproc >= new_nproc);
4921  KMP_MB();
4922 
4923 #if KMP_NESTED_HOT_TEAMS
4924  kmp_hot_team_ptr_t *hot_teams;
4925  if (master) {
4926  team = master->th.th_team;
4927  level = team->t.t_active_level;
4928  if (master->th.th_teams_microtask) { // in teams construct?
4929  if (master->th.th_teams_size.nteams > 1 &&
4930  ( // #teams > 1
4931  team->t.t_pkfn ==
4932  (microtask_t)__kmp_teams_master || // inner fork of the teams
4933  master->th.th_teams_level <
4934  team->t.t_level)) { // or nested parallel inside the teams
4935  ++level; // do not increment if #teams==1 or for the outer fork of the
4936  // teams; increment otherwise
4937  }
4938  }
4939  hot_teams = master->th.th_hot_teams;
4940  if (level < __kmp_hot_teams_max_level && hot_teams &&
4941  hot_teams[level].hot_team) {
4942  // hot team has already been allocated for given level
4943  use_hot_team = 1;
4944  } else {
4945  use_hot_team = 0;
4946  }
4947  } else {
4948  // check we won't access uninitialized hot_teams, just in case
4949  KMP_DEBUG_ASSERT(new_nproc == 1);
4950  }
4951 #endif
4952  // Optimization to use a "hot" team
4953  if (use_hot_team && new_nproc > 1) {
4954  KMP_DEBUG_ASSERT(new_nproc <= max_nproc);
4955 #if KMP_NESTED_HOT_TEAMS
4956  team = hot_teams[level].hot_team;
4957 #else
4958  team = root->r.r_hot_team;
4959 #endif
4960 #if KMP_DEBUG
4961  if (__kmp_tasking_mode != tskm_immediate_exec) {
4962  KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p "
4963  "task_team[1] = %p before reinit\n",
4964  team->t.t_task_team[0], team->t.t_task_team[1]));
4965  }
4966 #endif
4967 
4968  // Has the number of threads changed?
4969  /* Let's assume the most common case is that the number of threads is
4970  unchanged, and put that case first. */
4971  if (team->t.t_nproc == new_nproc) { // Check changes in number of threads
4972  KA_TRACE(20, ("__kmp_allocate_team: reusing hot team\n"));
4973  // This case can mean that omp_set_num_threads() was called and the hot
4974  // team size was already reduced, so we check the special flag
4975  if (team->t.t_size_changed == -1) {
4976  team->t.t_size_changed = 1;
4977  } else {
4978  KMP_CHECK_UPDATE(team->t.t_size_changed, 0);
4979  }
4980 
4981  // TODO???: team->t.t_max_active_levels = new_max_active_levels;
4982  kmp_r_sched_t new_sched = new_icvs->sched;
4983  // set master's schedule as new run-time schedule
4984  KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched);
4985 
4986  __kmp_reinitialize_team(team, new_icvs,
4987  root->r.r_uber_thread->th.th_ident);
4988 
4989  KF_TRACE(10, ("__kmp_allocate_team2: T#%d, this_thread=%p team=%p\n", 0,
4990  team->t.t_threads[0], team));
4991  __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0);
4992 
4993 #if KMP_AFFINITY_SUPPORTED
4994  if ((team->t.t_size_changed == 0) &&
4995  (team->t.t_proc_bind == new_proc_bind)) {
4996  if (new_proc_bind == proc_bind_spread) {
4997  __kmp_partition_places(
4998  team, 1); // add flag to update only master for spread
4999  }
5000  KA_TRACE(200, ("__kmp_allocate_team: reusing hot team #%d bindings: "
5001  "proc_bind = %d, partition = [%d,%d]\n",
5002  team->t.t_id, new_proc_bind, team->t.t_first_place,
5003  team->t.t_last_place));
5004  } else {
5005  KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5006  __kmp_partition_places(team);
5007  }
5008 #else
5009  KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5010 #endif /* KMP_AFFINITY_SUPPORTED */
5011  } else if (team->t.t_nproc > new_nproc) {
5012  KA_TRACE(20,
5013  ("__kmp_allocate_team: decreasing hot team thread count to %d\n",
5014  new_nproc));
5015 
5016  team->t.t_size_changed = 1;
5017 #if KMP_NESTED_HOT_TEAMS
5018  if (__kmp_hot_teams_mode == 0) {
5019  // AC: the saved number of threads should correspond to the team's value in
5020  // this mode; it can be bigger in mode 1, when the hot team keeps threads in reserve
5021  KMP_DEBUG_ASSERT(hot_teams[level].hot_team_nth == team->t.t_nproc);
5022  hot_teams[level].hot_team_nth = new_nproc;
5023 #endif // KMP_NESTED_HOT_TEAMS
5024  /* release the extra threads we don't need any more */
5025  for (f = new_nproc; f < team->t.t_nproc; f++) {
5026  KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5027  if (__kmp_tasking_mode != tskm_immediate_exec) {
5028  // When decreasing team size, threads no longer in the team should
5029  // unref task team.
5030  team->t.t_threads[f]->th.th_task_team = NULL;
5031  }
5032  __kmp_free_thread(team->t.t_threads[f]);
5033  team->t.t_threads[f] = NULL;
5034  }
5035 #if KMP_NESTED_HOT_TEAMS
5036  } // (__kmp_hot_teams_mode == 0)
5037  else {
5038  // When keeping extra threads in team, switch threads to wait on own
5039  // b_go flag
5040  for (f = new_nproc; f < team->t.t_nproc; ++f) {
5041  KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5042  kmp_balign_t *balign = team->t.t_threads[f]->th.th_bar;
5043  for (int b = 0; b < bs_last_barrier; ++b) {
5044  if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG) {
5045  balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
5046  }
5047  KMP_CHECK_UPDATE(balign[b].bb.leaf_kids, 0);
5048  }
5049  }
5050  }
5051 #endif // KMP_NESTED_HOT_TEAMS
5052  team->t.t_nproc = new_nproc;
5053  // TODO???: team->t.t_max_active_levels = new_max_active_levels;
5054  KMP_CHECK_UPDATE(team->t.t_sched.sched, new_icvs->sched.sched);
5055  __kmp_reinitialize_team(team, new_icvs,
5056  root->r.r_uber_thread->th.th_ident);
5057 
5058  // Update remaining threads
5059  for (f = 0; f < new_nproc; ++f) {
5060  team->t.t_threads[f]->th.th_team_nproc = new_nproc;
5061  }
5062 
5063  // restore the current task state of the master thread: should be the
5064  // implicit task
5065  KF_TRACE(10, ("__kmp_allocate_team: T#%d, this_thread=%p team=%p\n", 0,
5066  team->t.t_threads[0], team));
5067 
5068  __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0);
5069 
5070 #ifdef KMP_DEBUG
5071  for (f = 0; f < team->t.t_nproc; f++) {
5072  KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
5073  team->t.t_threads[f]->th.th_team_nproc ==
5074  team->t.t_nproc);
5075  }
5076 #endif
5077 
5078  KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5079 #if KMP_AFFINITY_SUPPORTED
5080  __kmp_partition_places(team);
5081 #endif
5082  } else { // team->t.t_nproc < new_nproc
5083 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
5084  kmp_affin_mask_t *old_mask;
5085  if (KMP_AFFINITY_CAPABLE()) {
5086  KMP_CPU_ALLOC(old_mask);
5087  }
5088 #endif
5089 
5090  KA_TRACE(20,
5091  ("__kmp_allocate_team: increasing hot team thread count to %d\n",
5092  new_nproc));
5093 
5094  team->t.t_size_changed = 1;
5095 
5096 #if KMP_NESTED_HOT_TEAMS
5097  int avail_threads = hot_teams[level].hot_team_nth;
5098  if (new_nproc < avail_threads)
5099  avail_threads = new_nproc;
5100  kmp_info_t **other_threads = team->t.t_threads;
5101  for (f = team->t.t_nproc; f < avail_threads; ++f) {
5102  // Adjust barrier data of reserved threads (if any) of the team
5103  // Other data will be set in __kmp_initialize_info() below.
5104  int b;
5105  kmp_balign_t *balign = other_threads[f]->th.th_bar;
5106  for (b = 0; b < bs_last_barrier; ++b) {
5107  balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5108  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
5109 #if USE_DEBUGGER
5110  balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5111 #endif
5112  }
5113  }
5114  if (hot_teams[level].hot_team_nth >= new_nproc) {
5115  // we have all needed threads in reserve, no need to allocate any
5116  // this is only possible in mode 1; there cannot be reserved threads in mode 0
5117  KMP_DEBUG_ASSERT(__kmp_hot_teams_mode == 1);
5118  team->t.t_nproc = new_nproc; // just get reserved threads involved
5119  } else {
5120  // we may have some threads in reserve, but not enough
5121  team->t.t_nproc =
5122  hot_teams[level]
5123  .hot_team_nth; // get reserved threads involved if any
5124  hot_teams[level].hot_team_nth = new_nproc; // adjust hot team max size
5125 #endif // KMP_NESTED_HOT_TEAMS
5126  if (team->t.t_max_nproc < new_nproc) {
5127  /* reallocate larger arrays */
5128  __kmp_reallocate_team_arrays(team, new_nproc);
5129  __kmp_reinitialize_team(team, new_icvs, NULL);
5130  }
5131 
5132 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
5133  /* Temporarily set full mask for master thread before creation of
5134  workers. The reason is that workers inherit the affinity from the master,
5135  so if a lot of workers are created on a single core quickly, they
5136  don't get a chance to set their own affinity for a long time. */
5137  __kmp_set_thread_affinity_mask_full_tmp(old_mask);
5138 #endif
5139 
5140  /* allocate new threads for the hot team */
5141  for (f = team->t.t_nproc; f < new_nproc; f++) {
5142  kmp_info_t *new_worker = __kmp_allocate_thread(root, team, f);
5143  KMP_DEBUG_ASSERT(new_worker);
5144  team->t.t_threads[f] = new_worker;
5145 
5146  KA_TRACE(20,
5147  ("__kmp_allocate_team: team %d init T#%d arrived: "
5148  "join=%llu, plain=%llu\n",
5149  team->t.t_id, __kmp_gtid_from_tid(f, team), team->t.t_id, f,
5150  team->t.t_bar[bs_forkjoin_barrier].b_arrived,
5151  team->t.t_bar[bs_plain_barrier].b_arrived));
5152 
5153  { // Initialize barrier data for new threads.
5154  int b;
5155  kmp_balign_t *balign = new_worker->th.th_bar;
5156  for (b = 0; b < bs_last_barrier; ++b) {
5157  balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5158  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag !=
5159  KMP_BARRIER_PARENT_FLAG);
5160 #if USE_DEBUGGER
5161  balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5162 #endif
5163  }
5164  }
5165  }
5166 
5167 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
5168  if (KMP_AFFINITY_CAPABLE()) {
5169  /* Restore initial master thread's affinity mask */
5170  __kmp_set_system_affinity(old_mask, TRUE);
5171  KMP_CPU_FREE(old_mask);
5172  }
5173 #endif
5174 #if KMP_NESTED_HOT_TEAMS
5175  } // end of check of t_nproc vs. new_nproc vs. hot_team_nth
5176 #endif // KMP_NESTED_HOT_TEAMS
5177  /* make sure everyone is synchronized */
5178  int old_nproc = team->t.t_nproc; // save old value and use to update only
5179  // new threads below
5180  __kmp_initialize_team(team, new_nproc, new_icvs,
5181  root->r.r_uber_thread->th.th_ident);
5182 
5183  /* reinitialize the threads */
5184  KMP_DEBUG_ASSERT(team->t.t_nproc == new_nproc);
5185  for (f = 0; f < team->t.t_nproc; ++f)
5186  __kmp_initialize_info(team->t.t_threads[f], team, f,
5187  __kmp_gtid_from_tid(f, team));
5188 
5189  if (level) { // set th_task_state for new threads in nested hot team
5190  // __kmp_initialize_info() no longer zeroes th_task_state, so we should
5191  // only need to set the th_task_state for the new threads. th_task_state
5192  // for master thread will not be accurate until after this in
5193  // __kmp_fork_call(), so we look to the master's memo_stack to get the
5194  // correct value.
5195  for (f = old_nproc; f < team->t.t_nproc; ++f)
5196  team->t.t_threads[f]->th.th_task_state =
5197  team->t.t_threads[0]->th.th_task_state_memo_stack[level];
5198  } else { // set th_task_state for new threads in non-nested hot team
5199  int old_state =
5200  team->t.t_threads[0]->th.th_task_state; // copy master's state
5201  for (f = old_nproc; f < team->t.t_nproc; ++f)
5202  team->t.t_threads[f]->th.th_task_state = old_state;
5203  }
5204 
5205 #ifdef KMP_DEBUG
5206  for (f = 0; f < team->t.t_nproc; ++f) {
5207  KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
5208  team->t.t_threads[f]->th.th_team_nproc ==
5209  team->t.t_nproc);
5210  }
5211 #endif
5212 
5213  KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5214 #if KMP_AFFINITY_SUPPORTED
5215  __kmp_partition_places(team);
5216 #endif
5217  } // Check changes in number of threads
5218 
5219  kmp_info_t *master = team->t.t_threads[0];
5220  if (master->th.th_teams_microtask) {
5221  for (f = 1; f < new_nproc; ++f) {
5222  // propagate teams construct specific info to workers
5223  kmp_info_t *thr = team->t.t_threads[f];
5224  thr->th.th_teams_microtask = master->th.th_teams_microtask;
5225  thr->th.th_teams_level = master->th.th_teams_level;
5226  thr->th.th_teams_size = master->th.th_teams_size;
5227  }
5228  }
5229 #if KMP_NESTED_HOT_TEAMS
5230  if (level) {
5231  // Sync barrier state for nested hot teams, not needed for outermost hot
5232  // team.
5233  for (f = 1; f < new_nproc; ++f) {
5234  kmp_info_t *thr = team->t.t_threads[f];
5235  int b;
5236  kmp_balign_t *balign = thr->th.th_bar;
5237  for (b = 0; b < bs_last_barrier; ++b) {
5238  balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5239  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
5240 #if USE_DEBUGGER
5241  balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5242 #endif
5243  }
5244  }
5245  }
5246 #endif // KMP_NESTED_HOT_TEAMS
5247 
5248  /* reallocate space for arguments if necessary */
5249  __kmp_alloc_argv_entries(argc, team, TRUE);
5250  KMP_CHECK_UPDATE(team->t.t_argc, argc);
5251  // The hot team re-uses the previous task team,
5252  // if untouched during the previous release->gather phase.
5253 
5254  KF_TRACE(10, (" hot_team = %p\n", team));
5255 
5256 #if KMP_DEBUG
5257  if (__kmp_tasking_mode != tskm_immediate_exec) {
5258  KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p "
5259  "task_team[1] = %p after reinit\n",
5260  team->t.t_task_team[0], team->t.t_task_team[1]));
5261  }
5262 #endif
5263 
5264 #if OMPT_SUPPORT
5265  __ompt_team_assign_id(team, ompt_parallel_data);
5266 #endif
5267 
5268  KMP_MB();
5269 
5270  return team;
5271  }
5272 
5273  /* next, let's try to take one from the team pool */
5274  KMP_MB();
5275  for (team = CCAST(kmp_team_t *, __kmp_team_pool); (team);) {
5276  /* TODO: consider resizing undersized teams instead of reaping them, now
5277  that we have a resizing mechanism */
5278  if (team->t.t_max_nproc >= max_nproc) {
5279  /* take this team from the team pool */
5280  __kmp_team_pool = team->t.t_next_pool;
5281 
5282  /* setup the team for fresh use */
5283  __kmp_initialize_team(team, new_nproc, new_icvs, NULL);
5284 
5285  KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and "
5286  "task_team[1] %p to NULL\n",
5287  &team->t.t_task_team[0], &team->t.t_task_team[1]));
5288  team->t.t_task_team[0] = NULL;
5289  team->t.t_task_team[1] = NULL;
5290 
5291  /* reallocate space for arguments if necessary */
5292  __kmp_alloc_argv_entries(argc, team, TRUE);
5293  KMP_CHECK_UPDATE(team->t.t_argc, argc);
5294 
5295  KA_TRACE(
5296  20, ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
5297  team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
5298  { // Initialize barrier data.
5299  int b;
5300  for (b = 0; b < bs_last_barrier; ++b) {
5301  team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE;
5302 #if USE_DEBUGGER
5303  team->t.t_bar[b].b_master_arrived = 0;
5304  team->t.t_bar[b].b_team_arrived = 0;
5305 #endif
5306  }
5307  }
5308 
5309  team->t.t_proc_bind = new_proc_bind;
5310 
5311  KA_TRACE(20, ("__kmp_allocate_team: using team from pool %d.\n",
5312  team->t.t_id));
5313 
5314 #if OMPT_SUPPORT
5315  __ompt_team_assign_id(team, ompt_parallel_data);
5316 #endif
5317 
5318  KMP_MB();
5319 
5320  return team;
5321  }
5322 
5323  /* reap team if it is too small, then loop back and check the next one */
5324  // not sure if this is wise, but this will be redone during the hot-teams
5325  // rewrite.
5326  /* TODO: Use technique to find the right size hot-team, don't reap them */
5327  team = __kmp_reap_team(team);
5328  __kmp_team_pool = team;
5329  }
5330 
5331  /* nothing available in the pool, no matter, make a new team! */
5332  KMP_MB();
5333  team = (kmp_team_t *)__kmp_allocate(sizeof(kmp_team_t));
5334 
5335  /* and set it up */
5336  team->t.t_max_nproc = max_nproc;
5337  /* NOTE well, for some reason allocating one big buffer and dividing it up
5338  seems to really hurt performance a lot on the P4, so let's not use this */
5339  __kmp_allocate_team_arrays(team, max_nproc);
5340 
5341  KA_TRACE(20, ("__kmp_allocate_team: making a new team\n"));
5342  __kmp_initialize_team(team, new_nproc, new_icvs, NULL);
5343 
5344  KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and task_team[1] "
5345  "%p to NULL\n",
5346  &team->t.t_task_team[0], &team->t.t_task_team[1]));
5347  team->t.t_task_team[0] = NULL; // to be removed, as __kmp_allocate zeroes
5348  // memory, no need to duplicate
5349  team->t.t_task_team[1] = NULL; // to be removed, as __kmp_allocate zeroes
5350  // memory, no need to duplicate
5351 
5352  if (__kmp_storage_map) {
5353  __kmp_print_team_storage_map("team", team, team->t.t_id, new_nproc);
5354  }
5355 
5356  /* allocate space for arguments */
5357  __kmp_alloc_argv_entries(argc, team, FALSE);
5358  team->t.t_argc = argc;
5359 
5360  KA_TRACE(20,
5361  ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
5362  team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
5363  { // Initialize barrier data.
5364  int b;
5365  for (b = 0; b < bs_last_barrier; ++b) {
5366  team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE;
5367 #if USE_DEBUGGER
5368  team->t.t_bar[b].b_master_arrived = 0;
5369  team->t.t_bar[b].b_team_arrived = 0;
5370 #endif
5371  }
5372  }
5373 
5374  team->t.t_proc_bind = new_proc_bind;
5375 
5376 #if OMPT_SUPPORT
5377  __ompt_team_assign_id(team, ompt_parallel_data);
5378  team->t.ompt_serialized_team_info = NULL;
5379 #endif
5380 
5381  KMP_MB();
5382 
5383  KA_TRACE(20, ("__kmp_allocate_team: done creating a new team %d.\n",
5384  team->t.t_id));
5385 
5386  return team;
5387 }
5388 
5389 /* TODO implement hot-teams at all levels */
5390 /* TODO implement lazy thread release on demand (disband request) */
5391 
5392 /* free the team. return it to the team pool. release all the threads
5393  * associated with it */
5394 void __kmp_free_team(kmp_root_t *root,
5395  kmp_team_t *team USE_NESTED_HOT_ARG(kmp_info_t *master)) {
5396  int f;
5397  KA_TRACE(20, ("__kmp_free_team: T#%d freeing team %d\n", __kmp_get_gtid(),
5398  team->t.t_id));
5399 
5400  /* verify state */
5401  KMP_DEBUG_ASSERT(root);
5402  KMP_DEBUG_ASSERT(team);
5403  KMP_DEBUG_ASSERT(team->t.t_nproc <= team->t.t_max_nproc);
5404  KMP_DEBUG_ASSERT(team->t.t_threads);
5405 
5406  int use_hot_team = team == root->r.r_hot_team;
5407 #if KMP_NESTED_HOT_TEAMS
5408  int level;
5409  kmp_hot_team_ptr_t *hot_teams;
5410  if (master) {
5411  level = team->t.t_active_level - 1;
5412  if (master->th.th_teams_microtask) { // in teams construct?
5413  if (master->th.th_teams_size.nteams > 1) {
5414  ++level; // level was not increased in teams construct for
5415  // team_of_masters
5416  }
5417  if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
5418  master->th.th_teams_level == team->t.t_level) {
5419  ++level; // level was not increased in teams construct for
5420  // team_of_workers before the parallel
5421  } // team->t.t_level will be increased inside parallel
5422  }
5423  hot_teams = master->th.th_hot_teams;
5424  if (level < __kmp_hot_teams_max_level) {
5425  KMP_DEBUG_ASSERT(team == hot_teams[level].hot_team);
5426  use_hot_team = 1;
5427  }
5428  }
5429 #endif // KMP_NESTED_HOT_TEAMS
5430 
5431  /* team is done working */
5432  TCW_SYNC_PTR(team->t.t_pkfn,
5433  NULL); // Important for Debugging Support Library.
5434 #if KMP_OS_WINDOWS
5435  team->t.t_copyin_counter = 0; // init counter for possible reuse
5436 #endif
5437  // Do not reset pointer to parent team to NULL for hot teams.
5438 
5439  /* if we are non-hot team, release our threads */
5440  if (!use_hot_team) {
5441  if (__kmp_tasking_mode != tskm_immediate_exec) {
5442  // Wait for threads to reach reapable state
5443  for (f = 1; f < team->t.t_nproc; ++f) {
5444  KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5445  kmp_info_t *th = team->t.t_threads[f];
5446  volatile kmp_uint32 *state = &th->th.th_reap_state;
5447  while (*state != KMP_SAFE_TO_REAP) {
5448 #if KMP_OS_WINDOWS
5449  // On Windows a thread can be killed at any time, check this
5450  DWORD ecode;
5451  if (!__kmp_is_thread_alive(th, &ecode)) {
5452  *state = KMP_SAFE_TO_REAP; // reset the flag for dead thread
5453  break;
5454  }
5455 #endif
5456  // first check if thread is sleeping
5457  kmp_flag_64 fl(&th->th.th_bar[bs_forkjoin_barrier].bb.b_go, th);
5458  if (fl.is_sleeping())
5459  fl.resume(__kmp_gtid_from_thread(th));
5460  KMP_CPU_PAUSE();
5461  }
5462  }
5463 
5464  // Delete task teams
5465  int tt_idx;
5466  for (tt_idx = 0; tt_idx < 2; ++tt_idx) {
5467  kmp_task_team_t *task_team = team->t.t_task_team[tt_idx];
5468  if (task_team != NULL) {
5469  for (f = 0; f < team->t.t_nproc; ++f) { // threads unref task teams
5470  KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5471  team->t.t_threads[f]->th.th_task_team = NULL;
5472  }
5473  KA_TRACE(
5474  20,
5475  ("__kmp_free_team: T#%d deactivating task_team %p on team %d\n",
5476  __kmp_get_gtid(), task_team, team->t.t_id));
5477 #if KMP_NESTED_HOT_TEAMS
5478  __kmp_free_task_team(master, task_team);
5479 #endif
5480  team->t.t_task_team[tt_idx] = NULL;
5481  }
5482  }
5483  }
5484 
5485  // Reset pointer to parent team only for non-hot teams.
5486  team->t.t_parent = NULL;
5487  team->t.t_level = 0;
5488  team->t.t_active_level = 0;
5489 
5490  /* free the worker threads */
5491  for (f = 1; f < team->t.t_nproc; ++f) {
5492  KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5493  __kmp_free_thread(team->t.t_threads[f]);
5494  team->t.t_threads[f] = NULL;
5495  }
5496 
5497  /* put the team back in the team pool */
5498  /* TODO limit size of team pool, call reap_team if pool too large */
5499  team->t.t_next_pool = CCAST(kmp_team_t *, __kmp_team_pool);
5500  __kmp_team_pool = (volatile kmp_team_t *)team;
5501  } else { // Check if team was created for the masters in a teams construct
5502  // See if first worker is a CG root
5503  KMP_DEBUG_ASSERT(team->t.t_threads[1] &&
5504  team->t.t_threads[1]->th.th_cg_roots);
5505  if (team->t.t_threads[1]->th.th_cg_roots->cg_root == team->t.t_threads[1]) {
5506  // Clean up the CG root nodes on workers so that this team can be re-used
5507  for (f = 1; f < team->t.t_nproc; ++f) {
5508  kmp_info_t *thr = team->t.t_threads[f];
5509  KMP_DEBUG_ASSERT(thr && thr->th.th_cg_roots &&
5510  thr->th.th_cg_roots->cg_root == thr);
5511  // Pop current CG root off list
5512  kmp_cg_root_t *tmp = thr->th.th_cg_roots;
5513  thr->th.th_cg_roots = tmp->up;
5514  KA_TRACE(100, ("__kmp_free_team: Thread %p popping node %p and moving"
5515  " up to node %p. cg_nthreads was %d\n",
5516  thr, tmp, thr->th.th_cg_roots, tmp->cg_nthreads));
5517  int i = tmp->cg_nthreads--;
5518  if (i == 1) {
5519  __kmp_free(tmp); // free CG if we are the last thread in it
5520  }
5521  // Restore current task's thread_limit from CG root
5522  if (thr->th.th_cg_roots)
5523  thr->th.th_current_task->td_icvs.thread_limit =
5524  thr->th.th_cg_roots->cg_thread_limit;
5525  }
5526  }
5527  }
5528 
5529  KMP_MB();
5530 }
5531 
5532 /* reap the team. destroy it, reclaim all its resources and free its memory */
5533 kmp_team_t *__kmp_reap_team(kmp_team_t *team) {
5534  kmp_team_t *next_pool = team->t.t_next_pool;
5535 
5536  KMP_DEBUG_ASSERT(team);
5537  KMP_DEBUG_ASSERT(team->t.t_dispatch);
5538  KMP_DEBUG_ASSERT(team->t.t_disp_buffer);
5539  KMP_DEBUG_ASSERT(team->t.t_threads);
5540  KMP_DEBUG_ASSERT(team->t.t_argv);
5541 
5542  /* TODO clean the threads that are a part of this? */
5543 
5544  /* free stuff */
5545  __kmp_free_team_arrays(team);
5546  if (team->t.t_argv != &team->t.t_inline_argv[0])
5547  __kmp_free((void *)team->t.t_argv);
5548  __kmp_free(team);
5549 
5550  KMP_MB();
5551  return next_pool;
5552 }
5553 
5554 // Free the thread. Don't reap it, just place it on the pool of available
5555 // threads.
5556 //
5557 // Changes for Quad issue 527845: We need a predictable OMP tid <-> gtid
5558 // binding for the affinity mechanism to be useful.
5559 //
5560 // Now, we always keep the free list (__kmp_thread_pool) sorted by gtid.
5561 // However, we want to avoid a potential performance problem by always
5562 // scanning through the list to find the correct point at which to insert
5563 // the thread (potential N**2 behavior). To do this we keep track of the
5564 // last place a thread struct was inserted (__kmp_thread_pool_insert_pt).
5565 // With single-level parallelism, threads will always be added to the tail
5566 // of the list, kept track of by __kmp_thread_pool_insert_pt. With nested
5567 // parallelism, all bets are off and we may need to scan through the entire
5568 // free list.
5569 //
5570 // This change also has a potentially large performance benefit, for some
5571 // applications. Previously, as threads were freed from the hot team, they
5572 // would be placed back on the free list in inverse order. If the hot team
5573 // grew back to its original size, then the freed threads would be placed
5574 // back on the hot team in reverse order. This could cause bad cache
5575 // locality problems on programs where the size of the hot team regularly
5576 // grew and shrank.
5577 //
5578 // Now, for single-level parallelism, the OMP tid is always == gtid.
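// A minimal sketch (illustration only) of the sorted insert implemented below;
// it reuses the real globals but compresses the bookkeeping. `scan` walks
// pointer-to-pointer links so the new element can be spliced in without
// special-casing the list head.
//
//   static void sketch_insert_sorted(kmp_info_t *th) {
//     int gtid = th->th.th_info.ds.ds_gtid;
//     kmp_info_t **scan;
//     if (__kmp_thread_pool_insert_pt != NULL &&
//         __kmp_thread_pool_insert_pt->th.th_info.ds.ds_gtid < gtid)
//       scan = &__kmp_thread_pool_insert_pt->th.th_next_pool; // resume here
//     else
//       scan = CCAST(kmp_info_t **, &__kmp_thread_pool);      // rescan list
//     while (*scan != NULL && (*scan)->th.th_info.ds.ds_gtid < gtid)
//       scan = &(*scan)->th.th_next_pool;
//     th->th.th_next_pool = *scan; // splice in, keeping ascending gtid order
//     __kmp_thread_pool_insert_pt = *scan = th;
//   }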
5579 void __kmp_free_thread(kmp_info_t *this_th) {
5580  int gtid;
5581  kmp_info_t **scan;
5582 
5583  KA_TRACE(20, ("__kmp_free_thread: T#%d putting T#%d back on free pool.\n",
5584  __kmp_get_gtid(), this_th->th.th_info.ds.ds_gtid));
5585 
5586  KMP_DEBUG_ASSERT(this_th);
5587 
5588  // When moving a thread to the pool, switch it to wait on its own b_go flag
5589  // and on an uninitialized (NULL) team.
5590  int b;
5591  kmp_balign_t *balign = this_th->th.th_bar;
5592  for (b = 0; b < bs_last_barrier; ++b) {
5593  if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG)
5594  balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
5595  balign[b].bb.team = NULL;
5596  balign[b].bb.leaf_kids = 0;
5597  }
5598  this_th->th.th_task_state = 0;
5599  this_th->th.th_reap_state = KMP_SAFE_TO_REAP;
5600 
5601  /* put thread back on the free pool */
5602  TCW_PTR(this_th->th.th_team, NULL);
5603  TCW_PTR(this_th->th.th_root, NULL);
5604  TCW_PTR(this_th->th.th_dispatch, NULL); /* NOT NEEDED */
5605 
5606  while (this_th->th.th_cg_roots) {
5607  this_th->th.th_cg_roots->cg_nthreads--;
5608  KA_TRACE(100, ("__kmp_free_thread: Thread %p decrement cg_nthreads on node"
5609  " %p of thread %p to %d\n",
5610  this_th, this_th->th.th_cg_roots,
5611  this_th->th.th_cg_roots->cg_root,
5612  this_th->th.th_cg_roots->cg_nthreads));
5613  kmp_cg_root_t *tmp = this_th->th.th_cg_roots;
5614  if (tmp->cg_root == this_th) { // Thread is a cg_root
5615  KMP_DEBUG_ASSERT(tmp->cg_nthreads == 0);
5616  KA_TRACE(
5617  5, ("__kmp_free_thread: Thread %p freeing node %p\n", this_th, tmp));
5618  this_th->th.th_cg_roots = tmp->up;
5619  __kmp_free(tmp);
5620  } else { // Worker thread
5621  if (tmp->cg_nthreads == 0) { // last thread leaves contention group
5622  __kmp_free(tmp);
5623  }
5624  this_th->th.th_cg_roots = NULL;
5625  break;
5626  }
5627  }
5628 
5629  /* If the implicit task assigned to this thread can be used by other threads
5630  * -> multiple threads can share the data and try to free the task at
5631  * __kmp_reap_thread at exit. This duplicate use of the task data can happen
5632  * with higher probability when the hot team is disabled but can occur even when
5633  * the hot team is enabled */
5634  __kmp_free_implicit_task(this_th);
5635  this_th->th.th_current_task = NULL;
5636 
5637  // If the __kmp_thread_pool_insert_pt is already past the new insert
5638  // point, then we need to re-scan the entire list.
5639  gtid = this_th->th.th_info.ds.ds_gtid;
5640  if (__kmp_thread_pool_insert_pt != NULL) {
5641  KMP_DEBUG_ASSERT(__kmp_thread_pool != NULL);
5642  if (__kmp_thread_pool_insert_pt->th.th_info.ds.ds_gtid > gtid) {
5643  __kmp_thread_pool_insert_pt = NULL;
5644  }
5645  }
5646 
5647  // Scan down the list to find the place to insert the thread.
5648  // scan is the address of a link in the list, possibly the address of
5649  // __kmp_thread_pool itself.
5650  //
5651  // In the absence of nested parallelism, the for loop will have 0 iterations.
5652  if (__kmp_thread_pool_insert_pt != NULL) {
5653  scan = &(__kmp_thread_pool_insert_pt->th.th_next_pool);
5654  } else {
5655  scan = CCAST(kmp_info_t **, &__kmp_thread_pool);
5656  }
5657  for (; (*scan != NULL) && ((*scan)->th.th_info.ds.ds_gtid < gtid);
5658  scan = &((*scan)->th.th_next_pool))
5659  ;
5660 
5661  // Insert the new element on the list, and set __kmp_thread_pool_insert_pt
5662  // to its address.
5663  TCW_PTR(this_th->th.th_next_pool, *scan);
5664  __kmp_thread_pool_insert_pt = *scan = this_th;
5665  KMP_DEBUG_ASSERT((this_th->th.th_next_pool == NULL) ||
5666  (this_th->th.th_info.ds.ds_gtid <
5667  this_th->th.th_next_pool->th.th_info.ds.ds_gtid));
5668  TCW_4(this_th->th.th_in_pool, TRUE);
5669  __kmp_suspend_initialize_thread(this_th);
5670  __kmp_lock_suspend_mx(this_th);
5671  if (this_th->th.th_active == TRUE) {
5672  KMP_ATOMIC_INC(&__kmp_thread_pool_active_nth);
5673  this_th->th.th_active_in_pool = TRUE;
5674  }
5675 #if KMP_DEBUG
5676  else {
5677  KMP_DEBUG_ASSERT(this_th->th.th_active_in_pool == FALSE);
5678  }
5679 #endif
5680  __kmp_unlock_suspend_mx(this_th);
5681 
5682  TCW_4(__kmp_nth, __kmp_nth - 1);
5683 
5684 #ifdef KMP_ADJUST_BLOCKTIME
5685  /* Adjust blocktime back to user setting or default if necessary */
5686  /* Middle initialization might never have occurred */
5687  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
5688  KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
5689  if (__kmp_nth <= __kmp_avail_proc) {
5690  __kmp_zero_bt = FALSE;
5691  }
5692  }
5693 #endif /* KMP_ADJUST_BLOCKTIME */
5694 
5695  KMP_MB();
5696 }
5697 
5698 /* ------------------------------------------------------------------------ */
5699 
5700 void *__kmp_launch_thread(kmp_info_t *this_thr) {
5701  int gtid = this_thr->th.th_info.ds.ds_gtid;
5702  /* void *stack_data;*/
5703  kmp_team_t **volatile pteam;
5704 
5705  KMP_MB();
5706  KA_TRACE(10, ("__kmp_launch_thread: T#%d start\n", gtid));
5707 
5708  if (__kmp_env_consistency_check) {
5709  this_thr->th.th_cons = __kmp_allocate_cons_stack(gtid); // ATT: Memory leak?
5710  }
5711 
5712 #if OMPT_SUPPORT
5713  ompt_data_t *thread_data;
5714  if (ompt_enabled.enabled) {
5715  thread_data = &(this_thr->th.ompt_thread_info.thread_data);
5716  *thread_data = ompt_data_none;
5717 
5718  this_thr->th.ompt_thread_info.state = ompt_state_overhead;
5719  this_thr->th.ompt_thread_info.wait_id = 0;
5720  this_thr->th.ompt_thread_info.idle_frame = OMPT_GET_FRAME_ADDRESS(0);
5721  this_thr->th.ompt_thread_info.parallel_flags = 0;
5722  if (ompt_enabled.ompt_callback_thread_begin) {
5723  ompt_callbacks.ompt_callback(ompt_callback_thread_begin)(
5724  ompt_thread_worker, thread_data);
5725  }
5726  this_thr->th.ompt_thread_info.state = ompt_state_idle;
5727  }
5728 #endif
5729 
5730  /* This is the place where threads wait for work */
5731  while (!TCR_4(__kmp_global.g.g_done)) {
5732  KMP_DEBUG_ASSERT(this_thr == __kmp_threads[gtid]);
5733  KMP_MB();
5734 
5735  /* wait for work to do */
5736  KA_TRACE(20, ("__kmp_launch_thread: T#%d waiting for work\n", gtid));
5737 
5738  /* No tid yet since not part of a team */
5739  __kmp_fork_barrier(gtid, KMP_GTID_DNE);
5740 
5741 #if OMPT_SUPPORT
5742  if (ompt_enabled.enabled) {
5743  this_thr->th.ompt_thread_info.state = ompt_state_overhead;
5744  }
5745 #endif
5746 
5747  pteam = &this_thr->th.th_team;
5748 
5749  /* have we been allocated? */
5750  if (TCR_SYNC_PTR(*pteam) && !TCR_4(__kmp_global.g.g_done)) {
5751  /* we were just woken up, so run our new task */
5752  if (TCR_SYNC_PTR((*pteam)->t.t_pkfn) != NULL) {
5753  int rc;
5754  KA_TRACE(20,
5755  ("__kmp_launch_thread: T#%d(%d:%d) invoke microtask = %p\n",
5756  gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid),
5757  (*pteam)->t.t_pkfn));
5758 
5759  updateHWFPControl(*pteam);
5760 
5761 #if OMPT_SUPPORT
5762  if (ompt_enabled.enabled) {
5763  this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
5764  }
5765 #endif
5766 
5767  rc = (*pteam)->t.t_invoke(gtid);
5768  KMP_ASSERT(rc);
5769 
5770  KMP_MB();
5771  KA_TRACE(20, ("__kmp_launch_thread: T#%d(%d:%d) done microtask = %p\n",
5772  gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid),
5773  (*pteam)->t.t_pkfn));
5774  }
5775 #if OMPT_SUPPORT
5776  if (ompt_enabled.enabled) {
5777  /* no frame set while outside task */
5778  __ompt_get_task_info_object(0)->frame.exit_frame = ompt_data_none;
5779 
5780  this_thr->th.ompt_thread_info.state = ompt_state_overhead;
5781  }
5782 #endif
5783  /* join barrier after parallel region */
5784  __kmp_join_barrier(gtid);
5785  }
5786  }
5787  TCR_SYNC_PTR((intptr_t)__kmp_global.g.g_done);
5788 
5789 #if OMPT_SUPPORT
5790  if (ompt_enabled.ompt_callback_thread_end) {
5791  ompt_callbacks.ompt_callback(ompt_callback_thread_end)(thread_data);
5792  }
5793 #endif
5794 
5795  this_thr->th.th_task_team = NULL;
5796  /* run the destructors for the threadprivate data for this thread */
5797  __kmp_common_destroy_gtid(gtid);
5798 
5799  KA_TRACE(10, ("__kmp_launch_thread: T#%d done\n", gtid));
5800  KMP_MB();
5801  return this_thr;
5802 }
5803 
5804 /* ------------------------------------------------------------------------ */
5805 
5806 void __kmp_internal_end_dest(void *specific_gtid) {
5807 #if KMP_COMPILER_ICC
5808 #pragma warning(push)
5809 #pragma warning(disable : 810) // conversion from "void *" to "int" may lose
5810 // significant bits
5811 #endif
5812  // Make sure no significant bits are lost
5813  int gtid = (kmp_intptr_t)specific_gtid - 1;
5814 #if KMP_COMPILER_ICC
5815 #pragma warning(pop)
5816 #endif
5817 
5818  KA_TRACE(30, ("__kmp_internal_end_dest: T#%d\n", gtid));
5819  /* NOTE: the gtid is stored as gtid+1 in the thread-local storage
5820  * this is because 0 is reserved for the nothing-stored case */
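 /* Encoding sketch (illustration only): keeping gtid+1 lets a raw TLS value of
    0 mean "nothing stored", which is what an untouched slot reads as.

      void *stored = specific_gtid;          // raw value kept in TLS
      int gtid = (kmp_intptr_t)stored - 1;   // decode, as done just below
      // gtid == -1 means no gtid was ever recorded for this thread
 */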
5821 
5822  /* josh: One reason for setting the gtid specific data even when it is being
5823  destroyed by pthread is to allow gtid lookup through thread specific data
5824  (__kmp_gtid_get_specific). Some of the code, especially stat code,
5825  that gets executed in the call to __kmp_internal_end_thread, actually
5826  gets the gtid through the thread specific data. Setting it here seems
5827  rather inelegant and perhaps wrong, but allows __kmp_internal_end_thread
5828  to run smoothly.
5829  todo: get rid of this after we remove the dependence on
5830  __kmp_gtid_get_specific */
5831  if (gtid >= 0 && KMP_UBER_GTID(gtid))
5832  __kmp_gtid_set_specific(gtid);
5833 #ifdef KMP_TDATA_GTID
5834  __kmp_gtid = gtid;
5835 #endif
5836  __kmp_internal_end_thread(gtid);
5837 }
5838 
5839 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB
5840 
5841 __attribute__((destructor)) void __kmp_internal_end_dtor(void) {
5842  __kmp_internal_end_atexit();
5843 }
5844 
5845 #endif
5846 
5847 /* [Windows] josh: when the atexit handler is called, there may still be more
5848  than one thread alive */
5849 void __kmp_internal_end_atexit(void) {
5850  KA_TRACE(30, ("__kmp_internal_end_atexit\n"));
5851  /* [Windows]
5852  josh: ideally, we want to completely shut down the library in this atexit
5853  handler, but stat code that depends on thread specific data for gtid fails
5854  because that data becomes unavailable at some point during the shutdown, so
5855  we call __kmp_internal_end_thread instead. We should eventually remove the
5856  dependency on __kmp_get_specific_gtid in the stat code and use
5857  __kmp_internal_end_library to cleanly shut down the library.
5858 
5859  // TODO: Can some of this comment about GVS be removed?
5860  I suspect that the offending stat code is executed when the calling thread
5861  tries to clean up a dead root thread's data structures, resulting in GVS
5862  code trying to close the GVS structures for that thread, but since the stat
5863  code uses __kmp_get_specific_gtid to get the gtid with the assumption that
5864  the calling thread is cleaning up itself instead of another thread, it gets
5865  confused. This happens because allowing a thread to unregister and clean up
5866  another thread is a recent modification for addressing an issue.
5867  Based on the current design (20050722), a thread may end up
5868  trying to unregister another thread only if thread death does not trigger
5869  the calling of __kmp_internal_end_thread. For Linux* OS, there is the
5870  thread specific data destructor function to detect thread death. For
5871  Windows dynamic, there is DllMain(THREAD_DETACH). For Windows static, there
5872  is nothing. Thus, the workaround is applicable only for Windows static
5873  stat library. */
5874  __kmp_internal_end_library(-1);
5875 #if KMP_OS_WINDOWS
5876  __kmp_close_console();
5877 #endif
5878 }
5879 
5880 static void __kmp_reap_thread(kmp_info_t *thread, int is_root) {
5881  // It is assumed __kmp_forkjoin_lock is acquired.
5882 
5883  int gtid;
5884 
5885  KMP_DEBUG_ASSERT(thread != NULL);
5886 
5887  gtid = thread->th.th_info.ds.ds_gtid;
5888 
5889  if (!is_root) {
5890  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
5891  /* Assume the threads are at the fork barrier here */
5892  KA_TRACE(
5893  20, ("__kmp_reap_thread: releasing T#%d from fork barrier for reap\n",
5894  gtid));
5895  /* Need release fence here to prevent seg faults for tree forkjoin barrier
5896  * (GEH) */
5897  ANNOTATE_HAPPENS_BEFORE(thread);
5898  kmp_flag_64 flag(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go, thread);
5899  __kmp_release_64(&flag);
5900  }
5901 
5902  // Terminate OS thread.
5903  __kmp_reap_worker(thread);
5904 
5905  // The thread was killed asynchronously. If it was actively
5906  // spinning in the thread pool, decrement the global count.
5907  //
5908  // There is a small timing hole here - if the worker thread was just waking
5909  // up after sleeping in the pool, had reset its th_active_in_pool flag but
5910  // not decremented the global counter __kmp_thread_pool_active_nth yet, then
5911  // the global counter might not get updated.
5912  //
5913  // Currently, this can only happen as the library is unloaded,
5914  // so there are no harmful side effects.
5915  if (thread->th.th_active_in_pool) {
5916  thread->th.th_active_in_pool = FALSE;
5917  KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth);
5918  KMP_DEBUG_ASSERT(__kmp_thread_pool_active_nth >= 0);
5919  }
5920  }
5921 
5922  __kmp_free_implicit_task(thread);
5923 
5924 // Free the fast memory for tasking
5925 #if USE_FAST_MEMORY
5926  __kmp_free_fast_memory(thread);
5927 #endif /* USE_FAST_MEMORY */
5928 
5929  __kmp_suspend_uninitialize_thread(thread);
5930 
5931  KMP_DEBUG_ASSERT(__kmp_threads[gtid] == thread);
5932  TCW_SYNC_PTR(__kmp_threads[gtid], NULL);
5933 
5934  --__kmp_all_nth;
5935 // __kmp_nth was decremented when thread is added to the pool.
5936 
5937 #ifdef KMP_ADJUST_BLOCKTIME
5938  /* Adjust blocktime back to user setting or default if necessary */
5939  /* Middle initialization might never have occurred */
5940  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
5941  KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
5942  if (__kmp_nth <= __kmp_avail_proc) {
5943  __kmp_zero_bt = FALSE;
5944  }
5945  }
5946 #endif /* KMP_ADJUST_BLOCKTIME */
5947 
5948  /* free the memory being used */
5949  if (__kmp_env_consistency_check) {
5950  if (thread->th.th_cons) {
5951  __kmp_free_cons_stack(thread->th.th_cons);
5952  thread->th.th_cons = NULL;
5953  }
5954  }
5955 
5956  if (thread->th.th_pri_common != NULL) {
5957  __kmp_free(thread->th.th_pri_common);
5958  thread->th.th_pri_common = NULL;
5959  }
5960 
5961  if (thread->th.th_task_state_memo_stack != NULL) {
5962  __kmp_free(thread->th.th_task_state_memo_stack);
5963  thread->th.th_task_state_memo_stack = NULL;
5964  }
5965 
5966 #if KMP_USE_BGET
5967  if (thread->th.th_local.bget_data != NULL) {
5968  __kmp_finalize_bget(thread);
5969  }
5970 #endif
5971 
5972 #if KMP_AFFINITY_SUPPORTED
5973  if (thread->th.th_affin_mask != NULL) {
5974  KMP_CPU_FREE(thread->th.th_affin_mask);
5975  thread->th.th_affin_mask = NULL;
5976  }
5977 #endif /* KMP_AFFINITY_SUPPORTED */
5978 
5979 #if KMP_USE_HIER_SCHED
5980  if (thread->th.th_hier_bar_data != NULL) {
5981  __kmp_free(thread->th.th_hier_bar_data);
5982  thread->th.th_hier_bar_data = NULL;
5983  }
5984 #endif
5985 
5986  __kmp_reap_team(thread->th.th_serial_team);
5987  thread->th.th_serial_team = NULL;
5988  __kmp_free(thread);
5989 
5990  KMP_MB();
5991 
5992 } // __kmp_reap_thread
5993 
5994 static void __kmp_internal_end(void) {
5995  int i;
5996 
5997  /* First, unregister the library */
5998  __kmp_unregister_library();
5999 
6000 #if KMP_OS_WINDOWS
6001  /* In Win static library, we can't tell when a root actually dies, so we
6002  reclaim the data structures for any root threads that have died but not
6003  unregistered themselves, in order to shut down cleanly.
6004  In Win dynamic library we also can't tell when a thread dies. */
6005  __kmp_reclaim_dead_roots(); // AC: moved here to always clean resources of
6006 // dead roots
6007 #endif
6008 
6009  for (i = 0; i < __kmp_threads_capacity; i++)
6010  if (__kmp_root[i])
6011  if (__kmp_root[i]->r.r_active)
6012  break;
6013  KMP_MB(); /* Flush all pending memory write invalidates. */
6014  TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6015 
6016  if (i < __kmp_threads_capacity) {
6017 #if KMP_USE_MONITOR
6018  // 2009-09-08 (lev): Other alive roots found. Why do we kill the monitor??
6019  KMP_MB(); /* Flush all pending memory write invalidates. */
6020 
6021  // Need to check that monitor was initialized before reaping it. If we are
6022  // called from __kmp_atfork_child (which sets __kmp_init_parallel = 0), then
6023  // __kmp_monitor will appear to contain valid data, but it is only valid in
6024  // the parent process, not the child.
6025  // New behavior (201008): instead of keying off of the flag
6026  // __kmp_init_parallel, the monitor thread creation is keyed off
6027  // of the new flag __kmp_init_monitor.
6028  __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
6029  if (TCR_4(__kmp_init_monitor)) {
6030  __kmp_reap_monitor(&__kmp_monitor);
6031  TCW_4(__kmp_init_monitor, 0);
6032  }
6033  __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
6034  KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n"));
6035 #endif // KMP_USE_MONITOR
6036  } else {
6037 /* TODO move this to cleanup code */
6038 #ifdef KMP_DEBUG
6039  /* make sure that everything has properly ended */
6040  for (i = 0; i < __kmp_threads_capacity; i++) {
6041  if (__kmp_root[i]) {
6042  // KMP_ASSERT( ! KMP_UBER_GTID( i ) ); // AC:
6043  // there can be uber threads alive here
6044  KMP_ASSERT(!__kmp_root[i]->r.r_active); // TODO: can they be active?
6045  }
6046  }
6047 #endif
6048 
6049  KMP_MB();
6050 
6051  // Reap the worker threads.
6052  // This is valid for now, but be careful if threads are reaped sooner.
6053  while (__kmp_thread_pool != NULL) { // Loop through all the threads in the pool.
6054  // Get the next thread from the pool.
6055  kmp_info_t *thread = CCAST(kmp_info_t *, __kmp_thread_pool);
6056  __kmp_thread_pool = thread->th.th_next_pool;
6057  // Reap it.
6058  KMP_DEBUG_ASSERT(thread->th.th_reap_state == KMP_SAFE_TO_REAP);
6059  thread->th.th_next_pool = NULL;
6060  thread->th.th_in_pool = FALSE;
6061  __kmp_reap_thread(thread, 0);
6062  }
6063  __kmp_thread_pool_insert_pt = NULL;
6064 
6065  // Reap teams.
6066  while (__kmp_team_pool != NULL) { // Loop thru all the teams in the pool.
6067  // Get the next team from the pool.
6068  kmp_team_t *team = CCAST(kmp_team_t *, __kmp_team_pool);
6069  __kmp_team_pool = team->t.t_next_pool;
6070  // Reap it.
6071  team->t.t_next_pool = NULL;
6072  __kmp_reap_team(team);
6073  }
6074 
6075  __kmp_reap_task_teams();
6076 
6077 #if KMP_OS_UNIX
6078  // Threads that are not reaped should not access any resources since they
6079  // are going to be deallocated soon, so the shutdown sequence should wait
6080  // until all threads either exit the final spin-waiting loop or begin
6081  // sleeping after the given blocktime.
6082  for (i = 0; i < __kmp_threads_capacity; i++) {
6083  kmp_info_t *thr = __kmp_threads[i];
6084  while (thr && KMP_ATOMIC_LD_ACQ(&thr->th.th_blocking))
6085  KMP_CPU_PAUSE();
6086  }
6087 #endif
6088 
6089  for (i = 0; i < __kmp_threads_capacity; ++i) {
6090  // TBD: Add some checking...
6091  // Something like KMP_DEBUG_ASSERT( __kmp_thread[ i ] == NULL );
6092  }
6093 
6094  /* Make sure all threadprivate destructors get run by joining with all
6095  worker threads before resetting this flag */
6096  TCW_SYNC_4(__kmp_init_common, FALSE);
6097 
6098  KA_TRACE(10, ("__kmp_internal_end: all workers reaped\n"));
6099  KMP_MB();
6100 
6101 #if KMP_USE_MONITOR
6102  // See note above: One of the possible fixes for CQ138434 / CQ140126
6103  //
6104  // FIXME: push both code fragments down and CSE them?
6105  // push them into __kmp_cleanup() ?
6106  __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
6107  if (TCR_4(__kmp_init_monitor)) {
6108  __kmp_reap_monitor(&__kmp_monitor);
6109  TCW_4(__kmp_init_monitor, 0);
6110  }
6111  __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
6112  KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n"));
6113 #endif
6114  } /* else !__kmp_global.t_active */
6115  TCW_4(__kmp_init_gtid, FALSE);
6116  KMP_MB(); /* Flush all pending memory write invalidates. */
6117 
6118  __kmp_cleanup();
6119 #if OMPT_SUPPORT
6120  ompt_fini();
6121 #endif
6122 }
6123 
6124 void __kmp_internal_end_library(int gtid_req) {
6125  /* if we have already cleaned up, don't try again, it wouldn't be pretty */
6126  /* this shouldn't be a race condition because __kmp_internal_end() is the
6127  only place to clear __kmp_serial_init */
6128  /* we'll check this later too, after we get the lock */
6129  // 2009-09-06: We do not set g_abort without setting g_done. This check looks
6130  // redundant, because the next check will work in any case.
6131  if (__kmp_global.g.g_abort) {
6132  KA_TRACE(11, ("__kmp_internal_end_library: abort, exiting\n"));
6133  /* TODO abort? */
6134  return;
6135  }
6136  if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6137  KA_TRACE(10, ("__kmp_internal_end_library: already finished\n"));
6138  return;
6139  }
6140 
6141  KMP_MB(); /* Flush all pending memory write invalidates. */
6142 
6143  /* find out who we are and what we should do */
6144  {
6145  int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific();
6146  KA_TRACE(
6147  10, ("__kmp_internal_end_library: enter T#%d (%d)\n", gtid, gtid_req));
6148  if (gtid == KMP_GTID_SHUTDOWN) {
6149  KA_TRACE(10, ("__kmp_internal_end_library: !__kmp_init_runtime, system "
6150  "already shutdown\n"));
6151  return;
6152  } else if (gtid == KMP_GTID_MONITOR) {
6153  KA_TRACE(10, ("__kmp_internal_end_library: monitor thread, gtid not "
6154  "registered, or system shutdown\n"));
6155  return;
6156  } else if (gtid == KMP_GTID_DNE) {
6157  KA_TRACE(10, ("__kmp_internal_end_library: gtid not registered or system "
6158  "shutdown\n"));
6159  /* we don't know who we are, but we may still shut down the library */
6160  } else if (KMP_UBER_GTID(gtid)) {
6161  /* unregister ourselves as an uber thread. gtid is no longer valid */
6162  if (__kmp_root[gtid]->r.r_active) {
6163  __kmp_global.g.g_abort = -1;
6164  TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6165  KA_TRACE(10,
6166  ("__kmp_internal_end_library: root still active, abort T#%d\n",
6167  gtid));
6168  return;
6169  } else {
6170  KA_TRACE(
6171  10,
6172  ("__kmp_internal_end_library: unregistering sibling T#%d\n", gtid));
6173  __kmp_unregister_root_current_thread(gtid);
6174  }
6175  } else {
6176 /* worker threads may call this function through the atexit handler, if they
6177  * call exit() */
6178 /* For now, skip the usual subsequent processing and just dump the debug buffer.
6179  TODO: do a thorough shutdown instead */
6180 #ifdef DUMP_DEBUG_ON_EXIT
6181  if (__kmp_debug_buf)
6182  __kmp_dump_debug_buffer();
6183 #endif
6184  return;
6185  }
6186  }
6187  /* synchronize the termination process */
6188  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6189 
6190  /* have we already finished */
6191  if (__kmp_global.g.g_abort) {
6192  KA_TRACE(10, ("__kmp_internal_end_library: abort, exiting\n"));
6193  /* TODO abort? */
6194  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6195  return;
6196  }
6197  if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6198  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6199  return;
6200  }
6201 
6202  /* We need this lock to enforce mutual exclusion between this reading of
6203  __kmp_threads_capacity and the writing by __kmp_register_root.
6204  Alternatively, we can use a counter of roots that is atomically updated by
6205  __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and
6206  __kmp_internal_end_*. */
6207  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
6208 
6209  /* now we can safely conduct the actual termination */
6210  __kmp_internal_end();
6211 
6212  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6213  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6214 
6215  KA_TRACE(10, ("__kmp_internal_end_library: exit\n"));
6216 
6217 #ifdef DUMP_DEBUG_ON_EXIT
6218  if (__kmp_debug_buf)
6219  __kmp_dump_debug_buffer();
6220 #endif
6221 
6222 #if KMP_OS_WINDOWS
6223  __kmp_close_console();
6224 #endif
6225 
6226  __kmp_fini_allocator();
6227 
6228 } // __kmp_internal_end_library
6229 
6230 void __kmp_internal_end_thread(int gtid_req) {
6231  int i;
6232 
6233  /* if we have already cleaned up, don't try again, it wouldn't be pretty */
6234  /* this shouldn't be a race condition because __kmp_internal_end() is the
6235  * only place to clear __kmp_serial_init */
6236  /* we'll check this later too, after we get the lock */
6237  // 2009-09-06: We do not set g_abort without setting g_done. This check looks
6238  // redundant, because the next check will work in any case.
6239  if (__kmp_global.g.g_abort) {
6240  KA_TRACE(11, ("__kmp_internal_end_thread: abort, exiting\n"));
6241  /* TODO abort? */
6242  return;
6243  }
6244  if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6245  KA_TRACE(10, ("__kmp_internal_end_thread: already finished\n"));
6246  return;
6247  }
6248 
6249  KMP_MB(); /* Flush all pending memory write invalidates. */
6250 
6251  /* find out who we are and what we should do */
6252  {
6253  int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific();
6254  KA_TRACE(10,
6255  ("__kmp_internal_end_thread: enter T#%d (%d)\n", gtid, gtid_req));
6256  if (gtid == KMP_GTID_SHUTDOWN) {
6257  KA_TRACE(10, ("__kmp_internal_end_thread: !__kmp_init_runtime, system "
6258  "already shutdown\n"));
6259  return;
6260  } else if (gtid == KMP_GTID_MONITOR) {
6261  KA_TRACE(10, ("__kmp_internal_end_thread: monitor thread, gtid not "
6262  "registered, or system shutdown\n"));
6263  return;
6264  } else if (gtid == KMP_GTID_DNE) {
6265  KA_TRACE(10, ("__kmp_internal_end_thread: gtid not registered or system "
6266  "shutdown\n"));
6267  return;
6268  /* we don't know who we are */
6269  } else if (KMP_UBER_GTID(gtid)) {
6270  /* unregister ourselves as an uber thread. gtid is no longer valid */
6271  if (__kmp_root[gtid]->r.r_active) {
6272  __kmp_global.g.g_abort = -1;
6273  TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6274  KA_TRACE(10,
6275  ("__kmp_internal_end_thread: root still active, abort T#%d\n",
6276  gtid));
6277  return;
6278  } else {
6279  KA_TRACE(10, ("__kmp_internal_end_thread: unregistering sibling T#%d\n",
6280  gtid));
6281  __kmp_unregister_root_current_thread(gtid);
6282  }
6283  } else {
6284  /* just a worker thread, let's leave */
6285  KA_TRACE(10, ("__kmp_internal_end_thread: worker thread T#%d\n", gtid));
6286 
6287  if (gtid >= 0) {
6288  __kmp_threads[gtid]->th.th_task_team = NULL;
6289  }
6290 
6291  KA_TRACE(10,
6292  ("__kmp_internal_end_thread: worker thread done, exiting T#%d\n",
6293  gtid));
6294  return;
6295  }
6296  }
6297 #if KMP_DYNAMIC_LIB
6298  if (__kmp_pause_status != kmp_hard_paused)
6299  // AC: let's not shut down the dynamic library at the exit of an uber thread;
6300  // it is better to shut down later, in the library destructor.
6301  {
6302  KA_TRACE(10, ("__kmp_internal_end_thread: exiting T#%d\n", gtid_req));
6303  return;
6304  }
6305 #endif
6306  /* synchronize the termination process */
6307  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6308 
6309  /* have we already finished */
6310  if (__kmp_global.g.g_abort) {
6311  KA_TRACE(10, ("__kmp_internal_end_thread: abort, exiting\n"));
6312  /* TODO abort? */
6313  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6314  return;
6315  }
6316  if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6317  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6318  return;
6319  }
6320 
6321  /* We need this lock to enforce mutual exclusion between this reading of
6322  __kmp_threads_capacity and the writing by __kmp_register_root.
6323  Alternatively, we can use a counter of roots that is atomically updated by
6324  __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and
6325  __kmp_internal_end_*. */
6326 
6327  /* should we finish the run-time? are all siblings done? */
6328  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
6329 
6330  for (i = 0; i < __kmp_threads_capacity; ++i) {
6331  if (KMP_UBER_GTID(i)) {
6332  KA_TRACE(
6333  10,
6334  ("__kmp_internal_end_thread: remaining sibling task: gtid==%d\n", i));
6335  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6336  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6337  return;
6338  }
6339  }
6340 
6341  /* now we can safely conduct the actual termination */
6342 
6343  __kmp_internal_end();
6344 
6345  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6346  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6347 
6348  KA_TRACE(10, ("__kmp_internal_end_thread: exit T#%d\n", gtid_req));
6349 
6350 #ifdef DUMP_DEBUG_ON_EXIT
6351  if (__kmp_debug_buf)
6352  __kmp_dump_debug_buffer();
6353 #endif
6354 } // __kmp_internal_end_thread
6355 
6356 // -----------------------------------------------------------------------------
6357 // Library registration stuff.
6358 
6359 static long __kmp_registration_flag = 0;
6360 // Random value used to indicate library initialization.
6361 static char *__kmp_registration_str = NULL;
6362 // Value to be saved in env var __KMP_REGISTERED_LIB_<pid>.
6363 
6364 static inline char *__kmp_reg_status_name() {
6365  /* On RHEL 3u5, if linked statically, getpid() returns different values in
6366  each thread. If registration and unregistration happen in different threads
6367  (omp_misc_other_root_exit.cpp test case), the registered_lib_env
6368  env var cannot be found, because its name will contain a different pid. */
6369  return __kmp_str_format("__KMP_REGISTERED_LIB_%d", (int)getpid());
6370 } // __kmp_reg_status_name
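/* Illustrative annotation (not part of the original source): with a
   hypothetical pid of 12345 this helper returns the string
   "__KMP_REGISTERED_LIB_12345", which is the name of the environment variable
   used for the registration handshake below. */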
6371 
6372 void __kmp_register_library_startup(void) {
6373 
6374  char *name = __kmp_reg_status_name(); // Name of the environment variable.
6375  int done = 0;
6376  union {
6377  double dtime;
6378  long ltime;
6379  } time;
6380 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
6381  __kmp_initialize_system_tick();
6382 #endif
6383  __kmp_read_system_time(&time.dtime);
6384  __kmp_registration_flag = 0xCAFE0000L | (time.ltime & 0x0000FFFFL);
6385  __kmp_registration_str =
6386  __kmp_str_format("%p-%lx-%s", &__kmp_registration_flag,
6387  __kmp_registration_flag, KMP_LIBRARY_FILE);
6388 
6389  KA_TRACE(50, ("__kmp_register_library_startup: %s=\"%s\"\n", name,
6390  __kmp_registration_str));
6391 
6392  while (!done) {
6393 
6394  char *value = NULL; // Actual value of the environment variable.
6395 
6396  // Set the environment variable, but do not overwrite it if it already exists.
6397  __kmp_env_set(name, __kmp_registration_str, 0);
6398  // Check that the variable was actually written.
6399  value = __kmp_env_get(name);
6400  if (value != NULL && strcmp(value, __kmp_registration_str) == 0) {
6401 
6402  done = 1; // Ok, environment variable set successfully, exit the loop.
6403 
6404  } else {
6405 
6406  // Oops. Write failed. Another copy of OpenMP RTL is in memory.
6407  // Check whether it is alive or dead.
6408  int neighbor = 0; // 0 -- unknown status, 1 -- alive, 2 -- dead.
6409  char *tail = value;
6410  char *flag_addr_str = NULL;
6411  char *flag_val_str = NULL;
6412  char const *file_name = NULL;
6413  __kmp_str_split(tail, '-', &flag_addr_str, &tail);
6414  __kmp_str_split(tail, '-', &flag_val_str, &tail);
6415  file_name = tail;
6416  if (tail != NULL) {
6417  long *flag_addr = 0;
6418  long flag_val = 0;
6419  KMP_SSCANF(flag_addr_str, "%p", RCAST(void**, &flag_addr));
6420  KMP_SSCANF(flag_val_str, "%lx", &flag_val);
6421  if (flag_addr != 0 && flag_val != 0 && strcmp(file_name, "") != 0) {
6422  // First, check whether environment-encoded address is mapped into
6423  // addr space.
6424  // If so, dereference it to see if it still has the right value.
6425  if (__kmp_is_address_mapped(flag_addr) && *flag_addr == flag_val) {
6426  neighbor = 1;
6427  } else {
6428  // If not, then we know the other copy of the library is no longer
6429  // running.
6430  neighbor = 2;
6431  }
6432  }
6433  }
6434  switch (neighbor) {
6435  case 0: // Cannot parse environment variable -- neighbor status unknown.
6436  // Assume it uses the incompatible format of a future version of the
6437  // library, and assume the other library is alive.
6438  // WARN( ... ); // TODO: Issue a warning.
6439  file_name = "unknown library";
6440  KMP_FALLTHROUGH();
6441  // Attention! Falling through to the next case is intentional.
6442  case 1: { // Neighbor is alive.
6443  // Check whether duplicate libraries are allowed.
6444  char *duplicate_ok = __kmp_env_get("KMP_DUPLICATE_LIB_OK");
6445  if (!__kmp_str_match_true(duplicate_ok)) {
6446  // That is not allowed. Issue a fatal error.
6447  __kmp_fatal(KMP_MSG(DuplicateLibrary, KMP_LIBRARY_FILE, file_name),
6448  KMP_HNT(DuplicateLibrary), __kmp_msg_null);
6449  }
6450  KMP_INTERNAL_FREE(duplicate_ok);
6451  __kmp_duplicate_library_ok = 1;
6452  done = 1; // Exit the loop.
6453  } break;
6454  case 2: { // Neighbor is dead.
6455  // Clear the variable and try to register library again.
6456  __kmp_env_unset(name);
6457  } break;
6458  default: { KMP_DEBUG_ASSERT(0); } break;
6459  }
6460  }
6461  KMP_INTERNAL_FREE((void *)value);
6462  }
6463  KMP_INTERNAL_FREE((void *)name);
6464 
6465 } // func __kmp_register_library_startup
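/* Illustrative sketch of the registration handshake above (annotation only;
   all concrete values are hypothetical). The first runtime in a process sets,
   for example,
       __KMP_REGISTERED_LIB_12345=0x7f12a4c0e010-cafe1a2b-libomp.so
   i.e. "%p-%lx-%s": the address of its __kmp_registration_flag, the flag value
   in hex, and KMP_LIBRARY_FILE. A second copy that fails to set the variable
   parses the three fields back, checks whether the address is still mapped and
   still holds the flag value (neighbor alive), and then either aborts with the
   DuplicateLibrary error -- unless KMP_DUPLICATE_LIB_OK is set to a true
   value -- or unsets the stale variable and retries (neighbor dead). */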
6466 
6467 void __kmp_unregister_library(void) {
6468 
6469  char *name = __kmp_reg_status_name();
6470  char *value = __kmp_env_get(name);
6471 
6472  KMP_DEBUG_ASSERT(__kmp_registration_flag != 0);
6473  KMP_DEBUG_ASSERT(__kmp_registration_str != NULL);
6474  if (value != NULL && strcmp(value, __kmp_registration_str) == 0) {
6475  // Ok, this is our variable. Delete it.
6476  __kmp_env_unset(name);
6477  }
6478 
6479  KMP_INTERNAL_FREE(__kmp_registration_str);
6480  KMP_INTERNAL_FREE(value);
6481  KMP_INTERNAL_FREE(name);
6482 
6483  __kmp_registration_flag = 0;
6484  __kmp_registration_str = NULL;
6485 
6486 } // __kmp_unregister_library
6487 
6488 // End of Library registration stuff.
6489 // -----------------------------------------------------------------------------
6490 
6491 #if KMP_MIC_SUPPORTED
6492 
6493 static void __kmp_check_mic_type() {
6494  kmp_cpuid_t cpuid_state = {0};
6495  kmp_cpuid_t *cs_p = &cpuid_state;
6496  __kmp_x86_cpuid(1, 0, cs_p);
6497  // We don't support mic1 at the moment
6498  if ((cs_p->eax & 0xff0) == 0xB10) {
6499  __kmp_mic_type = mic2;
6500  } else if ((cs_p->eax & 0xf0ff0) == 0x50670) {
6501  __kmp_mic_type = mic3;
6502  } else {
6503  __kmp_mic_type = non_mic;
6504  }
6505 }
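/* Annotation (not part of the original source): the masks above select the
   family/model fields of CPUID leaf 1 EAX (stepping [3:0], model [7:4],
   family [11:8], extended model [19:16]). 0xB10 under mask 0xff0 corresponds
   to family 0x0B, model 1 (Knights Corner, reported as mic2); 0x50670 under
   mask 0xf0ff0 corresponds to family 6, extended model 5, model 7, i.e.
   display model 0x57 (Knights Landing, reported as mic3). */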
6506 
6507 #endif /* KMP_MIC_SUPPORTED */
6508 
6509 static void __kmp_do_serial_initialize(void) {
6510  int i, gtid;
6511  int size;
6512 
6513  KA_TRACE(10, ("__kmp_do_serial_initialize: enter\n"));
6514 
6515  KMP_DEBUG_ASSERT(sizeof(kmp_int32) == 4);
6516  KMP_DEBUG_ASSERT(sizeof(kmp_uint32) == 4);
6517  KMP_DEBUG_ASSERT(sizeof(kmp_int64) == 8);
6518  KMP_DEBUG_ASSERT(sizeof(kmp_uint64) == 8);
6519  KMP_DEBUG_ASSERT(sizeof(kmp_intptr_t) == sizeof(void *));
6520 
6521 #if OMPT_SUPPORT
6522  ompt_pre_init();
6523 #endif
6524 
6525  __kmp_validate_locks();
6526 
6527  /* Initialize internal memory allocator */
6528  __kmp_init_allocator();
6529 
6530  /* Register the library startup via an environment variable and check to see
6531  whether another copy of the library is already registered. */
6532 
6533  __kmp_register_library_startup();
6534 
6535  /* TODO reinitialization of library */
6536  if (TCR_4(__kmp_global.g.g_done)) {
6537  KA_TRACE(10, ("__kmp_do_serial_initialize: reinitialization of library\n"));
6538  }
6539 
6540  __kmp_global.g.g_abort = 0;
6541  TCW_SYNC_4(__kmp_global.g.g_done, FALSE);
6542 
6543 /* initialize the locks */
6544 #if KMP_USE_ADAPTIVE_LOCKS
6545 #if KMP_DEBUG_ADAPTIVE_LOCKS
6546  __kmp_init_speculative_stats();
6547 #endif
6548 #endif
6549 #if KMP_STATS_ENABLED
6550  __kmp_stats_init();
6551 #endif
6552  __kmp_init_lock(&__kmp_global_lock);
6553  __kmp_init_queuing_lock(&__kmp_dispatch_lock);
6554  __kmp_init_lock(&__kmp_debug_lock);
6555  __kmp_init_atomic_lock(&__kmp_atomic_lock);
6556  __kmp_init_atomic_lock(&__kmp_atomic_lock_1i);
6557  __kmp_init_atomic_lock(&__kmp_atomic_lock_2i);
6558  __kmp_init_atomic_lock(&__kmp_atomic_lock_4i);
6559  __kmp_init_atomic_lock(&__kmp_atomic_lock_4r);
6560  __kmp_init_atomic_lock(&__kmp_atomic_lock_8i);
6561  __kmp_init_atomic_lock(&__kmp_atomic_lock_8r);
6562  __kmp_init_atomic_lock(&__kmp_atomic_lock_8c);
6563  __kmp_init_atomic_lock(&__kmp_atomic_lock_10r);
6564  __kmp_init_atomic_lock(&__kmp_atomic_lock_16r);
6565  __kmp_init_atomic_lock(&__kmp_atomic_lock_16c);
6566  __kmp_init_atomic_lock(&__kmp_atomic_lock_20c);
6567  __kmp_init_atomic_lock(&__kmp_atomic_lock_32c);
6568  __kmp_init_bootstrap_lock(&__kmp_forkjoin_lock);
6569  __kmp_init_bootstrap_lock(&__kmp_exit_lock);
6570 #if KMP_USE_MONITOR
6571  __kmp_init_bootstrap_lock(&__kmp_monitor_lock);
6572 #endif
6573  __kmp_init_bootstrap_lock(&__kmp_tp_cached_lock);
6574 
6575  /* conduct initialization and initial setup of configuration */
6576 
6577  __kmp_runtime_initialize();
6578 
6579 #if KMP_MIC_SUPPORTED
6580  __kmp_check_mic_type();
6581 #endif
6582 
6583 // Some global variable initialization moved here from kmp_env_initialize()
6584 #ifdef KMP_DEBUG
6585  kmp_diag = 0;
6586 #endif
6587  __kmp_abort_delay = 0;
6588 
6589  // From __kmp_init_dflt_team_nth()
6590  /* assume the entire machine will be used */
6591  __kmp_dflt_team_nth_ub = __kmp_xproc;
6592  if (__kmp_dflt_team_nth_ub < KMP_MIN_NTH) {
6593  __kmp_dflt_team_nth_ub = KMP_MIN_NTH;
6594  }
6595  if (__kmp_dflt_team_nth_ub > __kmp_sys_max_nth) {
6596  __kmp_dflt_team_nth_ub = __kmp_sys_max_nth;
6597  }
6598  __kmp_max_nth = __kmp_sys_max_nth;
6599  __kmp_cg_max_nth = __kmp_sys_max_nth;
6600  __kmp_teams_max_nth = __kmp_xproc; // set a "reasonable" default
6601  if (__kmp_teams_max_nth > __kmp_sys_max_nth) {
6602  __kmp_teams_max_nth = __kmp_sys_max_nth;
6603  }
6604 
6605  // Three vars below moved here from __kmp_env_initialize() "KMP_BLOCKTIME"
6606  // part
6607  __kmp_dflt_blocktime = KMP_DEFAULT_BLOCKTIME;
6608 #if KMP_USE_MONITOR
6609  __kmp_monitor_wakeups =
6610  KMP_WAKEUPS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups);
6611  __kmp_bt_intervals =
6612  KMP_INTERVALS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups);
6613 #endif
6614  // From "KMP_LIBRARY" part of __kmp_env_initialize()
6615  __kmp_library = library_throughput;
6616  // From KMP_SCHEDULE initialization
6617  __kmp_static = kmp_sch_static_balanced;
6618 // AC: do not use analytical here, because it is non-monotonic
6619 //__kmp_guided = kmp_sch_guided_iterative_chunked;
6620 //__kmp_auto = kmp_sch_guided_analytical_chunked; // AC: it is the default, no
6621 // need to repeat assignment
6622 // Barrier initialization. Moved here from __kmp_env_initialize() Barrier branch
6623 // bit control and barrier method control parts
6624 #if KMP_FAST_REDUCTION_BARRIER
6625 #define kmp_reduction_barrier_gather_bb ((int)1)
6626 #define kmp_reduction_barrier_release_bb ((int)1)
6627 #define kmp_reduction_barrier_gather_pat bp_hyper_bar
6628 #define kmp_reduction_barrier_release_pat bp_hyper_bar
6629 #endif // KMP_FAST_REDUCTION_BARRIER
6630  for (i = bs_plain_barrier; i < bs_last_barrier; i++) {
6631  __kmp_barrier_gather_branch_bits[i] = __kmp_barrier_gather_bb_dflt;
6632  __kmp_barrier_release_branch_bits[i] = __kmp_barrier_release_bb_dflt;
6633  __kmp_barrier_gather_pattern[i] = __kmp_barrier_gather_pat_dflt;
6634  __kmp_barrier_release_pattern[i] = __kmp_barrier_release_pat_dflt;
6635 #if KMP_FAST_REDUCTION_BARRIER
6636  if (i == bs_reduction_barrier) { // tested and confirmed on ALTIX only (
6637  // lin_64 ): hyper,1
6638  __kmp_barrier_gather_branch_bits[i] = kmp_reduction_barrier_gather_bb;
6639  __kmp_barrier_release_branch_bits[i] = kmp_reduction_barrier_release_bb;
6640  __kmp_barrier_gather_pattern[i] = kmp_reduction_barrier_gather_pat;
6641  __kmp_barrier_release_pattern[i] = kmp_reduction_barrier_release_pat;
6642  }
6643 #endif // KMP_FAST_REDUCTION_BARRIER
6644  }
6645 #if KMP_FAST_REDUCTION_BARRIER
6646 #undef kmp_reduction_barrier_release_pat
6647 #undef kmp_reduction_barrier_gather_pat
6648 #undef kmp_reduction_barrier_release_bb
6649 #undef kmp_reduction_barrier_gather_bb
6650 #endif // KMP_FAST_REDUCTION_BARRIER
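/* Annotation (not part of the original source): a branch-bits value b gives
   the gather/release trees a fan-out of (1 << b), so the reduction-barrier
   defaults chosen above (hyper pattern, branch bits 1) describe a hypercube
   barrier with pairwise exchanges. The other barrier types start from the
   compile-time defaults (__kmp_barrier_*_bb_dflt / *_pat_dflt), which the
   barrier-related environment settings may later override during
   __kmp_env_initialize(). */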
6651 #if KMP_MIC_SUPPORTED
6652  if (__kmp_mic_type == mic2) { // KNC
6653  // AC: plane=3,2, forkjoin=2,1 are optimal for 240 threads on KNC
6654  __kmp_barrier_gather_branch_bits[bs_plain_barrier] = 3; // plain gather
6655  __kmp_barrier_release_branch_bits[bs_forkjoin_barrier] =
6656  1; // forkjoin release
6657  __kmp_barrier_gather_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar;
6658  __kmp_barrier_release_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar;
6659  }
6660 #if KMP_FAST_REDUCTION_BARRIER
6661  if (__kmp_mic_type == mic2) { // KNC
6662  __kmp_barrier_gather_pattern[bs_reduction_barrier] = bp_hierarchical_bar;
6663  __kmp_barrier_release_pattern[bs_reduction_barrier] = bp_hierarchical_bar;
6664  }
6665 #endif // KMP_FAST_REDUCTION_BARRIER
6666 #endif // KMP_MIC_SUPPORTED
6667 
6668 // From KMP_CHECKS initialization
6669 #ifdef KMP_DEBUG
6670  __kmp_env_checks = TRUE; /* development versions have the extra checks */
6671 #else
6672  __kmp_env_checks = FALSE; /* port versions do not have the extra checks */
6673 #endif
6674 
6675  // From "KMP_FOREIGN_THREADS_THREADPRIVATE" initialization
6676  __kmp_foreign_tp = TRUE;
6677 
6678  __kmp_global.g.g_dynamic = FALSE;
6679  __kmp_global.g.g_dynamic_mode = dynamic_default;
6680 
6681  __kmp_env_initialize(NULL);
6682 
6683 // Print all messages in the message catalog, for testing purposes.
6684 #ifdef KMP_DEBUG
6685  char const *val = __kmp_env_get("KMP_DUMP_CATALOG");
6686  if (__kmp_str_match_true(val)) {
6687  kmp_str_buf_t buffer;
6688  __kmp_str_buf_init(&buffer);
6689  __kmp_i18n_dump_catalog(&buffer);
6690  __kmp_printf("%s", buffer.str);
6691  __kmp_str_buf_free(&buffer);
6692  }
6693  __kmp_env_free(&val);
6694 #endif
6695 
6696  __kmp_threads_capacity =
6697  __kmp_initial_threads_capacity(__kmp_dflt_team_nth_ub);
6698  // Moved here from __kmp_env_initialize() "KMP_ALL_THREADPRIVATE" part
6699  __kmp_tp_capacity = __kmp_default_tp_capacity(
6700  __kmp_dflt_team_nth_ub, __kmp_max_nth, __kmp_allThreadsSpecified);
6701 
6702  // If the library is shut down properly, both pools must be NULL. Just in
6703  // case, set them to NULL -- some memory may leak, but subsequent code will
6704  // work even if pools are not freed.
6705  KMP_DEBUG_ASSERT(__kmp_thread_pool == NULL);
6706  KMP_DEBUG_ASSERT(__kmp_thread_pool_insert_pt == NULL);
6707  KMP_DEBUG_ASSERT(__kmp_team_pool == NULL);
6708  __kmp_thread_pool = NULL;
6709  __kmp_thread_pool_insert_pt = NULL;
6710  __kmp_team_pool = NULL;
6711 
6712  /* Allocate all of the variable sized records */
6713  /* NOTE: __kmp_threads_capacity entries are allocated, but the arrays are
6714  * expandable */
6715  /* Since allocation is cache-aligned, just add extra padding at the end */
6716  size =
6717  (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * __kmp_threads_capacity +
6718  CACHE_LINE;
6719  __kmp_threads = (kmp_info_t **)__kmp_allocate(size);
6720  __kmp_root = (kmp_root_t **)((char *)__kmp_threads +
6721  sizeof(kmp_info_t *) * __kmp_threads_capacity);
6722 
6723  /* init thread counts */
6724  KMP_DEBUG_ASSERT(__kmp_all_nth ==
6725  0); // Asserts fail if the library is reinitializing and
6726  KMP_DEBUG_ASSERT(__kmp_nth == 0); // something was wrong in termination.
6727  __kmp_all_nth = 0;
6728  __kmp_nth = 0;
6729 
6730  /* setup the uber master thread and hierarchy */
6731  gtid = __kmp_register_root(TRUE);
6732  KA_TRACE(10, ("__kmp_do_serial_initialize T#%d\n", gtid));
6733  KMP_ASSERT(KMP_UBER_GTID(gtid));
6734  KMP_ASSERT(KMP_INITIAL_GTID(gtid));
6735 
6736  KMP_MB(); /* Flush all pending memory write invalidates. */
6737 
6738  __kmp_common_initialize();
6739 
6740 #if KMP_OS_UNIX
6741  /* invoke the child fork handler */
6742  __kmp_register_atfork();
6743 #endif
6744 
6745 #if !KMP_DYNAMIC_LIB
6746  {
6747  /* Invoke the exit handler when the program finishes, only for static
6748  library. For dynamic library, we already have _fini and DllMain. */
6749  int rc = atexit(__kmp_internal_end_atexit);
6750  if (rc != 0) {
6751  __kmp_fatal(KMP_MSG(FunctionError, "atexit()"), KMP_ERR(rc),
6752  __kmp_msg_null);
6753  }
6754  }
6755 #endif
6756 
6757 #if KMP_HANDLE_SIGNALS
6758 #if KMP_OS_UNIX
6759  /* NOTE: make sure that this is called before the user installs their own
6760  signal handlers so that the user handlers are called first. This way they
6761  can return false, not call our handler, avoid terminating the library, and
6762  continue execution where they left off. */
6763  __kmp_install_signals(FALSE);
6764 #endif /* KMP_OS_UNIX */
6765 #if KMP_OS_WINDOWS
6766  __kmp_install_signals(TRUE);
6767 #endif /* KMP_OS_WINDOWS */
6768 #endif
6769 
6770  /* we have finished the serial initialization */
6771  __kmp_init_counter++;
6772 
6773  __kmp_init_serial = TRUE;
6774 
6775  if (__kmp_settings) {
6776  __kmp_env_print();
6777  }
6778 
6779  if (__kmp_display_env || __kmp_display_env_verbose) {
6780  __kmp_env_print_2();
6781  }
6782 
6783 #if OMPT_SUPPORT
6784  ompt_post_init();
6785 #endif
6786 
6787  KMP_MB();
6788 
6789  KA_TRACE(10, ("__kmp_do_serial_initialize: exit\n"));
6790 }
6791 
6792 void __kmp_serial_initialize(void) {
6793  if (__kmp_init_serial) {
6794  return;
6795  }
6796  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6797  if (__kmp_init_serial) {
6798  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6799  return;
6800  }
6801  __kmp_do_serial_initialize();
6802  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6803 }
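/* Annotation (not part of the original source): __kmp_serial_initialize(),
   __kmp_middle_initialize() and __kmp_parallel_initialize() below all use the
   same double-checked pattern: test the init flag, acquire __kmp_initz_lock,
   re-test the flag, then run the corresponding __kmp_do_*_initialize() worker.
   Concurrent first calls therefore initialize exactly once, while the common
   already-initialized path stays lock-free. */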
6804 
6805 static void __kmp_do_middle_initialize(void) {
6806  int i, j;
6807  int prev_dflt_team_nth;
6808 
6809  if (!__kmp_init_serial) {
6810  __kmp_do_serial_initialize();
6811  }
6812 
6813  KA_TRACE(10, ("__kmp_middle_initialize: enter\n"));
6814 
6815  // Save the previous value for the __kmp_dflt_team_nth so that
6816  // we can avoid some reinitialization if it hasn't changed.
6817  prev_dflt_team_nth = __kmp_dflt_team_nth;
6818 
6819 #if KMP_AFFINITY_SUPPORTED
6820  // __kmp_affinity_initialize() will try to set __kmp_ncores to the
6821  // number of cores on the machine.
6822  __kmp_affinity_initialize();
6823 
6824  // Run through the __kmp_threads array and set the affinity mask
6825  // for each root thread that is currently registered with the RTL.
6826  for (i = 0; i < __kmp_threads_capacity; i++) {
6827  if (TCR_PTR(__kmp_threads[i]) != NULL) {
6828  __kmp_affinity_set_init_mask(i, TRUE);
6829  }
6830  }
6831 #endif /* KMP_AFFINITY_SUPPORTED */
6832 
6833  KMP_ASSERT(__kmp_xproc > 0);
6834  if (__kmp_avail_proc == 0) {
6835  __kmp_avail_proc = __kmp_xproc;
6836  }
6837 
6838  // If there were empty places in num_threads list (OMP_NUM_THREADS=,,2,3),
6839  // correct them now
6840  j = 0;
6841  while ((j < __kmp_nested_nth.used) && !__kmp_nested_nth.nth[j]) {
6842  __kmp_nested_nth.nth[j] = __kmp_dflt_team_nth = __kmp_dflt_team_nth_ub =
6843  __kmp_avail_proc;
6844  j++;
6845  }
6846 
6847  if (__kmp_dflt_team_nth == 0) {
6848 #ifdef KMP_DFLT_NTH_CORES
6849  // Default #threads = #cores
6850  __kmp_dflt_team_nth = __kmp_ncores;
6851  KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = "
6852  "__kmp_ncores (%d)\n",
6853  __kmp_dflt_team_nth));
6854 #else
6855  // Default #threads = #available OS procs
6856  __kmp_dflt_team_nth = __kmp_avail_proc;
6857  KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = "
6858  "__kmp_avail_proc(%d)\n",
6859  __kmp_dflt_team_nth));
6860 #endif /* KMP_DFLT_NTH_CORES */
6861  }
6862 
6863  if (__kmp_dflt_team_nth < KMP_MIN_NTH) {
6864  __kmp_dflt_team_nth = KMP_MIN_NTH;
6865  }
6866  if (__kmp_dflt_team_nth > __kmp_sys_max_nth) {
6867  __kmp_dflt_team_nth = __kmp_sys_max_nth;
6868  }
6869 
6870  // There's no harm in continuing if the following check fails,
6871  // but it indicates an error in the previous logic.
6872  KMP_DEBUG_ASSERT(__kmp_dflt_team_nth <= __kmp_dflt_team_nth_ub);
6873 
6874  if (__kmp_dflt_team_nth != prev_dflt_team_nth) {
6875  // Run through the __kmp_threads array and set the num threads icv for each
6876  // root thread that is currently registered with the RTL (which has not
6877  // already explicitly set its nthreads-var with a call to
6878  // omp_set_num_threads()).
6879  for (i = 0; i < __kmp_threads_capacity; i++) {
6880  kmp_info_t *thread = __kmp_threads[i];
6881  if (thread == NULL)
6882  continue;
6883  if (thread->th.th_current_task->td_icvs.nproc != 0)
6884  continue;
6885 
6886  set__nproc(__kmp_threads[i], __kmp_dflt_team_nth);
6887  }
6888  }
6889  KA_TRACE(
6890  20,
6891  ("__kmp_middle_initialize: final value for __kmp_dflt_team_nth = %d\n",
6892  __kmp_dflt_team_nth));
6893 
6894 #ifdef KMP_ADJUST_BLOCKTIME
6895  /* Adjust blocktime to zero if necessary now that __kmp_avail_proc is set */
6896  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
6897  KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
6898  if (__kmp_nth > __kmp_avail_proc) {
6899  __kmp_zero_bt = TRUE;
6900  }
6901  }
6902 #endif /* KMP_ADJUST_BLOCKTIME */
6903 
6904  /* we have finished middle initialization */
6905  TCW_SYNC_4(__kmp_init_middle, TRUE);
6906 
6907  KA_TRACE(10, ("__kmp_do_middle_initialize: exit\n"));
6908 }
6909 
6910 void __kmp_middle_initialize(void) {
6911  if (__kmp_init_middle) {
6912  return;
6913  }
6914  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6915  if (__kmp_init_middle) {
6916  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6917  return;
6918  }
6919  __kmp_do_middle_initialize();
6920  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6921 }
6922 
6923 void __kmp_parallel_initialize(void) {
6924  int gtid = __kmp_entry_gtid(); // this might be a new root
6925 
6926  /* synchronize parallel initialization (for sibling) */
6927  if (TCR_4(__kmp_init_parallel))
6928  return;
6929  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6930  if (TCR_4(__kmp_init_parallel)) {
6931  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6932  return;
6933  }
6934 
6935  /* TODO reinitialization after we have already shut down */
6936  if (TCR_4(__kmp_global.g.g_done)) {
6937  KA_TRACE(
6938  10,
6939  ("__kmp_parallel_initialize: attempt to init while shutting down\n"));
6940  __kmp_infinite_loop();
6941  }
6942 
6943  /* jc: The lock __kmp_initz_lock is already held, so calling
6944  __kmp_serial_initialize would cause a deadlock. So we call
6945  __kmp_do_serial_initialize directly. */
6946  if (!__kmp_init_middle) {
6947  __kmp_do_middle_initialize();
6948  }
6949  __kmp_resume_if_hard_paused();
6950 
6951  /* begin initialization */
6952  KA_TRACE(10, ("__kmp_parallel_initialize: enter\n"));
6953  KMP_ASSERT(KMP_UBER_GTID(gtid));
6954 
6955 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
6956  // Save the FP control regs.
6957  // Worker threads will set theirs to these values at thread startup.
6958  __kmp_store_x87_fpu_control_word(&__kmp_init_x87_fpu_control_word);
6959  __kmp_store_mxcsr(&__kmp_init_mxcsr);
6960  __kmp_init_mxcsr &= KMP_X86_MXCSR_MASK;
6961 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
6962 
6963 #if KMP_OS_UNIX
6964 #if KMP_HANDLE_SIGNALS
6965  /* must be after __kmp_serial_initialize */
6966  __kmp_install_signals(TRUE);
6967 #endif
6968 #endif
6969 
6970  __kmp_suspend_initialize();
6971 
6972 #if defined(USE_LOAD_BALANCE)
6973  if (__kmp_global.g.g_dynamic_mode == dynamic_default) {
6974  __kmp_global.g.g_dynamic_mode = dynamic_load_balance;
6975  }
6976 #else
6977  if (__kmp_global.g.g_dynamic_mode == dynamic_default) {
6978  __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
6979  }
6980 #endif
6981 
6982  if (__kmp_version) {
6983  __kmp_print_version_2();
6984  }
6985 
6986  /* we have finished parallel initialization */
6987  TCW_SYNC_4(__kmp_init_parallel, TRUE);
6988 
6989  KMP_MB();
6990  KA_TRACE(10, ("__kmp_parallel_initialize: exit\n"));
6991 
6992  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6993 }
6994 
6995 /* ------------------------------------------------------------------------ */
6996 
6997 void __kmp_run_before_invoked_task(int gtid, int tid, kmp_info_t *this_thr,
6998  kmp_team_t *team) {
6999  kmp_disp_t *dispatch;
7000 
7001  KMP_MB();
7002 
7003  /* none of the threads have encountered any constructs, yet. */
7004  this_thr->th.th_local.this_construct = 0;
7005 #if KMP_CACHE_MANAGE
7006  KMP_CACHE_PREFETCH(&this_thr->th.th_bar[bs_forkjoin_barrier].bb.b_arrived);
7007 #endif /* KMP_CACHE_MANAGE */
7008  dispatch = (kmp_disp_t *)TCR_PTR(this_thr->th.th_dispatch);
7009  KMP_DEBUG_ASSERT(dispatch);
7010  KMP_DEBUG_ASSERT(team->t.t_dispatch);
7011  // KMP_DEBUG_ASSERT( this_thr->th.th_dispatch == &team->t.t_dispatch[
7012  // this_thr->th.th_info.ds.ds_tid ] );
7013 
7014  dispatch->th_disp_index = 0; /* reset the dispatch buffer counter */
7015  dispatch->th_doacross_buf_idx = 0; // reset doacross dispatch buffer counter
7016  if (__kmp_env_consistency_check)
7017  __kmp_push_parallel(gtid, team->t.t_ident);
7018 
7019  KMP_MB(); /* Flush all pending memory write invalidates. */
7020 }
7021 
7022 void __kmp_run_after_invoked_task(int gtid, int tid, kmp_info_t *this_thr,
7023  kmp_team_t *team) {
7024  if (__kmp_env_consistency_check)
7025  __kmp_pop_parallel(gtid, team->t.t_ident);
7026 
7027  __kmp_finish_implicit_task(this_thr);
7028 }
7029 
7030 int __kmp_invoke_task_func(int gtid) {
7031  int rc;
7032  int tid = __kmp_tid_from_gtid(gtid);
7033  kmp_info_t *this_thr = __kmp_threads[gtid];
7034  kmp_team_t *team = this_thr->th.th_team;
7035 
7036  __kmp_run_before_invoked_task(gtid, tid, this_thr, team);
7037 #if USE_ITT_BUILD
7038  if (__itt_stack_caller_create_ptr) {
7039  __kmp_itt_stack_callee_enter(
7040  (__itt_caller)
7041  team->t.t_stack_id); // inform ittnotify about entering user's code
7042  }
7043 #endif /* USE_ITT_BUILD */
7044 #if INCLUDE_SSC_MARKS
7045  SSC_MARK_INVOKING();
7046 #endif
7047 
7048 #if OMPT_SUPPORT
7049  void *dummy;
7050  void **exit_frame_p;
7051  ompt_data_t *my_task_data;
7052  ompt_data_t *my_parallel_data;
7053  int ompt_team_size;
7054 
7055  if (ompt_enabled.enabled) {
7056  exit_frame_p = &(
7057  team->t.t_implicit_task_taskdata[tid].ompt_task_info.frame.exit_frame.ptr);
7058  } else {
7059  exit_frame_p = &dummy;
7060  }
7061 
7062  my_task_data =
7063  &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data);
7064  my_parallel_data = &(team->t.ompt_team_info.parallel_data);
7065  if (ompt_enabled.ompt_callback_implicit_task) {
7066  ompt_team_size = team->t.t_nproc;
7067  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
7068  ompt_scope_begin, my_parallel_data, my_task_data, ompt_team_size,
7069  __kmp_tid_from_gtid(gtid), ompt_task_implicit);
7070  OMPT_CUR_TASK_INFO(this_thr)->thread_num = __kmp_tid_from_gtid(gtid);
7071  }
7072 #endif
7073 
7074 #if KMP_STATS_ENABLED
7075  stats_state_e previous_state = KMP_GET_THREAD_STATE();
7076  if (previous_state == stats_state_e::TEAMS_REGION) {
7077  KMP_PUSH_PARTITIONED_TIMER(OMP_teams);
7078  } else {
7079  KMP_PUSH_PARTITIONED_TIMER(OMP_parallel);
7080  }
7081  KMP_SET_THREAD_STATE(IMPLICIT_TASK);
7082 #endif
7083 
7084  rc = __kmp_invoke_microtask((microtask_t)TCR_SYNC_PTR(team->t.t_pkfn), gtid,
7085  tid, (int)team->t.t_argc, (void **)team->t.t_argv
7086 #if OMPT_SUPPORT
7087  ,
7088  exit_frame_p
7089 #endif
7090  );
7091 #if OMPT_SUPPORT
7092  *exit_frame_p = NULL;
7093  this_thr->th.ompt_thread_info.parallel_flags |= ompt_parallel_team;
7094 #endif
7095 
7096 #if KMP_STATS_ENABLED
7097  if (previous_state == stats_state_e::TEAMS_REGION) {
7098  KMP_SET_THREAD_STATE(previous_state);
7099  }
7100  KMP_POP_PARTITIONED_TIMER();
7101 #endif
7102 
7103 #if USE_ITT_BUILD
7104  if (__itt_stack_caller_create_ptr) {
7105  __kmp_itt_stack_callee_leave(
7106  (__itt_caller)
7107  team->t.t_stack_id); // inform ittnotify about leaving user's code
7108  }
7109 #endif /* USE_ITT_BUILD */
7110  __kmp_run_after_invoked_task(gtid, tid, this_thr, team);
7111 
7112  return rc;
7113 }
7114 
7115 void __kmp_teams_master(int gtid) {
7116  // This routine is called by all master threads in the teams construct
7117  kmp_info_t *thr = __kmp_threads[gtid];
7118  kmp_team_t *team = thr->th.th_team;
7119  ident_t *loc = team->t.t_ident;
7120  thr->th.th_set_nproc = thr->th.th_teams_size.nth;
7121  KMP_DEBUG_ASSERT(thr->th.th_teams_microtask);
7122  KMP_DEBUG_ASSERT(thr->th.th_set_nproc);
7123  KA_TRACE(20, ("__kmp_teams_master: T#%d, Tid %d, microtask %p\n", gtid,
7124  __kmp_tid_from_gtid(gtid), thr->th.th_teams_microtask));
7125 
7126  // This thread is a new CG root. Set up the proper variables.
7127  kmp_cg_root_t *tmp = (kmp_cg_root_t *)__kmp_allocate(sizeof(kmp_cg_root_t));
7128  tmp->cg_root = thr; // Make thr the CG root
7129  // Init to thread limit that was stored when league masters were forked
7130  tmp->cg_thread_limit = thr->th.th_current_task->td_icvs.thread_limit;
7131  tmp->cg_nthreads = 1; // Init counter to one active thread, this one
7132  KA_TRACE(100, ("__kmp_teams_master: Thread %p created node %p and init"
7133  " cg_nthreads to 1\n",
7134  thr, tmp));
7135  tmp->up = thr->th.th_cg_roots;
7136  thr->th.th_cg_roots = tmp;
7137 
7138 // Launch the league of teams now, but do not let workers execute
7139 // (they wait on the fork barrier until the next parallel region)
7140 #if INCLUDE_SSC_MARKS
7141  SSC_MARK_FORKING();
7142 #endif
7143  __kmp_fork_call(loc, gtid, fork_context_intel, team->t.t_argc,
7144  (microtask_t)thr->th.th_teams_microtask, // "wrapped" task
7145  VOLATILE_CAST(launch_t) __kmp_invoke_task_func, NULL);
7146 #if INCLUDE_SSC_MARKS
7147  SSC_MARK_JOINING();
7148 #endif
7149  // If the team size was reduced from the limit, set it to the new size
7150  if (thr->th.th_team_nproc < thr->th.th_teams_size.nth)
7151  thr->th.th_teams_size.nth = thr->th.th_team_nproc;
7152  // AC: last parameter "1" eliminates the join barrier, which won't work because
7153  // worker threads are at a fork barrier waiting for more parallel regions
7154  __kmp_join_call(loc, gtid
7155 #if OMPT_SUPPORT
7156  ,
7157  fork_context_intel
7158 #endif
7159  ,
7160  1);
7161 }
7162 
7163 int __kmp_invoke_teams_master(int gtid) {
7164  kmp_info_t *this_thr = __kmp_threads[gtid];
7165  kmp_team_t *team = this_thr->th.th_team;
7166 #if KMP_DEBUG
7167  if (!__kmp_threads[gtid]->th.th_team->t.t_serialized)
7168  KMP_DEBUG_ASSERT((void *)__kmp_threads[gtid]->th.th_team->t.t_pkfn ==
7169  (void *)__kmp_teams_master);
7170 #endif
7171  __kmp_run_before_invoked_task(gtid, 0, this_thr, team);
7172 #if OMPT_SUPPORT
7173  int tid = __kmp_tid_from_gtid(gtid);
7174  ompt_data_t *task_data =
7175  &team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data;
7176  ompt_data_t *parallel_data = &team->t.ompt_team_info.parallel_data;
7177  if (ompt_enabled.ompt_callback_implicit_task) {
7178  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
7179  ompt_scope_begin, parallel_data, task_data, team->t.t_nproc, tid,
7180  ompt_task_initial);
7181  OMPT_CUR_TASK_INFO(this_thr)->thread_num = tid;
7182  }
7183 #endif
7184  __kmp_teams_master(gtid);
7185 #if OMPT_SUPPORT
7186  this_thr->th.ompt_thread_info.parallel_flags |= ompt_parallel_league;
7187 #endif
7188  __kmp_run_after_invoked_task(gtid, 0, this_thr, team);
7189  return 1;
7190 }
7191 
7192 /* This sets the requested number of threads for the next parallel region
7193  encountered by this team. Since this should be enclosed in the fork/join
7194  critical section, it should avoid race conditions with asymmetrical nested
7195  parallelism. */
7196 
7197 void __kmp_push_num_threads(ident_t *id, int gtid, int num_threads) {
7198  kmp_info_t *thr = __kmp_threads[gtid];
7199 
7200  if (num_threads > 0)
7201  thr->th.th_set_nproc = num_threads;
7202 }
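/* Illustrative sketch (annotation only; the names of the outlined function and
   ident are hypothetical): for
       #pragma omp parallel num_threads(4)
   a compiler targeting this runtime typically emits something like
       __kmpc_push_num_threads(&loc, __kmpc_global_thread_num(&loc), 4);
       __kmpc_fork_call(&loc, 0, (kmpc_micro)outlined_fn);
   where __kmpc_push_num_threads() forwards to this routine, so th_set_nproc
   affects only the very next fork performed by this thread. */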
7203 
7204 /* this sets the requested number of teams for the teams region and/or
7205  the number of threads for the next parallel region encountered */
7206 void __kmp_push_num_teams(ident_t *id, int gtid, int num_teams,
7207  int num_threads) {
7208  kmp_info_t *thr = __kmp_threads[gtid];
7209  KMP_DEBUG_ASSERT(num_teams >= 0);
7210  KMP_DEBUG_ASSERT(num_threads >= 0);
7211 
7212  if (num_teams == 0)
7213  num_teams = 1; // default number of teams is 1.
7214  if (num_teams > __kmp_teams_max_nth) { // too many teams requested?
7215  if (!__kmp_reserve_warn) {
7216  __kmp_reserve_warn = 1;
7217  __kmp_msg(kmp_ms_warning,
7218  KMP_MSG(CantFormThrTeam, num_teams, __kmp_teams_max_nth),
7219  KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7220  }
7221  num_teams = __kmp_teams_max_nth;
7222  }
7223  // Set number of teams (number of threads in the outer "parallel" of the
7224  // teams)
7225  thr->th.th_set_nproc = thr->th.th_teams_size.nteams = num_teams;
7226 
7227  // Remember the number of threads for inner parallel regions
7228  if (!TCR_4(__kmp_init_middle))
7229  __kmp_middle_initialize(); // get internal globals calculated
7230  KMP_DEBUG_ASSERT(__kmp_avail_proc);
7231  KMP_DEBUG_ASSERT(__kmp_dflt_team_nth);
7232  if (num_threads == 0) {
7233  num_threads = __kmp_avail_proc / num_teams;
7234  // adjust num_threads without a warning, since it is not a user setting
7235  // num_threads = min(num_threads, nthreads-var, thread-limit-var)
7236  // no thread_limit clause specified - do not change thread-limit-var ICV
7237  if (num_threads > __kmp_dflt_team_nth) {
7238  num_threads = __kmp_dflt_team_nth; // honor nthreads-var ICV
7239  }
7240  if (num_threads > thr->th.th_current_task->td_icvs.thread_limit) {
7241  num_threads = thr->th.th_current_task->td_icvs.thread_limit;
7242  } // prevent the team size from exceeding thread-limit-var
7243  if (num_teams * num_threads > __kmp_teams_max_nth) {
7244  num_threads = __kmp_teams_max_nth / num_teams;
7245  }
7246  } else {
7247  // This thread will be the master of the league masters
7248  // Store new thread limit; old limit is saved in th_cg_roots list
7249  thr->th.th_current_task->td_icvs.thread_limit = num_threads;
7250  // num_threads = min(num_threads, nthreads-var)
7251  if (num_threads > __kmp_dflt_team_nth) {
7252  num_threads = __kmp_dflt_team_nth; // honor nthreads-var ICV
7253  }
7254  if (num_teams * num_threads > __kmp_teams_max_nth) {
7255  int new_threads = __kmp_teams_max_nth / num_teams;
7256  if (!__kmp_reserve_warn) { // user asked for too many threads
7257  __kmp_reserve_warn = 1; // conflicts with KMP_TEAMS_THREAD_LIMIT
7258  __kmp_msg(kmp_ms_warning,
7259  KMP_MSG(CantFormThrTeam, num_threads, new_threads),
7260  KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7261  }
7262  num_threads = new_threads;
7263  }
7264  }
7265  thr->th.th_teams_size.nth = num_threads;
7266 }
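/* Worked example (annotation only; the numbers are hypothetical): with
   __kmp_avail_proc = 64, __kmp_dflt_team_nth = 64, thread-limit-var = 64 and
   __kmp_teams_max_nth = 64, "teams num_teams(4)" with no thread_limit clause
   gets num_threads = 64 / 4 = 16 threads per team. With
   "teams num_teams(4) thread_limit(32)", 4 * 32 = 128 exceeds
   __kmp_teams_max_nth, so each team is trimmed to 64 / 4 = 16 and a single
   CantFormThrTeam warning is issued. */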
7267 
7268 // Set the proc_bind var to use in the following parallel region.
7269 void __kmp_push_proc_bind(ident_t *id, int gtid, kmp_proc_bind_t proc_bind) {
7270  kmp_info_t *thr = __kmp_threads[gtid];
7271  thr->th.th_set_proc_bind = proc_bind;
7272 }
7273 
7274 /* Launch the worker threads into the microtask. */
7275 
7276 void __kmp_internal_fork(ident_t *id, int gtid, kmp_team_t *team) {
7277  kmp_info_t *this_thr = __kmp_threads[gtid];
7278 
7279 #ifdef KMP_DEBUG
7280  int f;
7281 #endif /* KMP_DEBUG */
7282 
7283  KMP_DEBUG_ASSERT(team);
7284  KMP_DEBUG_ASSERT(this_thr->th.th_team == team);
7285  KMP_ASSERT(KMP_MASTER_GTID(gtid));
7286  KMP_MB(); /* Flush all pending memory write invalidates. */
7287 
7288  team->t.t_construct = 0; /* no single directives seen yet */
7289  team->t.t_ordered.dt.t_value =
7290  0; /* thread 0 enters the ordered section first */
7291 
7292  /* Reset the identifiers on the dispatch buffer */
7293  KMP_DEBUG_ASSERT(team->t.t_disp_buffer);
7294  if (team->t.t_max_nproc > 1) {
7295  int i;
7296  for (i = 0; i < __kmp_dispatch_num_buffers; ++i) {
7297  team->t.t_disp_buffer[i].buffer_index = i;
7298  team->t.t_disp_buffer[i].doacross_buf_idx = i;
7299  }
7300  } else {
7301  team->t.t_disp_buffer[0].buffer_index = 0;
7302  team->t.t_disp_buffer[0].doacross_buf_idx = 0;
7303  }
7304 
7305  KMP_MB(); /* Flush all pending memory write invalidates. */
7306  KMP_ASSERT(this_thr->th.th_team == team);
7307 
7308 #ifdef KMP_DEBUG
7309  for (f = 0; f < team->t.t_nproc; f++) {
7310  KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
7311  team->t.t_threads[f]->th.th_team_nproc == team->t.t_nproc);
7312  }
7313 #endif /* KMP_DEBUG */
7314 
7315  /* release the worker threads so they may begin working */
7316  __kmp_fork_barrier(gtid, 0);
7317 }
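/* Annotation (not part of the original source): seeding buffer_index /
   doacross_buf_idx with the slot number lets consecutive dynamically scheduled
   loops in the upcoming region rotate through the __kmp_dispatch_num_buffers
   dispatch buffers without needing a barrier between loops; each thread's
   private th_disp_index was reset to 0 in __kmp_run_before_invoked_task()
   above. */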
7318 
7319 void __kmp_internal_join(ident_t *id, int gtid, kmp_team_t *team) {
7320  kmp_info_t *this_thr = __kmp_threads[gtid];
7321 
7322  KMP_DEBUG_ASSERT(team);
7323  KMP_DEBUG_ASSERT(this_thr->th.th_team == team);
7324  KMP_ASSERT(KMP_MASTER_GTID(gtid));
7325  KMP_MB(); /* Flush all pending memory write invalidates. */
7326 
7327 /* Join barrier after fork */
7328 
7329 #ifdef KMP_DEBUG
7330  if (__kmp_threads[gtid] &&
7331  __kmp_threads[gtid]->th.th_team_nproc != team->t.t_nproc) {
7332  __kmp_printf("GTID: %d, __kmp_threads[%d]=%p\n", gtid, gtid,
7333  __kmp_threads[gtid]);
7334  __kmp_printf("__kmp_threads[%d]->th.th_team_nproc=%d, TEAM: %p, "
7335  "team->t.t_nproc=%d\n",
7336  gtid, __kmp_threads[gtid]->th.th_team_nproc, team,
7337  team->t.t_nproc);
7338  __kmp_print_structure();
7339  }
7340  KMP_DEBUG_ASSERT(__kmp_threads[gtid] &&
7341  __kmp_threads[gtid]->th.th_team_nproc == team->t.t_nproc);
7342 #endif /* KMP_DEBUG */
7343 
7344  __kmp_join_barrier(gtid); /* wait for everyone */
7345 #if OMPT_SUPPORT
7346  if (ompt_enabled.enabled &&
7347  this_thr->th.ompt_thread_info.state == ompt_state_wait_barrier_implicit) {
7348  int ds_tid = this_thr->th.th_info.ds.ds_tid;
7349  ompt_data_t *task_data = OMPT_CUR_TASK_DATA(this_thr);
7350  this_thr->th.ompt_thread_info.state = ompt_state_overhead;
7351 #if OMPT_OPTIONAL
7352  void *codeptr = NULL;
7353  if (KMP_MASTER_TID(ds_tid) &&
7354  (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) ||
7355  ompt_callbacks.ompt_callback(ompt_callback_sync_region)))
7356  codeptr = OMPT_CUR_TEAM_INFO(this_thr)->master_return_address;
7357 
7358  if (ompt_enabled.ompt_callback_sync_region_wait) {
7359  ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
7360  ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data,
7361  codeptr);
7362  }
7363  if (ompt_enabled.ompt_callback_sync_region) {
7364  ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
7365  ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data,
7366  codeptr);
7367  }
7368 #endif
7369  if (!KMP_MASTER_TID(ds_tid) && ompt_enabled.ompt_callback_implicit_task) {
7370  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
7371  ompt_scope_end, NULL, task_data, 0, ds_tid, ompt_task_implicit); // TODO: Can this be ompt_task_initial?
7372  }
7373  }
7374 #endif
7375 
7376  KMP_MB(); /* Flush all pending memory write invalidates. */
7377  KMP_ASSERT(this_thr->th.th_team == team);
7378 }
7379 
7380 /* ------------------------------------------------------------------------ */
7381 
7382 #ifdef USE_LOAD_BALANCE
7383 
7384 // Return the worker threads actively spinning in the hot team, if we
7385 // are at the outermost level of parallelism. Otherwise, return 0.
7386 static int __kmp_active_hot_team_nproc(kmp_root_t *root) {
7387  int i;
7388  int retval;
7389  kmp_team_t *hot_team;
7390 
7391  if (root->r.r_active) {
7392  return 0;
7393  }
7394  hot_team = root->r.r_hot_team;
7395  if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME) {
7396  return hot_team->t.t_nproc - 1; // Don't count master thread
7397  }
7398 
7399  // Skip the master thread - it is accounted for elsewhere.
7400  retval = 0;
7401  for (i = 1; i < hot_team->t.t_nproc; i++) {
7402  if (hot_team->t.t_threads[i]->th.th_active) {
7403  retval++;
7404  }
7405  }
7406  return retval;
7407 }
7408 
7409 // Perform an automatic adjustment to the number of
7410 // threads used by the next parallel region.
7411 static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc) {
7412  int retval;
7413  int pool_active;
7414  int hot_team_active;
7415  int team_curr_active;
7416  int system_active;
7417 
7418  KB_TRACE(20, ("__kmp_load_balance_nproc: called root:%p set_nproc:%d\n", root,
7419  set_nproc));
7420  KMP_DEBUG_ASSERT(root);
7421  KMP_DEBUG_ASSERT(root->r.r_root_team->t.t_threads[0]
7422  ->th.th_current_task->td_icvs.dynamic == TRUE);
7423  KMP_DEBUG_ASSERT(set_nproc > 1);
7424 
7425  if (set_nproc == 1) {
7426  KB_TRACE(20, ("__kmp_load_balance_nproc: serial execution.\n"));
7427  return 1;
7428  }
7429 
7430  // Threads that are active in the thread pool, active in the hot team for this
7431  // particular root (if we are at the outer par level), and the currently
7432  // executing thread (to become the master) are available to add to the new
7433  // team, but are currently contributing to the system load, and must be
7434  // accounted for.
7435  pool_active = __kmp_thread_pool_active_nth;
7436  hot_team_active = __kmp_active_hot_team_nproc(root);
7437  team_curr_active = pool_active + hot_team_active + 1;
7438 
7439  // Check the system load.
7440  system_active = __kmp_get_load_balance(__kmp_avail_proc + team_curr_active);
7441  KB_TRACE(30, ("__kmp_load_balance_nproc: system active = %d pool active = %d "
7442  "hot team active = %d\n",
7443  system_active, pool_active, hot_team_active));
7444 
7445  if (system_active < 0) {
7446  // There was an error reading the necessary info from /proc, so use the
7447  // thread limit algorithm instead. Once we set __kmp_global.g.g_dynamic_mode
7448  // = dynamic_thread_limit, we shouldn't wind up getting back here.
7449  __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
7450  KMP_WARNING(CantLoadBalUsing, "KMP_DYNAMIC_MODE=thread limit");
7451 
7452  // Make this call behave like the thread limit algorithm.
7453  retval = __kmp_avail_proc - __kmp_nth +
7454  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
7455  if (retval > set_nproc) {
7456  retval = set_nproc;
7457  }
7458  if (retval < KMP_MIN_NTH) {
7459  retval = KMP_MIN_NTH;
7460  }
7461 
7462  KB_TRACE(20, ("__kmp_load_balance_nproc: thread limit exit. retval:%d\n",
7463  retval));
7464  return retval;
7465  }
7466 
7467  // There is a slight delay in the load balance algorithm in detecting new
7468  // running procs. The real system load at this instant should be at least as
7469  // large as the #active omp threads that are available to add to the team.
7470  if (system_active < team_curr_active) {
7471  system_active = team_curr_active;
7472  }
7473  retval = __kmp_avail_proc - system_active + team_curr_active;
7474  if (retval > set_nproc) {
7475  retval = set_nproc;
7476  }
7477  if (retval < KMP_MIN_NTH) {
7478  retval = KMP_MIN_NTH;
7479  }
7480 
7481  KB_TRACE(20, ("__kmp_load_balance_nproc: exit. retval:%d\n", retval));
7482  return retval;
7483 } // __kmp_load_balance_nproc()
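/* Worked example (annotation only; the numbers are hypothetical): with
   __kmp_avail_proc = 16, 2 active pool threads, 5 active hot-team workers and
   set_nproc = 16, team_curr_active is 2 + 5 + 1 = 8. If /proc reports
   system_active = 12, the new team gets retval = 16 - 12 + 8 = 12 threads; if
   the machine is otherwise idle, system_active is clamped up to 8 and
   retval = 16 - 8 + 8 = 16, i.e. the full set_nproc. */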
7484 
7485 #endif /* USE_LOAD_BALANCE */
7486 
7487 /* ------------------------------------------------------------------------ */
7488 
7489 /* NOTE: this is called with the __kmp_init_lock held */
7490 void __kmp_cleanup(void) {
7491  int f;
7492 
7493  KA_TRACE(10, ("__kmp_cleanup: enter\n"));
7494 
7495  if (TCR_4(__kmp_init_parallel)) {
7496 #if KMP_HANDLE_SIGNALS
7497  __kmp_remove_signals();
7498 #endif
7499  TCW_4(__kmp_init_parallel, FALSE);
7500  }
7501 
7502  if (TCR_4(__kmp_init_middle)) {
7503 #if KMP_AFFINITY_SUPPORTED
7504  __kmp_affinity_uninitialize();
7505 #endif /* KMP_AFFINITY_SUPPORTED */
7506  __kmp_cleanup_hierarchy();
7507  TCW_4(__kmp_init_middle, FALSE);
7508  }
7509 
7510  KA_TRACE(10, ("__kmp_cleanup: go serial cleanup\n"));
7511 
7512  if (__kmp_init_serial) {
7513  __kmp_runtime_destroy();
7514  __kmp_init_serial = FALSE;
7515  }
7516 
7517  __kmp_cleanup_threadprivate_caches();
7518 
7519  for (f = 0; f < __kmp_threads_capacity; f++) {
7520  if (__kmp_root[f] != NULL) {
7521  __kmp_free(__kmp_root[f]);
7522  __kmp_root[f] = NULL;
7523  }
7524  }
7525  __kmp_free(__kmp_threads);
7526  // __kmp_threads and __kmp_root were allocated at once, as a single block, so
7527  // there is no need to free __kmp_root separately.
7528  __kmp_threads = NULL;
7529  __kmp_root = NULL;
7530  __kmp_threads_capacity = 0;
7531 
7532 #if KMP_USE_DYNAMIC_LOCK
7533  __kmp_cleanup_indirect_user_locks();
7534 #else
7535  __kmp_cleanup_user_locks();
7536 #endif
7537 
7538 #if KMP_AFFINITY_SUPPORTED
7539  KMP_INTERNAL_FREE(CCAST(char *, __kmp_cpuinfo_file));
7540  __kmp_cpuinfo_file = NULL;
7541 #endif /* KMP_AFFINITY_SUPPORTED */
7542 
7543 #if KMP_USE_ADAPTIVE_LOCKS
7544 #if KMP_DEBUG_ADAPTIVE_LOCKS
7545  __kmp_print_speculative_stats();
7546 #endif
7547 #endif
7548  KMP_INTERNAL_FREE(__kmp_nested_nth.nth);
7549  __kmp_nested_nth.nth = NULL;
7550  __kmp_nested_nth.size = 0;
7551  __kmp_nested_nth.used = 0;
7552  KMP_INTERNAL_FREE(__kmp_nested_proc_bind.bind_types);
7553  __kmp_nested_proc_bind.bind_types = NULL;
7554  __kmp_nested_proc_bind.size = 0;
7555  __kmp_nested_proc_bind.used = 0;
7556  if (__kmp_affinity_format) {
7557  KMP_INTERNAL_FREE(__kmp_affinity_format);
7558  __kmp_affinity_format = NULL;
7559  }
7560 
7561  __kmp_i18n_catclose();
7562 
7563 #if KMP_USE_HIER_SCHED
7564  __kmp_hier_scheds.deallocate();
7565 #endif
7566 
7567 #if KMP_STATS_ENABLED
7568  __kmp_stats_fini();
7569 #endif
7570 
7571  KA_TRACE(10, ("__kmp_cleanup: exit\n"));
7572 }
7573 
7574 /* ------------------------------------------------------------------------ */
7575 
7576 int __kmp_ignore_mppbeg(void) {
7577  char *env;
7578 
7579  if ((env = getenv("KMP_IGNORE_MPPBEG")) != NULL) {
7580  if (__kmp_str_match_false(env))
7581  return FALSE;
7582  }
7583  // By default __kmpc_begin() is a no-op.
7584  return TRUE;
7585 }
7586 
7587 int __kmp_ignore_mppend(void) {
7588  char *env;
7589 
7590  if ((env = getenv("KMP_IGNORE_MPPEND")) != NULL) {
7591  if (__kmp_str_match_false(env))
7592  return FALSE;
7593  }
7594  // By default __kmpc_end() is a no-op.
7595  return TRUE;
7596 }
7597 
7598 void __kmp_internal_begin(void) {
7599  int gtid;
7600  kmp_root_t *root;
7601 
7602  /* this is a very important step as it will register new sibling threads
7603  and assign these new uber threads a new gtid */
7604  gtid = __kmp_entry_gtid();
7605  root = __kmp_threads[gtid]->th.th_root;
7606  KMP_ASSERT(KMP_UBER_GTID(gtid));
7607 
7608  if (root->r.r_begin)
7609  return;
7610  __kmp_acquire_lock(&root->r.r_begin_lock, gtid);
7611  if (root->r.r_begin) {
7612  __kmp_release_lock(&root->r.r_begin_lock, gtid);
7613  return;
7614  }
7615 
7616  root->r.r_begin = TRUE;
7617 
7618  __kmp_release_lock(&root->r.r_begin_lock, gtid);
7619 }
7620 
7621 /* ------------------------------------------------------------------------ */
7622 
7623 void __kmp_user_set_library(enum library_type arg) {
7624  int gtid;
7625  kmp_root_t *root;
7626  kmp_info_t *thread;
7627 
7628  /* first, make sure we are initialized so we can get our gtid */
7629 
7630  gtid = __kmp_entry_gtid();
7631  thread = __kmp_threads[gtid];
7632 
7633  root = thread->th.th_root;
7634 
7635  KA_TRACE(20, ("__kmp_user_set_library: enter T#%d, arg: %d, %d\n", gtid, arg,
7636  library_serial));
7637  if (root->r.r_in_parallel) { /* Must be called in serial section of top-level
7638  thread */
7639  KMP_WARNING(SetLibraryIncorrectCall);
7640  return;
7641  }
7642 
7643  switch (arg) {
7644  case library_serial:
7645  thread->th.th_set_nproc = 0;
7646  set__nproc(thread, 1);
7647  break;
7648  case library_turnaround:
7649  thread->th.th_set_nproc = 0;
7650  set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth
7651  : __kmp_dflt_team_nth_ub);
7652  break;
7653  case library_throughput:
7654  thread->th.th_set_nproc = 0;
7655  set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth
7656  : __kmp_dflt_team_nth_ub);
7657  break;
7658  default:
7659  KMP_FATAL(UnknownLibraryType, arg);
7660  }
7661 
7662  __kmp_aux_set_library(arg);
7663 }
7664 
7665 void __kmp_aux_set_stacksize(size_t arg) {
7666  if (!__kmp_init_serial)
7667  __kmp_serial_initialize();
7668 
7669 #if KMP_OS_DARWIN
7670  if (arg & (0x1000 - 1)) {
7671  arg &= ~(0x1000 - 1);
7672  if (arg + 0x1000) /* check for overflow if we round up */
7673  arg += 0x1000;
7674  }
7675 #endif
7676  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7677 
7678  /* only change the default stacksize before the first parallel region */
7679  if (!TCR_4(__kmp_init_parallel)) {
7680  size_t value = arg; /* argument is in bytes */
7681 
7682  if (value < __kmp_sys_min_stksize)
7683  value = __kmp_sys_min_stksize;
7684  else if (value > KMP_MAX_STKSIZE)
7685  value = KMP_MAX_STKSIZE;
7686 
7687  __kmp_stksize = value;
7688 
7689  __kmp_env_stksize = TRUE; /* was KMP_STACKSIZE specified? */
7690  }
7691 
7692  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7693 }
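// --- Illustrative sketch (not part of kmp_runtime.cpp) ----------------------
// The KMP_OS_DARWIN branch above rounds the request up to the next 0x1000
// (4 KiB) multiple, and the value is then clamped to the
// [__kmp_sys_min_stksize, KMP_MAX_STKSIZE] range. The same arithmetic as a
// standalone helper (function name and parameters are hypothetical):
#if 0 // example only
#include <cstddef>

static size_t round_and_clamp_stacksize(size_t bytes, size_t min_sz,
                                        size_t max_sz) {
  const size_t page = 0x1000;
  if (bytes & (page - 1)) { // not already page aligned
    bytes &= ~(page - 1);   // round down to a page boundary ...
    if (bytes + page)       // ... then up, unless the addition would wrap
      bytes += page;
  }
  if (bytes < min_sz)
    bytes = min_sz;
  else if (bytes > max_sz)
    bytes = max_sz;
  return bytes;
}
#endif // example only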
7694 
7695 /* set the behaviour of the runtime library */
7696 /* TODO this can cause some odd behaviour with sibling parallelism... */
7697 void __kmp_aux_set_library(enum library_type arg) {
7698  __kmp_library = arg;
7699 
7700  switch (__kmp_library) {
7701  case library_serial: {
7702  KMP_INFORM(LibraryIsSerial);
7703  } break;
7704  case library_turnaround:
7705  if (__kmp_use_yield == 1 && !__kmp_use_yield_exp_set)
7706  __kmp_use_yield = 2; // only yield when oversubscribed
7707  break;
7708  case library_throughput:
7709  if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME)
7710  __kmp_dflt_blocktime = 200;
7711  break;
7712  default:
7713  KMP_FATAL(UnknownLibraryType, arg);
7714  }
7715 }
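// --- Illustrative example (not part of kmp_runtime.cpp) ---------------------
// From user code the library mode is normally selected through the kmp_*
// extension entry points declared in this runtime's omp.h, which reach
// __kmp_user_set_library()/__kmp_aux_set_library() above. Sketch (assumes the
// kmp_set_library_throughput() extension is available):
#if 0 // example only
#include <omp.h>

int main() {
  kmp_set_library_throughput(); // select "throughput" mode before first use
#pragma omp parallel
  { /* worker threads use the throughput wait policy from here on */ }
  return 0;
}
#endif // example only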
7716 
7717 /* Getting team information common for all team API */
7718 // Returns NULL if not in teams construct
7719 static kmp_team_t *__kmp_aux_get_team_info(int &teams_serialized) {
7720  kmp_info_t *thr = __kmp_entry_thread();
7721  teams_serialized = 0;
7722  if (thr->th.th_teams_microtask) {
7723  kmp_team_t *team = thr->th.th_team;
7724  int tlevel = thr->th.th_teams_level; // the level of the teams construct
7725  int ii = team->t.t_level;
7726  teams_serialized = team->t.t_serialized;
7727  int level = tlevel + 1;
7728  KMP_DEBUG_ASSERT(ii >= tlevel);
7729  while (ii > level) {
7730  for (teams_serialized = team->t.t_serialized;
7731  (teams_serialized > 0) && (ii > level); teams_serialized--, ii--) {
7732  }
7733  if (team->t.t_serialized && (!teams_serialized)) {
7734  team = team->t.t_parent;
7735  continue;
7736  }
7737  if (ii > level) {
7738  team = team->t.t_parent;
7739  ii--;
7740  }
7741  }
7742  return team;
7743  }
7744  return NULL;
7745 }
7746 
7747 int __kmp_aux_get_team_num() {
7748  int serialized;
7749  kmp_team_t *team = __kmp_aux_get_team_info(serialized);
7750  if (team) {
7751  if (serialized > 1) {
7752  return 0; // teams region is serialized ( 1 team of 1 thread ).
7753  } else {
7754  return team->t.t_master_tid;
7755  }
7756  }
7757  return 0;
7758 }
7759 
7760 int __kmp_aux_get_num_teams() {
7761  int serialized;
7762  kmp_team_t *team = __kmp_aux_get_team_info(serialized);
7763  if (team) {
7764  if (serialized > 1) {
7765  return 1;
7766  } else {
7767  return team->t.t_parent->t.t_nproc;
7768  }
7769  }
7770  return 1;
7771 }
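// --- Illustrative example (not part of kmp_runtime.cpp) ---------------------
// The two helpers above back the user-level omp_get_team_num() and
// omp_get_num_teams() queries. A minimal host-teams usage sketch (assumes an
// OpenMP 5.0 compiler that supports teams outside of target):
#if 0 // example only
#include <omp.h>
#include <stdio.h>

int main() {
#pragma omp teams num_teams(4)
  {
    printf("team %d of %d\n", omp_get_team_num(), omp_get_num_teams());
  }
  return 0;
}
#endif // example only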
7772 
7773 /* ------------------------------------------------------------------------ */
7774 
7775 /*
7776  * Affinity Format Parser
7777  *
7778  * Field is in form of: %[[[0].]size]type
7779  * % and type are required (%% means print a literal '%')
7780  * type is either single char or long name surrounded by {},
7781  * e.g., N or {num_threads}
7782  * 0 => leading zeros
7783  * . => right justified when size is specified
7784  * by default output is left justified
7785  * size is the *minimum* field length
7786  * All other characters are printed as is
7787  *
7788  * Available field types:
7789  * t {team_num}          - omp_get_team_num()
7790  * T {num_teams}         - omp_get_num_teams()
7791  * L {nesting_level}     - omp_get_level()
7792  * n {thread_num}        - omp_get_thread_num()
7793  * N {num_threads}       - omp_get_num_threads()
7794  * a {ancestor_tnum}     - omp_get_ancestor_thread_num(omp_get_level()-1)
7795  * H {host}              - name of host machine
7796  * P {process_id}, i {native_thread_id} - process / native thread id (integer)
7797  * A {thread_affinity}   - comma separated list of integers or integer ranges (values of affinity mask)
7798  *
7799  * Implementation-specific field types can be added
7800  * If a type is unknown, print "undefined"
7801 */
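// --- Illustrative example (not part of kmp_runtime.cpp) ---------------------
// How this format string is typically exercised from user code via the
// OpenMP 5.0 affinity display API; field names follow
// __kmp_affinity_format_table below ("%0.4n" becomes "%04d", see the parser):
#if 0 // example only
#include <omp.h>

int main() {
  omp_set_affinity_format("OMP: T#%0.4n host=%H pid=%P mask=%A");
#pragma omp parallel
  omp_display_affinity(NULL); // NULL => use the affinity-format-var ICV
  return 0;
}
#endif // example only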
7802 
7803 // Structure holding the short name, long name, and corresponding data type
7804 // for snprintf. A table of these will represent the entire valid keyword
7805 // field types.
7806 typedef struct kmp_affinity_format_field_t {
7807  char short_name; // from spec e.g., L -> thread level
7808  const char *long_name; // from spec thread_level -> thread level
7809  char field_format; // data type for snprintf (typically 'd' or 's'
7810  // for integer or string)
7811 } kmp_affinity_format_field_t;
7812 
7813 static const kmp_affinity_format_field_t __kmp_affinity_format_table[] = {
7814 #if KMP_AFFINITY_SUPPORTED
7815  {'A', "thread_affinity", 's'},
7816 #endif
7817  {'t', "team_num", 'd'},
7818  {'T', "num_teams", 'd'},
7819  {'L', "nesting_level", 'd'},
7820  {'n', "thread_num", 'd'},
7821  {'N', "num_threads", 'd'},
7822  {'a', "ancestor_tnum", 'd'},
7823  {'H', "host", 's'},
7824  {'P', "process_id", 'd'},
7825  {'i', "native_thread_id", 'd'}};
7826 
7827 // Return the number of characters it takes to hold field
7828 static int __kmp_aux_capture_affinity_field(int gtid, const kmp_info_t *th,
7829  const char **ptr,
7830  kmp_str_buf_t *field_buffer) {
7831  int rc, format_index, field_value;
7832  const char *width_left, *width_right;
7833  bool pad_zeros, right_justify, parse_long_name, found_valid_name;
7834  static const int FORMAT_SIZE = 20;
7835  char format[FORMAT_SIZE] = {0};
7836  char absolute_short_name = 0;
7837 
7838  KMP_DEBUG_ASSERT(gtid >= 0);
7839  KMP_DEBUG_ASSERT(th);
7840  KMP_DEBUG_ASSERT(**ptr == '%');
7841  KMP_DEBUG_ASSERT(field_buffer);
7842 
7843  __kmp_str_buf_clear(field_buffer);
7844 
7845  // Skip the initial %
7846  (*ptr)++;
7847 
7848  // Check for %% first
7849  if (**ptr == '%') {
7850  __kmp_str_buf_cat(field_buffer, "%", 1);
7851  (*ptr)++; // skip over the second %
7852  return 1;
7853  }
7854 
7855  // Parse field modifiers if they are present
7856  pad_zeros = false;
7857  if (**ptr == '0') {
7858  pad_zeros = true;
7859  (*ptr)++; // skip over 0
7860  }
7861  right_justify = false;
7862  if (**ptr == '.') {
7863  right_justify = true;
7864  (*ptr)++; // skip over .
7865  }
7866  // Parse width of field: [width_left, width_right)
7867  width_left = width_right = NULL;
7868  if (**ptr >= '0' && **ptr <= '9') {
7869  width_left = *ptr;
7870  SKIP_DIGITS(*ptr);
7871  width_right = *ptr;
7872  }
7873 
7874  // Create the format for KMP_SNPRINTF based on flags parsed above
7875  format_index = 0;
7876  format[format_index++] = '%';
7877  if (!right_justify)
7878  format[format_index++] = '-';
7879  if (pad_zeros)
7880  format[format_index++] = '0';
7881  if (width_left && width_right) {
7882  int i = 0;
7883  // Only allow 8 digit number widths.
7884  // This also prevents overflowing format variable
7885  while (i < 8 && width_left < width_right) {
7886  format[format_index++] = *width_left;
7887  width_left++;
7888  i++;
7889  }
7890  }
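  // Worked example (illustrative): for "%0.8n" both pad_zeros and
  // right_justify are set and the width is "8", so once the type character is
  // appended below the snprintf format becomes "%08d"; plain "%8n" becomes
  // "%-8d" (left justified by default).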
7891 
7892  // Parse a name (long or short)
7893  // Canonicalize the name into absolute_short_name
7894  found_valid_name = false;
7895  parse_long_name = (**ptr == '{');
7896  if (parse_long_name)
7897  (*ptr)++; // skip initial left brace
7898  for (size_t i = 0; i < sizeof(__kmp_affinity_format_table) /
7899  sizeof(__kmp_affinity_format_table[0]);
7900  ++i) {
7901  char short_name = __kmp_affinity_format_table[i].short_name;
7902  const char *long_name = __kmp_affinity_format_table[i].long_name;
7903  char field_format = __kmp_affinity_format_table[i].field_format;
7904  if (parse_long_name) {
7905  int length = KMP_STRLEN(long_name);
7906  if (strncmp(*ptr, long_name, length) == 0) {
7907  found_valid_name = true;
7908  (*ptr) += length; // skip the long name
7909  }
7910  } else if (**ptr == short_name) {
7911  found_valid_name = true;
7912  (*ptr)++; // skip the short name
7913  }
7914  if (found_valid_name) {
7915  format[format_index++] = field_format;
7916  format[format_index++] = '\0';
7917  absolute_short_name = short_name;
7918  break;
7919  }
7920  }
7921  if (parse_long_name) {
7922  if (**ptr != '}') {
7923  absolute_short_name = 0;
7924  } else {
7925  (*ptr)++; // skip over the right brace
7926  }
7927  }
7928 
7929  // Attempt to fill the buffer with the requested
7930  // value using snprintf within __kmp_str_buf_print()
7931  switch (absolute_short_name) {
7932  case 't':
7933  rc = __kmp_str_buf_print(field_buffer, format, __kmp_aux_get_team_num());
7934  break;
7935  case 'T':
7936  rc = __kmp_str_buf_print(field_buffer, format, __kmp_aux_get_num_teams());
7937  break;
7938  case 'L':
7939  rc = __kmp_str_buf_print(field_buffer, format, th->th.th_team->t.t_level);
7940  break;
7941  case 'n':
7942  rc = __kmp_str_buf_print(field_buffer, format, __kmp_tid_from_gtid(gtid));
7943  break;
7944  case 'H': {
7945  static const int BUFFER_SIZE = 256;
7946  char buf[BUFFER_SIZE];
7947  __kmp_expand_host_name(buf, BUFFER_SIZE);
7948  rc = __kmp_str_buf_print(field_buffer, format, buf);
7949  } break;
7950  case 'P':
7951  rc = __kmp_str_buf_print(field_buffer, format, getpid());
7952  break;
7953  case 'i':
7954  rc = __kmp_str_buf_print(field_buffer, format, __kmp_gettid());
7955  break;
7956  case 'N':
7957  rc = __kmp_str_buf_print(field_buffer, format, th->th.th_team->t.t_nproc);
7958  break;
7959  case 'a':
7960  field_value =
7961  __kmp_get_ancestor_thread_num(gtid, th->th.th_team->t.t_level - 1);
7962  rc = __kmp_str_buf_print(field_buffer, format, field_value);
7963  break;
7964 #if KMP_AFFINITY_SUPPORTED
7965  case 'A': {
7966  kmp_str_buf_t buf;
7967  __kmp_str_buf_init(&buf);
7968  __kmp_affinity_str_buf_mask(&buf, th->th.th_affin_mask);
7969  rc = __kmp_str_buf_print(field_buffer, format, buf.str);
7970  __kmp_str_buf_free(&buf);
7971  } break;
7972 #endif
7973  default:
7974  // According to spec, if an implementation does not have info for field
7975  // type, then "undefined" is printed
7976  rc = __kmp_str_buf_print(field_buffer, "%s", "undefined");
7977  // Skip the field
7978  if (parse_long_name) {
7979  SKIP_TOKEN(*ptr);
7980  if (**ptr == '}')
7981  (*ptr)++;
7982  } else {
7983  (*ptr)++;
7984  }
7985  }
7986 
7987  KMP_ASSERT(format_index <= FORMAT_SIZE);
7988  return rc;
7989 }
7990 
7991 /*
7992  * Return number of characters needed to hold the affinity string
7993  * (not including null byte character)
7994  * The resultant string is printed to buffer, which the caller can then
7995  * handle afterwards
7996 */
7997 size_t __kmp_aux_capture_affinity(int gtid, const char *format,
7998  kmp_str_buf_t *buffer) {
7999  const char *parse_ptr;
8000  size_t retval;
8001  const kmp_info_t *th;
8002  kmp_str_buf_t field;
8003 
8004  KMP_DEBUG_ASSERT(buffer);
8005  KMP_DEBUG_ASSERT(gtid >= 0);
8006 
8007  __kmp_str_buf_init(&field);
8008  __kmp_str_buf_clear(buffer);
8009 
8010  th = __kmp_threads[gtid];
8011  retval = 0;
8012 
8013  // If format is NULL or zero-length string, then we use
8014  // affinity-format-var ICV
8015  parse_ptr = format;
8016  if (parse_ptr == NULL || *parse_ptr == '\0') {
8017  parse_ptr = __kmp_affinity_format;
8018  }
8019  KMP_DEBUG_ASSERT(parse_ptr);
8020 
8021  while (*parse_ptr != '\0') {
8022  // Parse a field
8023  if (*parse_ptr == '%') {
8024  // Put field in the buffer
8025  int rc = __kmp_aux_capture_affinity_field(gtid, th, &parse_ptr, &field);
8026  __kmp_str_buf_catbuf(buffer, &field);
8027  retval += rc;
8028  } else {
8029  // Put literal character in buffer
8030  __kmp_str_buf_cat(buffer, parse_ptr, 1);
8031  retval++;
8032  parse_ptr++;
8033  }
8034  }
8035  __kmp_str_buf_free(&field);
8036  return retval;
8037 }
8038 
8039 // Displays the affinity string to stdout
8040 void __kmp_aux_display_affinity(int gtid, const char *format) {
8041  kmp_str_buf_t buf;
8042  __kmp_str_buf_init(&buf);
8043  __kmp_aux_capture_affinity(gtid, format, &buf);
8044  __kmp_fprintf(kmp_out, "%s" KMP_END_OF_LINE, buf.str);
8045  __kmp_str_buf_free(&buf);
8046 }
8047 
8048 /* ------------------------------------------------------------------------ */
8049 
8050 void __kmp_aux_set_blocktime(int arg, kmp_info_t *thread, int tid) {
8051  int blocktime = arg; /* argument is in milliseconds */
8052 #if KMP_USE_MONITOR
8053  int bt_intervals;
8054 #endif
8055  int bt_set;
8056 
8057  __kmp_save_internal_controls(thread);
8058 
8059  /* Normalize and set blocktime for the teams */
8060  if (blocktime < KMP_MIN_BLOCKTIME)
8061  blocktime = KMP_MIN_BLOCKTIME;
8062  else if (blocktime > KMP_MAX_BLOCKTIME)
8063  blocktime = KMP_MAX_BLOCKTIME;
8064 
8065  set__blocktime_team(thread->th.th_team, tid, blocktime);
8066  set__blocktime_team(thread->th.th_serial_team, 0, blocktime);
8067 
8068 #if KMP_USE_MONITOR
8069  /* Calculate and set blocktime intervals for the teams */
8070  bt_intervals = KMP_INTERVALS_FROM_BLOCKTIME(blocktime, __kmp_monitor_wakeups);
8071 
8072  set__bt_intervals_team(thread->th.th_team, tid, bt_intervals);
8073  set__bt_intervals_team(thread->th.th_serial_team, 0, bt_intervals);
8074 #endif
8075 
8076  /* Set whether blocktime has been set to "TRUE" */
8077  bt_set = TRUE;
8078 
8079  set__bt_set_team(thread->th.th_team, tid, bt_set);
8080  set__bt_set_team(thread->th.th_serial_team, 0, bt_set);
8081 #if KMP_USE_MONITOR
8082  KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d, "
8083  "bt_intervals=%d, monitor_updates=%d\n",
8084  __kmp_gtid_from_tid(tid, thread->th.th_team),
8085  thread->th.th_team->t.t_id, tid, blocktime, bt_intervals,
8086  __kmp_monitor_wakeups));
8087 #else
8088  KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d\n",
8089  __kmp_gtid_from_tid(tid, thread->th.th_team),
8090  thread->th.th_team->t.t_id, tid, blocktime));
8091 #endif
8092 }
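// --- Illustrative example (not part of kmp_runtime.cpp) ---------------------
// Blocktime is usually set from user code with the kmp_set_blocktime()
// extension declared in this runtime's omp.h (or with KMP_BLOCKTIME in the
// environment); the value is normalized and applied to the current and serial
// teams by __kmp_aux_set_blocktime() above. Sketch:
#if 0 // example only
#include <omp.h>

int main() {
  kmp_set_blocktime(0); // 0 ms: workers sleep right away instead of spinning
#pragma omp parallel
  { /* ... */ }
  return 0;
}
#endif // example only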
8093 
8094 void __kmp_aux_set_defaults(char const *str, int len) {
8095  if (!__kmp_init_serial) {
8096  __kmp_serial_initialize();
8097  }
8098  __kmp_env_initialize(str);
8099 
8100  if (__kmp_settings || __kmp_display_env || __kmp_display_env_verbose) {
8101  __kmp_env_print();
8102  }
8103 } // __kmp_aux_set_defaults
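// --- Illustrative example (not part of kmp_runtime.cpp) ---------------------
// __kmp_aux_set_defaults() backs the kmp_set_defaults() extension, which takes
// environment-variable style settings as a string. Sketch (the exact string
// below is an assumption about accepted syntax, not taken from this file):
#if 0 // example only
#include <omp.h>

int main() {
  kmp_set_defaults("KMP_BLOCKTIME=0"); // call before other OpenMP activity
#pragma omp parallel
  { /* ... */ }
  return 0;
}
#endif // example only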
8104 
8105 /* ------------------------------------------------------------------------ */
8106 /* internal fast reduction routines */
8107 
8108 PACKED_REDUCTION_METHOD_T
8109 __kmp_determine_reduction_method(
8110  ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars, size_t reduce_size,
8111  void *reduce_data, void (*reduce_func)(void *lhs_data, void *rhs_data),
8112  kmp_critical_name *lck) {
8113 
8114  // Default reduction method: critical construct ( lck != NULL, like in current
8115  // PAROPT )
8116  // If ( reduce_data!=NULL && reduce_func!=NULL ): the tree-reduction method
8117  // can be selected by RTL
8118  // If loc->flags contains KMP_IDENT_ATOMIC_REDUCE, the atomic reduce method
8119  // can be selected by RTL
8120  // Finally, it's up to OpenMP RTL to make a decision on which method to select
8121  // among generated by PAROPT.
8122 
8123  PACKED_REDUCTION_METHOD_T retval;
8124 
8125  int team_size;
8126 
8127  KMP_DEBUG_ASSERT(loc); // it would be nice to test ( loc != 0 )
8128  KMP_DEBUG_ASSERT(lck); // it would be nice to test ( lck != 0 )
8129 
8130 #define FAST_REDUCTION_ATOMIC_METHOD_GENERATED \
8131  ((loc->flags & (KMP_IDENT_ATOMIC_REDUCE)) == (KMP_IDENT_ATOMIC_REDUCE))
8132 #define FAST_REDUCTION_TREE_METHOD_GENERATED ((reduce_data) && (reduce_func))
8133 
8134  retval = critical_reduce_block;
8135 
8136  // another choice of getting a team size (with 1 dynamic dereference) is slower
8137  team_size = __kmp_get_team_num_threads(global_tid);
8138  if (team_size == 1) {
8139 
8140  retval = empty_reduce_block;
8141 
8142  } else {
8143 
8144  int atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
8145 
8146 #if KMP_ARCH_X86_64 || KMP_ARCH_PPC64 || KMP_ARCH_AARCH64 || \
8147  KMP_ARCH_MIPS64 || KMP_ARCH_RISCV64
8148 
8149 #if KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD || \
8150  KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HURD
8151 
8152  int teamsize_cutoff = 4;
8153 
8154 #if KMP_MIC_SUPPORTED
8155  if (__kmp_mic_type != non_mic) {
8156  teamsize_cutoff = 8;
8157  }
8158 #endif
8159  int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
8160  if (tree_available) {
8161  if (team_size <= teamsize_cutoff) {
8162  if (atomic_available) {
8163  retval = atomic_reduce_block;
8164  }
8165  } else {
8166  retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
8167  }
8168  } else if (atomic_available) {
8169  retval = atomic_reduce_block;
8170  }
8171 #else
8172 #error "Unknown or unsupported OS"
8173 #endif // KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD ||
8174  // KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HURD
8175 
8176 #elif KMP_ARCH_X86 || KMP_ARCH_ARM || KMP_ARCH_AARCH || KMP_ARCH_MIPS
8177 
8178 #if KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_WINDOWS || KMP_OS_HURD
8179 
8180  // basic tuning
8181 
8182  if (atomic_available) {
8183  if (num_vars <= 2) { // && ( team_size <= 8 ) due to false-sharing ???
8184  retval = atomic_reduce_block;
8185  }
8186  } // otherwise: use critical section
8187 
8188 #elif KMP_OS_DARWIN
8189 
8190  int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
8191  if (atomic_available && (num_vars <= 3)) {
8192  retval = atomic_reduce_block;
8193  } else if (tree_available) {
8194  if ((reduce_size > (9 * sizeof(kmp_real64))) &&
8195  (reduce_size < (2000 * sizeof(kmp_real64)))) {
8196  retval = TREE_REDUCE_BLOCK_WITH_PLAIN_BARRIER;
8197  }
8198  } // otherwise: use critical section
8199 
8200 #else
8201 #error "Unknown or unsupported OS"
8202 #endif
8203 
8204 #else
8205 #error "Unknown or unsupported architecture"
8206 #endif
8207  }
8208 
8209  // KMP_FORCE_REDUCTION
8210 
8211  // If the team is serialized (team_size == 1), ignore the forced reduction
8212  // method and stay with the unsynchronized method (empty_reduce_block)
8213  if (__kmp_force_reduction_method != reduction_method_not_defined &&
8214  team_size != 1) {
8215 
8216  PACKED_REDUCTION_METHOD_T forced_retval = critical_reduce_block;
8217 
8218  int atomic_available, tree_available;
8219 
8220  switch ((forced_retval = __kmp_force_reduction_method)) {
8221  case critical_reduce_block:
8222  KMP_ASSERT(lck); // lck should be != 0
8223  break;
8224 
8225  case atomic_reduce_block:
8226  atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
8227  if (!atomic_available) {
8228  KMP_WARNING(RedMethodNotSupported, "atomic");
8229  forced_retval = critical_reduce_block;
8230  }
8231  break;
8232 
8233  case tree_reduce_block:
8234  tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
8235  if (!tree_available) {
8236  KMP_WARNING(RedMethodNotSupported, "tree");
8237  forced_retval = critical_reduce_block;
8238  } else {
8239 #if KMP_FAST_REDUCTION_BARRIER
8240  forced_retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
8241 #endif
8242  }
8243  break;
8244 
8245  default:
8246  KMP_ASSERT(0); // "unsupported method specified"
8247  }
8248 
8249  retval = forced_retval;
8250  }
8251 
8252  KA_TRACE(10, ("reduction method selected=%08x\n", retval));
8253 
8254 #undef FAST_REDUCTION_TREE_METHOD_GENERATED
8255 #undef FAST_REDUCTION_ATOMIC_METHOD_GENERATED
8256 
8257  return (retval);
8258 }
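// --- Illustrative example (not part of kmp_runtime.cpp) ---------------------
// From user code the choice above is invisible: a reduction clause makes the
// compiler pass reduce_data/reduce_func (and possibly the
// KMP_IDENT_ATOMIC_REDUCE flag) into __kmpc_reduce*(), and
// __kmp_determine_reduction_method() then picks critical, atomic or tree
// reduction from team size, variable count and platform. Sketch:
#if 0 // example only
#include <stdio.h>

int main() {
  double sum = 0.0;
#pragma omp parallel for reduction(+ : sum)
  for (int i = 0; i < 1000; ++i)
    sum += i * 0.5;
  printf("%f\n", sum);
  return 0;
}
#endif // example only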
8259 // this function is for testing set/get/determine reduce method
8260 kmp_int32 __kmp_get_reduce_method(void) {
8261  return ((__kmp_entry_thread()->th.th_local.packed_reduction_method) >> 8);
8262 }
8263 
8264 // Soft pause sets up threads to ignore blocktime and just go to sleep.
8265 // Spin-wait code checks __kmp_pause_status and reacts accordingly.
8266 void __kmp_soft_pause() { __kmp_pause_status = kmp_soft_paused; }
8267 
8268 // Hard pause shuts down the runtime completely. Resume happens naturally when
8269 // OpenMP is used subsequently.
8270 void __kmp_hard_pause() {
8271  __kmp_pause_status = kmp_hard_paused;
8272  __kmp_internal_end_thread(-1);
8273 }
8274 
8275 // Soft resume sets __kmp_pause_status, and wakes up all threads.
8276 void __kmp_resume_if_soft_paused() {
8277  if (__kmp_pause_status == kmp_soft_paused) {
8278  __kmp_pause_status = kmp_not_paused;
8279 
8280  for (int gtid = 1; gtid < __kmp_threads_capacity; ++gtid) {
8281  kmp_info_t *thread = __kmp_threads[gtid];
8282  if (thread) { // Wake it if sleeping
8283  kmp_flag_64 fl(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go, thread);
8284  if (fl.is_sleeping())
8285  fl.resume(gtid);
8286  else if (__kmp_try_suspend_mx(thread)) { // got suspend lock
8287  __kmp_unlock_suspend_mx(thread); // unlock it; it won't sleep
8288  } else { // thread holds the lock and may sleep soon
8289  do { // until either the thread sleeps, or we can get the lock
8290  if (fl.is_sleeping()) {
8291  fl.resume(gtid);
8292  break;
8293  } else if (__kmp_try_suspend_mx(thread)) {
8294  __kmp_unlock_suspend_mx(thread);
8295  break;
8296  }
8297  } while (1);
8298  }
8299  }
8300  }
8301  }
8302 }
8303 
8304 // This function is called via __kmpc_pause_resource. Returns 0 if successful.
8305 // TODO: add warning messages
8306 int __kmp_pause_resource(kmp_pause_status_t level) {
8307  if (level == kmp_not_paused) { // requesting resume
8308  if (__kmp_pause_status == kmp_not_paused) {
8309  // error message about runtime not being paused, so can't resume
8310  return 1;
8311  } else {
8312  KMP_DEBUG_ASSERT(__kmp_pause_status == kmp_soft_paused ||
8313  __kmp_pause_status == kmp_hard_paused);
8314  __kmp_pause_status = kmp_not_paused;
8315  return 0;
8316  }
8317  } else if (level == kmp_soft_paused) { // requesting soft pause
8318  if (__kmp_pause_status != kmp_not_paused) {
8319  // error message about already being paused
8320  return 1;
8321  } else {
8322  __kmp_soft_pause();
8323  return 0;
8324  }
8325  } else if (level == kmp_hard_paused) { // requesting hard pause
8326  if (__kmp_pause_status != kmp_not_paused) {
8327  // error message about already being paused
8328  return 1;
8329  } else {
8330  __kmp_hard_pause();
8331  return 0;
8332  }
8333  } else {
8334  // error message about invalid level
8335  return 1;
8336  }
8337 }
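// --- Illustrative example (not part of kmp_runtime.cpp) ---------------------
// The pause levels map onto the standard OpenMP 5.0 pause API; a soft pause
// makes spin-waiting workers go to sleep (see __kmp_soft_pause() above), and
// they are woken again on the next use of the runtime. Sketch:
#if 0 // example only
#include <omp.h>

int main() {
#pragma omp parallel
  { /* ... */ }
  omp_pause_resource_all(omp_pause_soft); // release resources until next use
#pragma omp parallel
  { /* ... */ }
  return 0;
}
#endif // example only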
8338 
8339 
8340 void __kmp_omp_display_env(int verbose) {
8341  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
8342  if (__kmp_init_serial == 0)
8343  __kmp_do_serial_initialize();
8344  __kmp_display_env_impl(!verbose, verbose);
8345  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
8346 }