1 /* 2 * kmp_runtime.cpp -- KPTS runtime support library 3 */ 4 5 //===----------------------------------------------------------------------===// 6 // 7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 8 // See https://llvm.org/LICENSE.txt for license information. 9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 10 // 11 //===----------------------------------------------------------------------===// 12 13 #include "kmp.h" 14 #include "kmp_affinity.h" 15 #include "kmp_atomic.h" 16 #include "kmp_environment.h" 17 #include "kmp_error.h" 18 #include "kmp_i18n.h" 19 #include "kmp_io.h" 20 #include "kmp_itt.h" 21 #include "kmp_settings.h" 22 #include "kmp_stats.h" 23 #include "kmp_str.h" 24 #include "kmp_wait_release.h" 25 #include "kmp_wrapper_getpid.h" 26 #include "kmp_dispatch.h" 27 #if KMP_USE_HIER_SCHED 28 #include "kmp_dispatch_hier.h" 29 #endif 30 31 #if OMPT_SUPPORT 32 #include "ompt-specific.h" 33 #endif 34 #if OMPD_SUPPORT 35 #include "ompd-specific.h" 36 #endif 37 38 #if OMP_PROFILING_SUPPORT 39 #include "llvm/Support/TimeProfiler.h" 40 static char *ProfileTraceFile = nullptr; 41 #endif 42 43 /* these are temporary issues to be dealt with */ 44 #define KMP_USE_PRCTL 0 45 46 #if KMP_OS_WINDOWS 47 #include <process.h> 48 #endif 49 50 #if KMP_OS_WINDOWS 51 // windows does not need include files as it doesn't use shared memory 52 #else 53 #include <sys/mman.h> 54 #include <sys/stat.h> 55 #include <fcntl.h> 56 #define SHM_SIZE 1024 57 #endif 58 59 #if defined(KMP_GOMP_COMPAT) 60 char const __kmp_version_alt_comp[] = 61 KMP_VERSION_PREFIX "alternative compiler support: yes"; 62 #endif /* defined(KMP_GOMP_COMPAT) */ 63 64 char const __kmp_version_omp_api[] = 65 KMP_VERSION_PREFIX "API version: 5.0 (201611)"; 66 67 #ifdef KMP_DEBUG 68 char const __kmp_version_lock[] = 69 KMP_VERSION_PREFIX "lock type: run time selectable"; 70 #endif /* KMP_DEBUG */ 71 72 #define KMP_MIN(x, y) ((x) < (y) ? (x) : (y)) 73 74 /* ------------------------------------------------------------------------ */ 75 76 #if KMP_USE_MONITOR 77 kmp_info_t __kmp_monitor; 78 #endif 79 80 /* Forward declarations */ 81 82 void __kmp_cleanup(void); 83 84 static void __kmp_initialize_info(kmp_info_t *, kmp_team_t *, int tid, 85 int gtid); 86 static void __kmp_initialize_team(kmp_team_t *team, int new_nproc, 87 kmp_internal_control_t *new_icvs, 88 ident_t *loc); 89 #if KMP_AFFINITY_SUPPORTED 90 static void __kmp_partition_places(kmp_team_t *team, 91 int update_master_only = 0); 92 #endif 93 static void __kmp_do_serial_initialize(void); 94 void __kmp_fork_barrier(int gtid, int tid); 95 void __kmp_join_barrier(int gtid); 96 void __kmp_setup_icv_copy(kmp_team_t *team, int new_nproc, 97 kmp_internal_control_t *new_icvs, ident_t *loc); 98 99 #ifdef USE_LOAD_BALANCE 100 static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc); 101 #endif 102 103 static int __kmp_expand_threads(int nNeed); 104 #if KMP_OS_WINDOWS 105 static int __kmp_unregister_root_other_thread(int gtid); 106 #endif 107 static void __kmp_reap_thread(kmp_info_t *thread, int is_root); 108 kmp_info_t *__kmp_thread_pool_insert_pt = NULL; 109 110 void __kmp_resize_dist_barrier(kmp_team_t *team, int old_nthreads, 111 int new_nthreads); 112 void __kmp_add_threads_to_team(kmp_team_t *team, int new_nthreads); 113 114 /* Calculate the identifier of the current thread */ 115 /* fast (and somewhat portable) way to get unique identifier of executing 116 thread. Returns KMP_GTID_DNE if we haven't been assigned a gtid. 
*/ 117 int __kmp_get_global_thread_id() { 118 int i; 119 kmp_info_t **other_threads; 120 size_t stack_data; 121 char *stack_addr; 122 size_t stack_size; 123 char *stack_base; 124 125 KA_TRACE( 126 1000, 127 ("*** __kmp_get_global_thread_id: entering, nproc=%d all_nproc=%d\n", 128 __kmp_nth, __kmp_all_nth)); 129 130 /* JPH - to handle the case where __kmpc_end(0) is called immediately prior to 131 a parallel region, made it return KMP_GTID_DNE to force serial_initialize 132 by caller. Had to handle KMP_GTID_DNE at all call-sites, or else guarantee 133 __kmp_init_gtid for this to work. */ 134 135 if (!TCR_4(__kmp_init_gtid)) 136 return KMP_GTID_DNE; 137 138 #ifdef KMP_TDATA_GTID 139 if (TCR_4(__kmp_gtid_mode) >= 3) { 140 KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using TDATA\n")); 141 return __kmp_gtid; 142 } 143 #endif 144 if (TCR_4(__kmp_gtid_mode) >= 2) { 145 KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using keyed TLS\n")); 146 return __kmp_gtid_get_specific(); 147 } 148 KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using internal alg.\n")); 149 150 stack_addr = (char *)&stack_data; 151 other_threads = __kmp_threads; 152 153 /* ATT: The code below is a source of potential bugs due to unsynchronized 154 access to __kmp_threads array. For example: 155 1. Current thread loads other_threads[i] to thr and checks it, it is 156 non-NULL. 157 2. Current thread is suspended by OS. 158 3. Another thread unregisters and finishes (debug versions of free() 159 may fill memory with something like 0xEF). 160 4. Current thread is resumed. 161 5. Current thread reads junk from *thr. 162 TODO: Fix it. --ln */ 163 164 for (i = 0; i < __kmp_threads_capacity; i++) { 165 166 kmp_info_t *thr = (kmp_info_t *)TCR_SYNC_PTR(other_threads[i]); 167 if (!thr) 168 continue; 169 170 stack_size = (size_t)TCR_PTR(thr->th.th_info.ds.ds_stacksize); 171 stack_base = (char *)TCR_PTR(thr->th.th_info.ds.ds_stackbase); 172 173 /* stack grows down -- search through all of the active threads */ 174 175 if (stack_addr <= stack_base) { 176 size_t stack_diff = stack_base - stack_addr; 177 178 if (stack_diff <= stack_size) { 179 /* The only way we can be closer than the allocated */ 180 /* stack size is if we are running on this thread. */ 181 KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == i); 182 return i; 183 } 184 } 185 } 186 187 /* get specific to try and determine our gtid */ 188 KA_TRACE(1000, 189 ("*** __kmp_get_global_thread_id: internal alg. 
failed to find " 190 "thread, using TLS\n")); 191 i = __kmp_gtid_get_specific(); 192 193 /*fprintf( stderr, "=== %d\n", i ); */ /* GROO */ 194 195 /* if we havn't been assigned a gtid, then return code */ 196 if (i < 0) 197 return i; 198 199 /* dynamically updated stack window for uber threads to avoid get_specific 200 call */ 201 if (!TCR_4(other_threads[i]->th.th_info.ds.ds_stackgrow)) { 202 KMP_FATAL(StackOverflow, i); 203 } 204 205 stack_base = (char *)other_threads[i]->th.th_info.ds.ds_stackbase; 206 if (stack_addr > stack_base) { 207 TCW_PTR(other_threads[i]->th.th_info.ds.ds_stackbase, stack_addr); 208 TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize, 209 other_threads[i]->th.th_info.ds.ds_stacksize + stack_addr - 210 stack_base); 211 } else { 212 TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize, 213 stack_base - stack_addr); 214 } 215 216 /* Reprint stack bounds for ubermaster since they have been refined */ 217 if (__kmp_storage_map) { 218 char *stack_end = (char *)other_threads[i]->th.th_info.ds.ds_stackbase; 219 char *stack_beg = stack_end - other_threads[i]->th.th_info.ds.ds_stacksize; 220 __kmp_print_storage_map_gtid(i, stack_beg, stack_end, 221 other_threads[i]->th.th_info.ds.ds_stacksize, 222 "th_%d stack (refinement)", i); 223 } 224 return i; 225 } 226 227 int __kmp_get_global_thread_id_reg() { 228 int gtid; 229 230 if (!__kmp_init_serial) { 231 gtid = KMP_GTID_DNE; 232 } else 233 #ifdef KMP_TDATA_GTID 234 if (TCR_4(__kmp_gtid_mode) >= 3) { 235 KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using TDATA\n")); 236 gtid = __kmp_gtid; 237 } else 238 #endif 239 if (TCR_4(__kmp_gtid_mode) >= 2) { 240 KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using keyed TLS\n")); 241 gtid = __kmp_gtid_get_specific(); 242 } else { 243 KA_TRACE(1000, 244 ("*** __kmp_get_global_thread_id_reg: using internal alg.\n")); 245 gtid = __kmp_get_global_thread_id(); 246 } 247 248 /* we must be a new uber master sibling thread */ 249 if (gtid == KMP_GTID_DNE) { 250 KA_TRACE(10, 251 ("__kmp_get_global_thread_id_reg: Encountered new root thread. " 252 "Registering a new gtid.\n")); 253 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock); 254 if (!__kmp_init_serial) { 255 __kmp_do_serial_initialize(); 256 gtid = __kmp_gtid_get_specific(); 257 } else { 258 gtid = __kmp_register_root(FALSE); 259 } 260 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 261 /*__kmp_printf( "+++ %d\n", gtid ); */ /* GROO */ 262 } 263 264 KMP_DEBUG_ASSERT(gtid >= 0); 265 266 return gtid; 267 } 268 269 /* caller must hold forkjoin_lock */ 270 void __kmp_check_stack_overlap(kmp_info_t *th) { 271 int f; 272 char *stack_beg = NULL; 273 char *stack_end = NULL; 274 int gtid; 275 276 KA_TRACE(10, ("__kmp_check_stack_overlap: called\n")); 277 if (__kmp_storage_map) { 278 stack_end = (char *)th->th.th_info.ds.ds_stackbase; 279 stack_beg = stack_end - th->th.th_info.ds.ds_stacksize; 280 281 gtid = __kmp_gtid_from_thread(th); 282 283 if (gtid == KMP_GTID_MONITOR) { 284 __kmp_print_storage_map_gtid( 285 gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize, 286 "th_%s stack (%s)", "mon", 287 (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual"); 288 } else { 289 __kmp_print_storage_map_gtid( 290 gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize, 291 "th_%d stack (%s)", gtid, 292 (th->th.th_info.ds.ds_stackgrow) ? 
"initial" : "actual"); 293 } 294 } 295 296 /* No point in checking ubermaster threads since they use refinement and 297 * cannot overlap */ 298 gtid = __kmp_gtid_from_thread(th); 299 if (__kmp_env_checks == TRUE && !KMP_UBER_GTID(gtid)) { 300 KA_TRACE(10, 301 ("__kmp_check_stack_overlap: performing extensive checking\n")); 302 if (stack_beg == NULL) { 303 stack_end = (char *)th->th.th_info.ds.ds_stackbase; 304 stack_beg = stack_end - th->th.th_info.ds.ds_stacksize; 305 } 306 307 for (f = 0; f < __kmp_threads_capacity; f++) { 308 kmp_info_t *f_th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[f]); 309 310 if (f_th && f_th != th) { 311 char *other_stack_end = 312 (char *)TCR_PTR(f_th->th.th_info.ds.ds_stackbase); 313 char *other_stack_beg = 314 other_stack_end - (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize); 315 if ((stack_beg > other_stack_beg && stack_beg < other_stack_end) || 316 (stack_end > other_stack_beg && stack_end < other_stack_end)) { 317 318 /* Print the other stack values before the abort */ 319 if (__kmp_storage_map) 320 __kmp_print_storage_map_gtid( 321 -1, other_stack_beg, other_stack_end, 322 (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize), 323 "th_%d stack (overlapped)", __kmp_gtid_from_thread(f_th)); 324 325 __kmp_fatal(KMP_MSG(StackOverlap), KMP_HNT(ChangeStackLimit), 326 __kmp_msg_null); 327 } 328 } 329 } 330 } 331 KA_TRACE(10, ("__kmp_check_stack_overlap: returning\n")); 332 } 333 334 /* ------------------------------------------------------------------------ */ 335 336 void __kmp_infinite_loop(void) { 337 static int done = FALSE; 338 339 while (!done) { 340 KMP_YIELD(TRUE); 341 } 342 } 343 344 #define MAX_MESSAGE 512 345 346 void __kmp_print_storage_map_gtid(int gtid, void *p1, void *p2, size_t size, 347 char const *format, ...) { 348 char buffer[MAX_MESSAGE]; 349 va_list ap; 350 351 va_start(ap, format); 352 KMP_SNPRINTF(buffer, sizeof(buffer), "OMP storage map: %p %p%8lu %s\n", p1, 353 p2, (unsigned long)size, format); 354 __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock); 355 __kmp_vprintf(kmp_err, buffer, ap); 356 #if KMP_PRINT_DATA_PLACEMENT 357 int node; 358 if (gtid >= 0) { 359 if (p1 <= p2 && (char *)p2 - (char *)p1 == size) { 360 if (__kmp_storage_map_verbose) { 361 node = __kmp_get_host_node(p1); 362 if (node < 0) /* doesn't work, so don't try this next time */ 363 __kmp_storage_map_verbose = FALSE; 364 else { 365 char *last; 366 int lastNode; 367 int localProc = __kmp_get_cpu_from_gtid(gtid); 368 369 const int page_size = KMP_GET_PAGE_SIZE(); 370 371 p1 = (void *)((size_t)p1 & ~((size_t)page_size - 1)); 372 p2 = (void *)(((size_t)p2 - 1) & ~((size_t)page_size - 1)); 373 if (localProc >= 0) 374 __kmp_printf_no_lock(" GTID %d localNode %d\n", gtid, 375 localProc >> 1); 376 else 377 __kmp_printf_no_lock(" GTID %d\n", gtid); 378 #if KMP_USE_PRCTL 379 /* The more elaborate format is disabled for now because of the prctl 380 * hanging bug. */ 381 do { 382 last = p1; 383 lastNode = node; 384 /* This loop collates adjacent pages with the same host node. 
*/ 385 do { 386 (char *)p1 += page_size; 387 } while (p1 <= p2 && (node = __kmp_get_host_node(p1)) == lastNode); 388 __kmp_printf_no_lock(" %p-%p memNode %d\n", last, (char *)p1 - 1, 389 lastNode); 390 } while (p1 <= p2); 391 #else 392 __kmp_printf_no_lock(" %p-%p memNode %d\n", p1, 393 (char *)p1 + (page_size - 1), 394 __kmp_get_host_node(p1)); 395 if (p1 < p2) { 396 __kmp_printf_no_lock(" %p-%p memNode %d\n", p2, 397 (char *)p2 + (page_size - 1), 398 __kmp_get_host_node(p2)); 399 } 400 #endif 401 } 402 } 403 } else 404 __kmp_printf_no_lock(" %s\n", KMP_I18N_STR(StorageMapWarning)); 405 } 406 #endif /* KMP_PRINT_DATA_PLACEMENT */ 407 __kmp_release_bootstrap_lock(&__kmp_stdio_lock); 408 } 409 410 void __kmp_warn(char const *format, ...) { 411 char buffer[MAX_MESSAGE]; 412 va_list ap; 413 414 if (__kmp_generate_warnings == kmp_warnings_off) { 415 return; 416 } 417 418 va_start(ap, format); 419 420 KMP_SNPRINTF(buffer, sizeof(buffer), "OMP warning: %s\n", format); 421 __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock); 422 __kmp_vprintf(kmp_err, buffer, ap); 423 __kmp_release_bootstrap_lock(&__kmp_stdio_lock); 424 425 va_end(ap); 426 } 427 428 void __kmp_abort_process() { 429 // Later threads may stall here, but that's ok because abort() will kill them. 430 __kmp_acquire_bootstrap_lock(&__kmp_exit_lock); 431 432 if (__kmp_debug_buf) { 433 __kmp_dump_debug_buffer(); 434 } 435 436 if (KMP_OS_WINDOWS) { 437 // Let other threads know of abnormal termination and prevent deadlock 438 // if abort happened during library initialization or shutdown 439 __kmp_global.g.g_abort = SIGABRT; 440 441 /* On Windows* OS by default abort() causes pop-up error box, which stalls 442 nightly testing. Unfortunately, we cannot reliably suppress pop-up error 443 boxes. _set_abort_behavior() works well, but this function is not 444 available in VS7 (this is not problem for DLL, but it is a problem for 445 static OpenMP RTL). SetErrorMode (and so, timelimit utility) does not 446 help, at least in some versions of MS C RTL. 447 448 It seems following sequence is the only way to simulate abort() and 449 avoid pop-up error box. */ 450 raise(SIGABRT); 451 _exit(3); // Just in case, if signal ignored, exit anyway. 452 } else { 453 __kmp_unregister_library(); 454 abort(); 455 } 456 457 __kmp_infinite_loop(); 458 __kmp_release_bootstrap_lock(&__kmp_exit_lock); 459 460 } // __kmp_abort_process 461 462 void __kmp_abort_thread(void) { 463 // TODO: Eliminate g_abort global variable and this function. 464 // In case of abort just call abort(), it will kill all the threads. 465 __kmp_infinite_loop(); 466 } // __kmp_abort_thread 467 468 /* Print out the storage map for the major kmp_info_t thread data structures 469 that are allocated together. 
*/ 470 471 static void __kmp_print_thread_storage_map(kmp_info_t *thr, int gtid) { 472 __kmp_print_storage_map_gtid(gtid, thr, thr + 1, sizeof(kmp_info_t), "th_%d", 473 gtid); 474 475 __kmp_print_storage_map_gtid(gtid, &thr->th.th_info, &thr->th.th_team, 476 sizeof(kmp_desc_t), "th_%d.th_info", gtid); 477 478 __kmp_print_storage_map_gtid(gtid, &thr->th.th_local, &thr->th.th_pri_head, 479 sizeof(kmp_local_t), "th_%d.th_local", gtid); 480 481 __kmp_print_storage_map_gtid( 482 gtid, &thr->th.th_bar[0], &thr->th.th_bar[bs_last_barrier], 483 sizeof(kmp_balign_t) * bs_last_barrier, "th_%d.th_bar", gtid); 484 485 __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_plain_barrier], 486 &thr->th.th_bar[bs_plain_barrier + 1], 487 sizeof(kmp_balign_t), "th_%d.th_bar[plain]", 488 gtid); 489 490 __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_forkjoin_barrier], 491 &thr->th.th_bar[bs_forkjoin_barrier + 1], 492 sizeof(kmp_balign_t), "th_%d.th_bar[forkjoin]", 493 gtid); 494 495 #if KMP_FAST_REDUCTION_BARRIER 496 __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_reduction_barrier], 497 &thr->th.th_bar[bs_reduction_barrier + 1], 498 sizeof(kmp_balign_t), "th_%d.th_bar[reduction]", 499 gtid); 500 #endif // KMP_FAST_REDUCTION_BARRIER 501 } 502 503 /* Print out the storage map for the major kmp_team_t team data structures 504 that are allocated together. */ 505 506 static void __kmp_print_team_storage_map(const char *header, kmp_team_t *team, 507 int team_id, int num_thr) { 508 int num_disp_buff = team->t.t_max_nproc > 1 ? __kmp_dispatch_num_buffers : 2; 509 __kmp_print_storage_map_gtid(-1, team, team + 1, sizeof(kmp_team_t), "%s_%d", 510 header, team_id); 511 512 __kmp_print_storage_map_gtid(-1, &team->t.t_bar[0], 513 &team->t.t_bar[bs_last_barrier], 514 sizeof(kmp_balign_team_t) * bs_last_barrier, 515 "%s_%d.t_bar", header, team_id); 516 517 __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_plain_barrier], 518 &team->t.t_bar[bs_plain_barrier + 1], 519 sizeof(kmp_balign_team_t), "%s_%d.t_bar[plain]", 520 header, team_id); 521 522 __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_forkjoin_barrier], 523 &team->t.t_bar[bs_forkjoin_barrier + 1], 524 sizeof(kmp_balign_team_t), 525 "%s_%d.t_bar[forkjoin]", header, team_id); 526 527 #if KMP_FAST_REDUCTION_BARRIER 528 __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_reduction_barrier], 529 &team->t.t_bar[bs_reduction_barrier + 1], 530 sizeof(kmp_balign_team_t), 531 "%s_%d.t_bar[reduction]", header, team_id); 532 #endif // KMP_FAST_REDUCTION_BARRIER 533 534 __kmp_print_storage_map_gtid( 535 -1, &team->t.t_dispatch[0], &team->t.t_dispatch[num_thr], 536 sizeof(kmp_disp_t) * num_thr, "%s_%d.t_dispatch", header, team_id); 537 538 __kmp_print_storage_map_gtid( 539 -1, &team->t.t_threads[0], &team->t.t_threads[num_thr], 540 sizeof(kmp_info_t *) * num_thr, "%s_%d.t_threads", header, team_id); 541 542 __kmp_print_storage_map_gtid(-1, &team->t.t_disp_buffer[0], 543 &team->t.t_disp_buffer[num_disp_buff], 544 sizeof(dispatch_shared_info_t) * num_disp_buff, 545 "%s_%d.t_disp_buffer", header, team_id); 546 } 547 548 static void __kmp_init_allocator() { 549 __kmp_init_memkind(); 550 __kmp_init_target_mem(); 551 } 552 static void __kmp_fini_allocator() { __kmp_fini_memkind(); } 553 554 /* ------------------------------------------------------------------------ */ 555 556 #if KMP_DYNAMIC_LIB 557 #if KMP_OS_WINDOWS 558 559 BOOL WINAPI DllMain(HINSTANCE hInstDLL, DWORD fdwReason, LPVOID lpReserved) { 560 //__kmp_acquire_bootstrap_lock( &__kmp_initz_lock ); 561 562 
switch (fdwReason) { 563 564 case DLL_PROCESS_ATTACH: 565 KA_TRACE(10, ("DllMain: PROCESS_ATTACH\n")); 566 567 return TRUE; 568 569 case DLL_PROCESS_DETACH: 570 KA_TRACE(10, ("DllMain: PROCESS_DETACH T#%d\n", __kmp_gtid_get_specific())); 571 572 // According to Windows* documentation for DllMain entry point: 573 // for DLL_PROCESS_DETACH, lpReserved is used for telling the difference: 574 // lpReserved == NULL when FreeLibrary() is called, 575 // lpReserved != NULL when the process is terminated. 576 // When FreeLibrary() is called, worker threads remain alive. So the 577 // runtime's state is consistent and executing proper shutdown is OK. 578 // When the process is terminated, worker threads have exited or been 579 // forcefully terminated by the OS and only the shutdown thread remains. 580 // This can leave the runtime in an inconsistent state. 581 // Hence, only attempt proper cleanup when FreeLibrary() is called. 582 // Otherwise, rely on OS to reclaim resources. 583 if (lpReserved == NULL) 584 __kmp_internal_end_library(__kmp_gtid_get_specific()); 585 586 return TRUE; 587 588 case DLL_THREAD_ATTACH: 589 KA_TRACE(10, ("DllMain: THREAD_ATTACH\n")); 590 591 /* if we want to register new siblings all the time here call 592 * __kmp_get_gtid(); */ 593 return TRUE; 594 595 case DLL_THREAD_DETACH: 596 KA_TRACE(10, ("DllMain: THREAD_DETACH T#%d\n", __kmp_gtid_get_specific())); 597 598 __kmp_internal_end_thread(__kmp_gtid_get_specific()); 599 return TRUE; 600 } 601 602 return TRUE; 603 } 604 605 #endif /* KMP_OS_WINDOWS */ 606 #endif /* KMP_DYNAMIC_LIB */ 607 608 /* __kmp_parallel_deo -- Wait until it's our turn. */ 609 void __kmp_parallel_deo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) { 610 int gtid = *gtid_ref; 611 #ifdef BUILD_PARALLEL_ORDERED 612 kmp_team_t *team = __kmp_team_from_gtid(gtid); 613 #endif /* BUILD_PARALLEL_ORDERED */ 614 615 if (__kmp_env_consistency_check) { 616 if (__kmp_threads[gtid]->th.th_root->r.r_active) 617 #if KMP_USE_DYNAMIC_LOCK 618 __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL, 0); 619 #else 620 __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL); 621 #endif 622 } 623 #ifdef BUILD_PARALLEL_ORDERED 624 if (!team->t.t_serialized) { 625 KMP_MB(); 626 KMP_WAIT(&team->t.t_ordered.dt.t_value, __kmp_tid_from_gtid(gtid), KMP_EQ, 627 NULL); 628 KMP_MB(); 629 } 630 #endif /* BUILD_PARALLEL_ORDERED */ 631 } 632 633 /* __kmp_parallel_dxo -- Signal the next task. */ 634 void __kmp_parallel_dxo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) { 635 int gtid = *gtid_ref; 636 #ifdef BUILD_PARALLEL_ORDERED 637 int tid = __kmp_tid_from_gtid(gtid); 638 kmp_team_t *team = __kmp_team_from_gtid(gtid); 639 #endif /* BUILD_PARALLEL_ORDERED */ 640 641 if (__kmp_env_consistency_check) { 642 if (__kmp_threads[gtid]->th.th_root->r.r_active) 643 __kmp_pop_sync(gtid, ct_ordered_in_parallel, loc_ref); 644 } 645 #ifdef BUILD_PARALLEL_ORDERED 646 if (!team->t.t_serialized) { 647 KMP_MB(); /* Flush all pending memory write invalidates. */ 648 649 /* use the tid of the next thread in this team */ 650 /* TODO replace with general release procedure */ 651 team->t.t_ordered.dt.t_value = ((tid + 1) % team->t.t_nproc); 652 653 KMP_MB(); /* Flush all pending memory write invalidates. 
*/ 654 } 655 #endif /* BUILD_PARALLEL_ORDERED */ 656 } 657 658 /* ------------------------------------------------------------------------ */ 659 /* The BARRIER for a SINGLE process section is always explicit */ 660 661 int __kmp_enter_single(int gtid, ident_t *id_ref, int push_ws) { 662 int status; 663 kmp_info_t *th; 664 kmp_team_t *team; 665 666 if (!TCR_4(__kmp_init_parallel)) 667 __kmp_parallel_initialize(); 668 __kmp_resume_if_soft_paused(); 669 670 th = __kmp_threads[gtid]; 671 team = th->th.th_team; 672 status = 0; 673 674 th->th.th_ident = id_ref; 675 676 if (team->t.t_serialized) { 677 status = 1; 678 } else { 679 kmp_int32 old_this = th->th.th_local.this_construct; 680 681 ++th->th.th_local.this_construct; 682 /* try to set team count to thread count--success means thread got the 683 single block */ 684 /* TODO: Should this be acquire or release? */ 685 if (team->t.t_construct == old_this) { 686 status = __kmp_atomic_compare_store_acq(&team->t.t_construct, old_this, 687 th->th.th_local.this_construct); 688 } 689 #if USE_ITT_BUILD 690 if (__itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 && 691 KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL && 692 team->t.t_active_level == 1) { 693 // Only report metadata by primary thread of active team at level 1 694 __kmp_itt_metadata_single(id_ref); 695 } 696 #endif /* USE_ITT_BUILD */ 697 } 698 699 if (__kmp_env_consistency_check) { 700 if (status && push_ws) { 701 __kmp_push_workshare(gtid, ct_psingle, id_ref); 702 } else { 703 __kmp_check_workshare(gtid, ct_psingle, id_ref); 704 } 705 } 706 #if USE_ITT_BUILD 707 if (status) { 708 __kmp_itt_single_start(gtid); 709 } 710 #endif /* USE_ITT_BUILD */ 711 return status; 712 } 713 714 void __kmp_exit_single(int gtid) { 715 #if USE_ITT_BUILD 716 __kmp_itt_single_end(gtid); 717 #endif /* USE_ITT_BUILD */ 718 if (__kmp_env_consistency_check) 719 __kmp_pop_workshare(gtid, ct_psingle, NULL); 720 } 721 722 /* determine if we can go parallel or must use a serialized parallel region and 723 * how many threads we can use 724 * set_nproc is the number of threads requested for the team 725 * returns 0 if we should serialize or only use one thread, 726 * otherwise the number of threads to use 727 * The forkjoin lock is held by the caller. */ 728 static int __kmp_reserve_threads(kmp_root_t *root, kmp_team_t *parent_team, 729 int master_tid, int set_nthreads, 730 int enter_teams) { 731 int capacity; 732 int new_nthreads; 733 KMP_DEBUG_ASSERT(__kmp_init_serial); 734 KMP_DEBUG_ASSERT(root && parent_team); 735 kmp_info_t *this_thr = parent_team->t.t_threads[master_tid]; 736 737 // If dyn-var is set, dynamically adjust the number of desired threads, 738 // according to the method specified by dynamic_mode. 739 new_nthreads = set_nthreads; 740 if (!get__dynamic_2(parent_team, master_tid)) { 741 ; 742 } 743 #ifdef USE_LOAD_BALANCE 744 else if (__kmp_global.g.g_dynamic_mode == dynamic_load_balance) { 745 new_nthreads = __kmp_load_balance_nproc(root, set_nthreads); 746 if (new_nthreads == 1) { 747 KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced " 748 "reservation to 1 thread\n", 749 master_tid)); 750 return 1; 751 } 752 if (new_nthreads < set_nthreads) { 753 KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced " 754 "reservation to %d threads\n", 755 master_tid, new_nthreads)); 756 } 757 } 758 #endif /* USE_LOAD_BALANCE */ 759 else if (__kmp_global.g.g_dynamic_mode == dynamic_thread_limit) { 760 new_nthreads = __kmp_avail_proc - __kmp_nth + 761 (root->r.r_active ? 
1 : root->r.r_hot_team->t.t_nproc); 762 if (new_nthreads <= 1) { 763 KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced " 764 "reservation to 1 thread\n", 765 master_tid)); 766 return 1; 767 } 768 if (new_nthreads < set_nthreads) { 769 KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced " 770 "reservation to %d threads\n", 771 master_tid, new_nthreads)); 772 } else { 773 new_nthreads = set_nthreads; 774 } 775 } else if (__kmp_global.g.g_dynamic_mode == dynamic_random) { 776 if (set_nthreads > 2) { 777 new_nthreads = __kmp_get_random(parent_team->t.t_threads[master_tid]); 778 new_nthreads = (new_nthreads % set_nthreads) + 1; 779 if (new_nthreads == 1) { 780 KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced " 781 "reservation to 1 thread\n", 782 master_tid)); 783 return 1; 784 } 785 if (new_nthreads < set_nthreads) { 786 KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced " 787 "reservation to %d threads\n", 788 master_tid, new_nthreads)); 789 } 790 } 791 } else { 792 KMP_ASSERT(0); 793 } 794 795 // Respect KMP_ALL_THREADS/KMP_DEVICE_THREAD_LIMIT. 796 if (__kmp_nth + new_nthreads - 797 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) > 798 __kmp_max_nth) { 799 int tl_nthreads = __kmp_max_nth - __kmp_nth + 800 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc); 801 if (tl_nthreads <= 0) { 802 tl_nthreads = 1; 803 } 804 805 // If dyn-var is false, emit a 1-time warning. 806 if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) { 807 __kmp_reserve_warn = 1; 808 __kmp_msg(kmp_ms_warning, 809 KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads), 810 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null); 811 } 812 if (tl_nthreads == 1) { 813 KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT " 814 "reduced reservation to 1 thread\n", 815 master_tid)); 816 return 1; 817 } 818 KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT reduced " 819 "reservation to %d threads\n", 820 master_tid, tl_nthreads)); 821 new_nthreads = tl_nthreads; 822 } 823 824 // Respect OMP_THREAD_LIMIT 825 int cg_nthreads = this_thr->th.th_cg_roots->cg_nthreads; 826 int max_cg_threads = this_thr->th.th_cg_roots->cg_thread_limit; 827 if (cg_nthreads + new_nthreads - 828 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) > 829 max_cg_threads) { 830 int tl_nthreads = max_cg_threads - cg_nthreads + 831 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc); 832 if (tl_nthreads <= 0) { 833 tl_nthreads = 1; 834 } 835 836 // If dyn-var is false, emit a 1-time warning. 837 if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) { 838 __kmp_reserve_warn = 1; 839 __kmp_msg(kmp_ms_warning, 840 KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads), 841 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null); 842 } 843 if (tl_nthreads == 1) { 844 KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT " 845 "reduced reservation to 1 thread\n", 846 master_tid)); 847 return 1; 848 } 849 KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT reduced " 850 "reservation to %d threads\n", 851 master_tid, tl_nthreads)); 852 new_nthreads = tl_nthreads; 853 } 854 855 // Check if the threads array is large enough, or needs expanding. 856 // See comment in __kmp_register_root() about the adjustment if 857 // __kmp_threads[0] == NULL. 
858 capacity = __kmp_threads_capacity; 859 if (TCR_PTR(__kmp_threads[0]) == NULL) { 860 --capacity; 861 } 862 // If it is not for initializing the hidden helper team, we need to take 863 // __kmp_hidden_helper_threads_num out of the capacity because it is included 864 // in __kmp_threads_capacity. 865 if (__kmp_enable_hidden_helper && !TCR_4(__kmp_init_hidden_helper_threads)) { 866 capacity -= __kmp_hidden_helper_threads_num; 867 } 868 if (__kmp_nth + new_nthreads - 869 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) > 870 capacity) { 871 // Expand the threads array. 872 int slotsRequired = __kmp_nth + new_nthreads - 873 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) - 874 capacity; 875 int slotsAdded = __kmp_expand_threads(slotsRequired); 876 if (slotsAdded < slotsRequired) { 877 // The threads array was not expanded enough. 878 new_nthreads -= (slotsRequired - slotsAdded); 879 KMP_ASSERT(new_nthreads >= 1); 880 881 // If dyn-var is false, emit a 1-time warning. 882 if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) { 883 __kmp_reserve_warn = 1; 884 if (__kmp_tp_cached) { 885 __kmp_msg(kmp_ms_warning, 886 KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads), 887 KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity), 888 KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null); 889 } else { 890 __kmp_msg(kmp_ms_warning, 891 KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads), 892 KMP_HNT(SystemLimitOnThreads), __kmp_msg_null); 893 } 894 } 895 } 896 } 897 898 #ifdef KMP_DEBUG 899 if (new_nthreads == 1) { 900 KC_TRACE(10, 901 ("__kmp_reserve_threads: T#%d serializing team after reclaiming " 902 "dead roots and rechecking; requested %d threads\n", 903 __kmp_get_gtid(), set_nthreads)); 904 } else { 905 KC_TRACE(10, ("__kmp_reserve_threads: T#%d allocating %d threads; requested" 906 " %d threads\n", 907 __kmp_get_gtid(), new_nthreads, set_nthreads)); 908 } 909 #endif // KMP_DEBUG 910 return new_nthreads; 911 } 912 913 /* Allocate threads from the thread pool and assign them to the new team. We are 914 assured that there are enough threads available, because we checked on that 915 earlier within critical section forkjoin */ 916 static void __kmp_fork_team_threads(kmp_root_t *root, kmp_team_t *team, 917 kmp_info_t *master_th, int master_gtid, 918 int fork_teams_workers) { 919 int i; 920 int use_hot_team; 921 922 KA_TRACE(10, ("__kmp_fork_team_threads: new_nprocs = %d\n", team->t.t_nproc)); 923 KMP_DEBUG_ASSERT(master_gtid == __kmp_get_gtid()); 924 KMP_MB(); 925 926 /* first, let's setup the primary thread */ 927 master_th->th.th_info.ds.ds_tid = 0; 928 master_th->th.th_team = team; 929 master_th->th.th_team_nproc = team->t.t_nproc; 930 master_th->th.th_team_master = master_th; 931 master_th->th.th_team_serialized = FALSE; 932 master_th->th.th_dispatch = &team->t.t_dispatch[0]; 933 934 /* make sure we are not the optimized hot team */ 935 #if KMP_NESTED_HOT_TEAMS 936 use_hot_team = 0; 937 kmp_hot_team_ptr_t *hot_teams = master_th->th.th_hot_teams; 938 if (hot_teams) { // hot teams array is not allocated if 939 // KMP_HOT_TEAMS_MAX_LEVEL=0 940 int level = team->t.t_active_level - 1; // index in array of hot teams 941 if (master_th->th.th_teams_microtask) { // are we inside the teams? 
942 if (master_th->th.th_teams_size.nteams > 1) { 943 ++level; // level was not increased in teams construct for 944 // team_of_masters 945 } 946 if (team->t.t_pkfn != (microtask_t)__kmp_teams_master && 947 master_th->th.th_teams_level == team->t.t_level) { 948 ++level; // level was not increased in teams construct for 949 // team_of_workers before the parallel 950 } // team->t.t_level will be increased inside parallel 951 } 952 if (level < __kmp_hot_teams_max_level) { 953 if (hot_teams[level].hot_team) { 954 // hot team has already been allocated for given level 955 KMP_DEBUG_ASSERT(hot_teams[level].hot_team == team); 956 use_hot_team = 1; // the team is ready to use 957 } else { 958 use_hot_team = 0; // AC: threads are not allocated yet 959 hot_teams[level].hot_team = team; // remember new hot team 960 hot_teams[level].hot_team_nth = team->t.t_nproc; 961 } 962 } else { 963 use_hot_team = 0; 964 } 965 } 966 #else 967 use_hot_team = team == root->r.r_hot_team; 968 #endif 969 if (!use_hot_team) { 970 971 /* install the primary thread */ 972 team->t.t_threads[0] = master_th; 973 __kmp_initialize_info(master_th, team, 0, master_gtid); 974 975 /* now, install the worker threads */ 976 for (i = 1; i < team->t.t_nproc; i++) { 977 978 /* fork or reallocate a new thread and install it in team */ 979 kmp_info_t *thr = __kmp_allocate_thread(root, team, i); 980 team->t.t_threads[i] = thr; 981 KMP_DEBUG_ASSERT(thr); 982 KMP_DEBUG_ASSERT(thr->th.th_team == team); 983 /* align team and thread arrived states */ 984 KA_TRACE(20, ("__kmp_fork_team_threads: T#%d(%d:%d) init arrived " 985 "T#%d(%d:%d) join =%llu, plain=%llu\n", 986 __kmp_gtid_from_tid(0, team), team->t.t_id, 0, 987 __kmp_gtid_from_tid(i, team), team->t.t_id, i, 988 team->t.t_bar[bs_forkjoin_barrier].b_arrived, 989 team->t.t_bar[bs_plain_barrier].b_arrived)); 990 thr->th.th_teams_microtask = master_th->th.th_teams_microtask; 991 thr->th.th_teams_level = master_th->th.th_teams_level; 992 thr->th.th_teams_size = master_th->th.th_teams_size; 993 { // Initialize threads' barrier data. 994 int b; 995 kmp_balign_t *balign = team->t.t_threads[i]->th.th_bar; 996 for (b = 0; b < bs_last_barrier; ++b) { 997 balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived; 998 KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG); 999 #if USE_DEBUGGER 1000 balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived; 1001 #endif 1002 } 1003 } 1004 } 1005 1006 #if KMP_AFFINITY_SUPPORTED 1007 // Do not partition the places list for teams construct workers who 1008 // haven't actually been forked to do real work yet. This partitioning 1009 // will take place in the parallel region nested within the teams construct. 1010 if (!fork_teams_workers) { 1011 __kmp_partition_places(team); 1012 } 1013 #endif 1014 } 1015 1016 if (__kmp_display_affinity && team->t.t_display_affinity != 1) { 1017 for (i = 0; i < team->t.t_nproc; i++) { 1018 kmp_info_t *thr = team->t.t_threads[i]; 1019 if (thr->th.th_prev_num_threads != team->t.t_nproc || 1020 thr->th.th_prev_level != team->t.t_level) { 1021 team->t.t_display_affinity = 1; 1022 break; 1023 } 1024 } 1025 } 1026 1027 KMP_MB(); 1028 } 1029 1030 #if KMP_ARCH_X86 || KMP_ARCH_X86_64 1031 // Propagate any changes to the floating point control registers out to the team 1032 // We try to avoid unnecessary writes to the relevant cache line in the team 1033 // structure, so we don't make changes unless they are needed. 
1034 inline static void propagateFPControl(kmp_team_t *team) { 1035 if (__kmp_inherit_fp_control) { 1036 kmp_int16 x87_fpu_control_word; 1037 kmp_uint32 mxcsr; 1038 1039 // Get primary thread's values of FPU control flags (both X87 and vector) 1040 __kmp_store_x87_fpu_control_word(&x87_fpu_control_word); 1041 __kmp_store_mxcsr(&mxcsr); 1042 mxcsr &= KMP_X86_MXCSR_MASK; 1043 1044 // There is no point looking at t_fp_control_saved here. 1045 // If it is TRUE, we still have to update the values if they are different 1046 // from those we now have. If it is FALSE we didn't save anything yet, but 1047 // our objective is the same. We have to ensure that the values in the team 1048 // are the same as those we have. 1049 // So, this code achieves what we need whether or not t_fp_control_saved is 1050 // true. By checking whether the value needs updating we avoid unnecessary 1051 // writes that would put the cache-line into a written state, causing all 1052 // threads in the team to have to read it again. 1053 KMP_CHECK_UPDATE(team->t.t_x87_fpu_control_word, x87_fpu_control_word); 1054 KMP_CHECK_UPDATE(team->t.t_mxcsr, mxcsr); 1055 // Although we don't use this value, other code in the runtime wants to know 1056 // whether it should restore them. So we must ensure it is correct. 1057 KMP_CHECK_UPDATE(team->t.t_fp_control_saved, TRUE); 1058 } else { 1059 // Similarly here. Don't write to this cache-line in the team structure 1060 // unless we have to. 1061 KMP_CHECK_UPDATE(team->t.t_fp_control_saved, FALSE); 1062 } 1063 } 1064 1065 // Do the opposite, setting the hardware registers to the updated values from 1066 // the team. 1067 inline static void updateHWFPControl(kmp_team_t *team) { 1068 if (__kmp_inherit_fp_control && team->t.t_fp_control_saved) { 1069 // Only reset the fp control regs if they have been changed in the team. 1070 // the parallel region that we are exiting. 1071 kmp_int16 x87_fpu_control_word; 1072 kmp_uint32 mxcsr; 1073 __kmp_store_x87_fpu_control_word(&x87_fpu_control_word); 1074 __kmp_store_mxcsr(&mxcsr); 1075 mxcsr &= KMP_X86_MXCSR_MASK; 1076 1077 if (team->t.t_x87_fpu_control_word != x87_fpu_control_word) { 1078 __kmp_clear_x87_fpu_status_word(); 1079 __kmp_load_x87_fpu_control_word(&team->t.t_x87_fpu_control_word); 1080 } 1081 1082 if (team->t.t_mxcsr != mxcsr) { 1083 __kmp_load_mxcsr(&team->t.t_mxcsr); 1084 } 1085 } 1086 } 1087 #else 1088 #define propagateFPControl(x) ((void)0) 1089 #define updateHWFPControl(x) ((void)0) 1090 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */ 1091 1092 static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team, 1093 int realloc); // forward declaration 1094 1095 /* Run a parallel region that has been serialized, so runs only in a team of the 1096 single primary thread. 
*/ 1097 void __kmp_serialized_parallel(ident_t *loc, kmp_int32 global_tid) { 1098 kmp_info_t *this_thr; 1099 kmp_team_t *serial_team; 1100 1101 KC_TRACE(10, ("__kmpc_serialized_parallel: called by T#%d\n", global_tid)); 1102 1103 /* Skip all this code for autopar serialized loops since it results in 1104 unacceptable overhead */ 1105 if (loc != NULL && (loc->flags & KMP_IDENT_AUTOPAR)) 1106 return; 1107 1108 if (!TCR_4(__kmp_init_parallel)) 1109 __kmp_parallel_initialize(); 1110 __kmp_resume_if_soft_paused(); 1111 1112 this_thr = __kmp_threads[global_tid]; 1113 serial_team = this_thr->th.th_serial_team; 1114 1115 /* utilize the serialized team held by this thread */ 1116 KMP_DEBUG_ASSERT(serial_team); 1117 KMP_MB(); 1118 1119 if (__kmp_tasking_mode != tskm_immediate_exec) { 1120 KMP_DEBUG_ASSERT( 1121 this_thr->th.th_task_team == 1122 this_thr->th.th_team->t.t_task_team[this_thr->th.th_task_state]); 1123 KMP_DEBUG_ASSERT(serial_team->t.t_task_team[this_thr->th.th_task_state] == 1124 NULL); 1125 KA_TRACE(20, ("__kmpc_serialized_parallel: T#%d pushing task_team %p / " 1126 "team %p, new task_team = NULL\n", 1127 global_tid, this_thr->th.th_task_team, this_thr->th.th_team)); 1128 this_thr->th.th_task_team = NULL; 1129 } 1130 1131 kmp_proc_bind_t proc_bind = this_thr->th.th_set_proc_bind; 1132 if (this_thr->th.th_current_task->td_icvs.proc_bind == proc_bind_false) { 1133 proc_bind = proc_bind_false; 1134 } else if (proc_bind == proc_bind_default) { 1135 // No proc_bind clause was specified, so use the current value 1136 // of proc-bind-var for this parallel region. 1137 proc_bind = this_thr->th.th_current_task->td_icvs.proc_bind; 1138 } 1139 // Reset for next parallel region 1140 this_thr->th.th_set_proc_bind = proc_bind_default; 1141 1142 #if OMPT_SUPPORT 1143 ompt_data_t ompt_parallel_data = ompt_data_none; 1144 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(global_tid); 1145 if (ompt_enabled.enabled && 1146 this_thr->th.ompt_thread_info.state != ompt_state_overhead) { 1147 1148 ompt_task_info_t *parent_task_info; 1149 parent_task_info = OMPT_CUR_TASK_INFO(this_thr); 1150 1151 parent_task_info->frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0); 1152 if (ompt_enabled.ompt_callback_parallel_begin) { 1153 int team_size = 1; 1154 1155 ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)( 1156 &(parent_task_info->task_data), &(parent_task_info->frame), 1157 &ompt_parallel_data, team_size, 1158 ompt_parallel_invoker_program | ompt_parallel_team, codeptr); 1159 } 1160 } 1161 #endif // OMPT_SUPPORT 1162 1163 if (this_thr->th.th_team != serial_team) { 1164 // Nested level will be an index in the nested nthreads array 1165 int level = this_thr->th.th_team->t.t_level; 1166 1167 if (serial_team->t.t_serialized) { 1168 /* this serial team was already used 1169 TODO increase performance by making this locks more specific */ 1170 kmp_team_t *new_team; 1171 1172 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock); 1173 1174 new_team = 1175 __kmp_allocate_team(this_thr->th.th_root, 1, 1, 1176 #if OMPT_SUPPORT 1177 ompt_parallel_data, 1178 #endif 1179 proc_bind, &this_thr->th.th_current_task->td_icvs, 1180 0 USE_NESTED_HOT_ARG(NULL)); 1181 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); 1182 KMP_ASSERT(new_team); 1183 1184 /* setup new serialized team and install it */ 1185 new_team->t.t_threads[0] = this_thr; 1186 new_team->t.t_parent = this_thr->th.th_team; 1187 serial_team = new_team; 1188 this_thr->th.th_serial_team = serial_team; 1189 1190 KF_TRACE( 1191 10, 1192 ("__kmpc_serialized_parallel: T#%d 
allocated new serial team %p\n", 1193 global_tid, serial_team)); 1194 1195 /* TODO the above breaks the requirement that if we run out of resources, 1196 then we can still guarantee that serialized teams are ok, since we may 1197 need to allocate a new one */ 1198 } else { 1199 KF_TRACE( 1200 10, 1201 ("__kmpc_serialized_parallel: T#%d reusing cached serial team %p\n", 1202 global_tid, serial_team)); 1203 } 1204 1205 /* we have to initialize this serial team */ 1206 KMP_DEBUG_ASSERT(serial_team->t.t_threads); 1207 KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr); 1208 KMP_DEBUG_ASSERT(this_thr->th.th_team != serial_team); 1209 serial_team->t.t_ident = loc; 1210 serial_team->t.t_serialized = 1; 1211 serial_team->t.t_nproc = 1; 1212 serial_team->t.t_parent = this_thr->th.th_team; 1213 serial_team->t.t_sched.sched = this_thr->th.th_team->t.t_sched.sched; 1214 this_thr->th.th_team = serial_team; 1215 serial_team->t.t_master_tid = this_thr->th.th_info.ds.ds_tid; 1216 1217 KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d curtask=%p\n", global_tid, 1218 this_thr->th.th_current_task)); 1219 KMP_ASSERT(this_thr->th.th_current_task->td_flags.executing == 1); 1220 this_thr->th.th_current_task->td_flags.executing = 0; 1221 1222 __kmp_push_current_task_to_thread(this_thr, serial_team, 0); 1223 1224 /* TODO: GEH: do ICVs work for nested serialized teams? Don't we need an 1225 implicit task for each serialized task represented by 1226 team->t.t_serialized? */ 1227 copy_icvs(&this_thr->th.th_current_task->td_icvs, 1228 &this_thr->th.th_current_task->td_parent->td_icvs); 1229 1230 // Thread value exists in the nested nthreads array for the next nested 1231 // level 1232 if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) { 1233 this_thr->th.th_current_task->td_icvs.nproc = 1234 __kmp_nested_nth.nth[level + 1]; 1235 } 1236 1237 if (__kmp_nested_proc_bind.used && 1238 (level + 1 < __kmp_nested_proc_bind.used)) { 1239 this_thr->th.th_current_task->td_icvs.proc_bind = 1240 __kmp_nested_proc_bind.bind_types[level + 1]; 1241 } 1242 1243 #if USE_DEBUGGER 1244 serial_team->t.t_pkfn = (microtask_t)(~0); // For the debugger. 
1245 #endif 1246 this_thr->th.th_info.ds.ds_tid = 0; 1247 1248 /* set thread cache values */ 1249 this_thr->th.th_team_nproc = 1; 1250 this_thr->th.th_team_master = this_thr; 1251 this_thr->th.th_team_serialized = 1; 1252 1253 serial_team->t.t_level = serial_team->t.t_parent->t.t_level + 1; 1254 serial_team->t.t_active_level = serial_team->t.t_parent->t.t_active_level; 1255 serial_team->t.t_def_allocator = this_thr->th.th_def_allocator; // save 1256 1257 propagateFPControl(serial_team); 1258 1259 /* check if we need to allocate dispatch buffers stack */ 1260 KMP_DEBUG_ASSERT(serial_team->t.t_dispatch); 1261 if (!serial_team->t.t_dispatch->th_disp_buffer) { 1262 serial_team->t.t_dispatch->th_disp_buffer = 1263 (dispatch_private_info_t *)__kmp_allocate( 1264 sizeof(dispatch_private_info_t)); 1265 } 1266 this_thr->th.th_dispatch = serial_team->t.t_dispatch; 1267 1268 KMP_MB(); 1269 1270 } else { 1271 /* this serialized team is already being used, 1272 * that's fine, just add another nested level */ 1273 KMP_DEBUG_ASSERT(this_thr->th.th_team == serial_team); 1274 KMP_DEBUG_ASSERT(serial_team->t.t_threads); 1275 KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr); 1276 ++serial_team->t.t_serialized; 1277 this_thr->th.th_team_serialized = serial_team->t.t_serialized; 1278 1279 // Nested level will be an index in the nested nthreads array 1280 int level = this_thr->th.th_team->t.t_level; 1281 // Thread value exists in the nested nthreads array for the next nested 1282 // level 1283 if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) { 1284 this_thr->th.th_current_task->td_icvs.nproc = 1285 __kmp_nested_nth.nth[level + 1]; 1286 } 1287 serial_team->t.t_level++; 1288 KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d increasing nesting level " 1289 "of serial team %p to %d\n", 1290 global_tid, serial_team, serial_team->t.t_level)); 1291 1292 /* allocate/push dispatch buffers stack */ 1293 KMP_DEBUG_ASSERT(serial_team->t.t_dispatch); 1294 { 1295 dispatch_private_info_t *disp_buffer = 1296 (dispatch_private_info_t *)__kmp_allocate( 1297 sizeof(dispatch_private_info_t)); 1298 disp_buffer->next = serial_team->t.t_dispatch->th_disp_buffer; 1299 serial_team->t.t_dispatch->th_disp_buffer = disp_buffer; 1300 } 1301 this_thr->th.th_dispatch = serial_team->t.t_dispatch; 1302 1303 KMP_MB(); 1304 } 1305 KMP_CHECK_UPDATE(serial_team->t.t_cancel_request, cancel_noreq); 1306 1307 // Perform the display affinity functionality for 1308 // serialized parallel regions 1309 if (__kmp_display_affinity) { 1310 if (this_thr->th.th_prev_level != serial_team->t.t_level || 1311 this_thr->th.th_prev_num_threads != 1) { 1312 // NULL means use the affinity-format-var ICV 1313 __kmp_aux_display_affinity(global_tid, NULL); 1314 this_thr->th.th_prev_level = serial_team->t.t_level; 1315 this_thr->th.th_prev_num_threads = 1; 1316 } 1317 } 1318 1319 if (__kmp_env_consistency_check) 1320 __kmp_push_parallel(global_tid, NULL); 1321 #if OMPT_SUPPORT 1322 serial_team->t.ompt_team_info.master_return_address = codeptr; 1323 if (ompt_enabled.enabled && 1324 this_thr->th.ompt_thread_info.state != ompt_state_overhead) { 1325 OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr = 1326 OMPT_GET_FRAME_ADDRESS(0); 1327 1328 ompt_lw_taskteam_t lw_taskteam; 1329 __ompt_lw_taskteam_init(&lw_taskteam, this_thr, global_tid, 1330 &ompt_parallel_data, codeptr); 1331 1332 __ompt_lw_taskteam_link(&lw_taskteam, this_thr, 1); 1333 // don't use lw_taskteam after linking. 
content was swaped 1334 1335 /* OMPT implicit task begin */ 1336 if (ompt_enabled.ompt_callback_implicit_task) { 1337 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 1338 ompt_scope_begin, OMPT_CUR_TEAM_DATA(this_thr), 1339 OMPT_CUR_TASK_DATA(this_thr), 1, __kmp_tid_from_gtid(global_tid), 1340 ompt_task_implicit); // TODO: Can this be ompt_task_initial? 1341 OMPT_CUR_TASK_INFO(this_thr)->thread_num = 1342 __kmp_tid_from_gtid(global_tid); 1343 } 1344 1345 /* OMPT state */ 1346 this_thr->th.ompt_thread_info.state = ompt_state_work_parallel; 1347 OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr = 1348 OMPT_GET_FRAME_ADDRESS(0); 1349 } 1350 #endif 1351 } 1352 1353 /* most of the work for a fork */ 1354 /* return true if we really went parallel, false if serialized */ 1355 int __kmp_fork_call(ident_t *loc, int gtid, 1356 enum fork_context_e call_context, // Intel, GNU, ... 1357 kmp_int32 argc, microtask_t microtask, launch_t invoker, 1358 kmp_va_list ap) { 1359 void **argv; 1360 int i; 1361 int master_tid; 1362 int master_this_cons; 1363 kmp_team_t *team; 1364 kmp_team_t *parent_team; 1365 kmp_info_t *master_th; 1366 kmp_root_t *root; 1367 int nthreads; 1368 int master_active; 1369 int master_set_numthreads; 1370 int level; 1371 int active_level; 1372 int teams_level; 1373 #if KMP_NESTED_HOT_TEAMS 1374 kmp_hot_team_ptr_t **p_hot_teams; 1375 #endif 1376 { // KMP_TIME_BLOCK 1377 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_fork_call); 1378 KMP_COUNT_VALUE(OMP_PARALLEL_args, argc); 1379 1380 KA_TRACE(20, ("__kmp_fork_call: enter T#%d\n", gtid)); 1381 if (__kmp_stkpadding > 0 && __kmp_root[gtid] != NULL) { 1382 /* Some systems prefer the stack for the root thread(s) to start with */ 1383 /* some gap from the parent stack to prevent false sharing. */ 1384 void *dummy = KMP_ALLOCA(__kmp_stkpadding); 1385 /* These 2 lines below are so this does not get optimized out */ 1386 if (__kmp_stkpadding > KMP_MAX_STKPADDING) 1387 __kmp_stkpadding += (short)((kmp_int64)dummy); 1388 } 1389 1390 /* initialize if needed */ 1391 KMP_DEBUG_ASSERT( 1392 __kmp_init_serial); // AC: potentially unsafe, not in sync with shutdown 1393 if (!TCR_4(__kmp_init_parallel)) 1394 __kmp_parallel_initialize(); 1395 __kmp_resume_if_soft_paused(); 1396 1397 /* setup current data */ 1398 master_th = __kmp_threads[gtid]; // AC: potentially unsafe, not in sync with 1399 // shutdown 1400 parent_team = master_th->th.th_team; 1401 master_tid = master_th->th.th_info.ds.ds_tid; 1402 master_this_cons = master_th->th.th_local.this_construct; 1403 root = master_th->th.th_root; 1404 master_active = root->r.r_active; 1405 master_set_numthreads = master_th->th.th_set_nproc; 1406 1407 #if OMPT_SUPPORT 1408 ompt_data_t ompt_parallel_data = ompt_data_none; 1409 ompt_data_t *parent_task_data; 1410 ompt_frame_t *ompt_frame; 1411 ompt_data_t *implicit_task_data; 1412 void *return_address = NULL; 1413 1414 if (ompt_enabled.enabled) { 1415 __ompt_get_task_info_internal(0, NULL, &parent_task_data, &ompt_frame, 1416 NULL, NULL); 1417 return_address = OMPT_LOAD_RETURN_ADDRESS(gtid); 1418 } 1419 #endif 1420 1421 // Assign affinity to root thread if it hasn't happened yet 1422 __kmp_assign_root_init_mask(); 1423 1424 // Nested level will be an index in the nested nthreads array 1425 level = parent_team->t.t_level; 1426 // used to launch non-serial teams even if nested is not allowed 1427 active_level = parent_team->t.t_active_level; 1428 // needed to check nesting inside the teams 1429 teams_level = master_th->th.th_teams_level; 1430 #if 
KMP_NESTED_HOT_TEAMS 1431 p_hot_teams = &master_th->th.th_hot_teams; 1432 if (*p_hot_teams == NULL && __kmp_hot_teams_max_level > 0) { 1433 *p_hot_teams = (kmp_hot_team_ptr_t *)__kmp_allocate( 1434 sizeof(kmp_hot_team_ptr_t) * __kmp_hot_teams_max_level); 1435 (*p_hot_teams)[0].hot_team = root->r.r_hot_team; 1436 // it is either actual or not needed (when active_level > 0) 1437 (*p_hot_teams)[0].hot_team_nth = 1; 1438 } 1439 #endif 1440 1441 #if OMPT_SUPPORT 1442 if (ompt_enabled.enabled) { 1443 if (ompt_enabled.ompt_callback_parallel_begin) { 1444 int team_size = master_set_numthreads 1445 ? master_set_numthreads 1446 : get__nproc_2(parent_team, master_tid); 1447 int flags = OMPT_INVOKER(call_context) | 1448 ((microtask == (microtask_t)__kmp_teams_master) 1449 ? ompt_parallel_league 1450 : ompt_parallel_team); 1451 ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)( 1452 parent_task_data, ompt_frame, &ompt_parallel_data, team_size, flags, 1453 return_address); 1454 } 1455 master_th->th.ompt_thread_info.state = ompt_state_overhead; 1456 } 1457 #endif 1458 1459 master_th->th.th_ident = loc; 1460 1461 if (master_th->th.th_teams_microtask && ap && 1462 microtask != (microtask_t)__kmp_teams_master && level == teams_level) { 1463 // AC: This is start of parallel that is nested inside teams construct. 1464 // The team is actual (hot), all workers are ready at the fork barrier. 1465 // No lock needed to initialize the team a bit, then free workers. 1466 parent_team->t.t_ident = loc; 1467 __kmp_alloc_argv_entries(argc, parent_team, TRUE); 1468 parent_team->t.t_argc = argc; 1469 argv = (void **)parent_team->t.t_argv; 1470 for (i = argc - 1; i >= 0; --i) 1471 *argv++ = va_arg(kmp_va_deref(ap), void *); 1472 // Increment our nested depth levels, but not increase the serialization 1473 if (parent_team == master_th->th.th_serial_team) { 1474 // AC: we are in serialized parallel 1475 __kmpc_serialized_parallel(loc, gtid); 1476 KMP_DEBUG_ASSERT(parent_team->t.t_serialized > 1); 1477 1478 if (call_context == fork_context_gnu) { 1479 // AC: need to decrement t_serialized for enquiry functions to work 1480 // correctly, will restore at join time 1481 parent_team->t.t_serialized--; 1482 return TRUE; 1483 } 1484 1485 #if OMPD_SUPPORT 1486 parent_team->t.t_pkfn = microtask; 1487 #endif 1488 1489 #if OMPT_SUPPORT 1490 void *dummy; 1491 void **exit_frame_p; 1492 1493 ompt_lw_taskteam_t lw_taskteam; 1494 1495 if (ompt_enabled.enabled) { 1496 __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid, 1497 &ompt_parallel_data, return_address); 1498 exit_frame_p = &(lw_taskteam.ompt_task_info.frame.exit_frame.ptr); 1499 1500 __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0); 1501 // don't use lw_taskteam after linking. 
content was swaped 1502 1503 /* OMPT implicit task begin */ 1504 implicit_task_data = OMPT_CUR_TASK_DATA(master_th); 1505 if (ompt_enabled.ompt_callback_implicit_task) { 1506 OMPT_CUR_TASK_INFO(master_th)->thread_num = 1507 __kmp_tid_from_gtid(gtid); 1508 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 1509 ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th), 1510 implicit_task_data, 1, 1511 OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit); 1512 } 1513 1514 /* OMPT state */ 1515 master_th->th.ompt_thread_info.state = ompt_state_work_parallel; 1516 } else { 1517 exit_frame_p = &dummy; 1518 } 1519 #endif 1520 // AC: need to decrement t_serialized for enquiry functions to work 1521 // correctly, will restore at join time 1522 parent_team->t.t_serialized--; 1523 1524 { 1525 KMP_TIME_PARTITIONED_BLOCK(OMP_parallel); 1526 KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK); 1527 __kmp_invoke_microtask(microtask, gtid, 0, argc, parent_team->t.t_argv 1528 #if OMPT_SUPPORT 1529 , 1530 exit_frame_p 1531 #endif 1532 ); 1533 } 1534 1535 #if OMPT_SUPPORT 1536 if (ompt_enabled.enabled) { 1537 *exit_frame_p = NULL; 1538 OMPT_CUR_TASK_INFO(master_th)->frame.exit_frame = ompt_data_none; 1539 if (ompt_enabled.ompt_callback_implicit_task) { 1540 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 1541 ompt_scope_end, NULL, implicit_task_data, 1, 1542 OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit); 1543 } 1544 ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th); 1545 __ompt_lw_taskteam_unlink(master_th); 1546 if (ompt_enabled.ompt_callback_parallel_end) { 1547 ompt_callbacks.ompt_callback(ompt_callback_parallel_end)( 1548 &ompt_parallel_data, OMPT_CUR_TASK_DATA(master_th), 1549 OMPT_INVOKER(call_context) | ompt_parallel_team, 1550 return_address); 1551 } 1552 master_th->th.ompt_thread_info.state = ompt_state_overhead; 1553 } 1554 #endif 1555 return TRUE; 1556 } 1557 1558 parent_team->t.t_pkfn = microtask; 1559 parent_team->t.t_invoke = invoker; 1560 KMP_ATOMIC_INC(&root->r.r_in_parallel); 1561 parent_team->t.t_active_level++; 1562 parent_team->t.t_level++; 1563 parent_team->t.t_def_allocator = master_th->th.th_def_allocator; // save 1564 1565 #if OMPT_SUPPORT 1566 if (ompt_enabled.enabled) { 1567 ompt_lw_taskteam_t lw_taskteam; 1568 __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid, 1569 &ompt_parallel_data, return_address); 1570 __ompt_lw_taskteam_link(&lw_taskteam, master_th, 1, true); 1571 } 1572 #endif 1573 1574 /* Change number of threads in the team if requested */ 1575 if (master_set_numthreads) { // The parallel has num_threads clause 1576 if (master_set_numthreads <= master_th->th.th_teams_size.nth) { 1577 // AC: only can reduce number of threads dynamically, can't increase 1578 kmp_info_t **other_threads = parent_team->t.t_threads; 1579 // NOTE: if using distributed barrier, we need to run this code block 1580 // even when the team size appears not to have changed from the max. 
1581 int old_proc = master_th->th.th_teams_size.nth; 1582 if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == 1583 bp_dist_bar) { 1584 __kmp_resize_dist_barrier(parent_team, old_proc, 1585 master_set_numthreads); 1586 __kmp_add_threads_to_team(parent_team, master_set_numthreads); 1587 } 1588 parent_team->t.t_nproc = master_set_numthreads; 1589 for (i = 0; i < master_set_numthreads; ++i) { 1590 other_threads[i]->th.th_team_nproc = master_set_numthreads; 1591 } 1592 } 1593 // Keep extra threads hot in the team for possible next parallels 1594 master_th->th.th_set_nproc = 0; 1595 } 1596 1597 #if USE_DEBUGGER 1598 if (__kmp_debugging) { // Let debugger override number of threads. 1599 int nth = __kmp_omp_num_threads(loc); 1600 if (nth > 0) { // 0 means debugger doesn't want to change num threads 1601 master_set_numthreads = nth; 1602 } 1603 } 1604 #endif 1605 1606 // Figure out the proc_bind policy for the nested parallel within teams 1607 kmp_proc_bind_t proc_bind = master_th->th.th_set_proc_bind; 1608 // proc_bind_default means don't update 1609 kmp_proc_bind_t proc_bind_icv = proc_bind_default; 1610 if (master_th->th.th_current_task->td_icvs.proc_bind == proc_bind_false) { 1611 proc_bind = proc_bind_false; 1612 } else { 1613 // No proc_bind clause specified; use current proc-bind-var 1614 if (proc_bind == proc_bind_default) { 1615 proc_bind = master_th->th.th_current_task->td_icvs.proc_bind; 1616 } 1617 /* else: The proc_bind policy was specified explicitly on parallel 1618 clause. 1619 This overrides proc-bind-var for this parallel region, but does not 1620 change proc-bind-var. */ 1621 // Figure the value of proc-bind-var for the child threads. 1622 if ((level + 1 < __kmp_nested_proc_bind.used) && 1623 (__kmp_nested_proc_bind.bind_types[level + 1] != 1624 master_th->th.th_current_task->td_icvs.proc_bind)) { 1625 proc_bind_icv = __kmp_nested_proc_bind.bind_types[level + 1]; 1626 } 1627 } 1628 KMP_CHECK_UPDATE(parent_team->t.t_proc_bind, proc_bind); 1629 // Need to change the bind-var ICV to correct value for each implicit task 1630 if (proc_bind_icv != proc_bind_default && 1631 master_th->th.th_current_task->td_icvs.proc_bind != proc_bind_icv) { 1632 kmp_info_t **other_threads = parent_team->t.t_threads; 1633 for (i = 0; i < master_th->th.th_team_nproc; ++i) { 1634 other_threads[i]->th.th_current_task->td_icvs.proc_bind = 1635 proc_bind_icv; 1636 } 1637 } 1638 // Reset for next parallel region 1639 master_th->th.th_set_proc_bind = proc_bind_default; 1640 1641 #if USE_ITT_BUILD && USE_ITT_NOTIFY 1642 if (((__itt_frame_submit_v3_ptr && __itt_get_timestamp_ptr) || 1643 KMP_ITT_DEBUG) && 1644 __kmp_forkjoin_frames_mode == 3 && 1645 parent_team->t.t_active_level == 1 // only report frames at level 1 1646 && master_th->th.th_teams_size.nteams == 1) { 1647 kmp_uint64 tmp_time = __itt_get_timestamp(); 1648 master_th->th.th_frame_time = tmp_time; 1649 parent_team->t.t_region_time = tmp_time; 1650 } 1651 if (__itt_stack_caller_create_ptr) { 1652 KMP_DEBUG_ASSERT(parent_team->t.t_stack_id == NULL); 1653 // create new stack stitching id before entering fork barrier 1654 parent_team->t.t_stack_id = __kmp_itt_stack_caller_create(); 1655 } 1656 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */ 1657 #if KMP_AFFINITY_SUPPORTED 1658 __kmp_partition_places(parent_team); 1659 #endif 1660 1661 KF_TRACE(10, ("__kmp_fork_call: before internal fork: root=%p, team=%p, " 1662 "master_th=%p, gtid=%d\n", 1663 root, parent_team, master_th, gtid)); 1664 __kmp_internal_fork(loc, gtid, parent_team); 1665 KF_TRACE(10, 
("__kmp_fork_call: after internal fork: root=%p, team=%p, " 1666 "master_th=%p, gtid=%d\n", 1667 root, parent_team, master_th, gtid)); 1668 1669 if (call_context == fork_context_gnu) 1670 return TRUE; 1671 1672 /* Invoke microtask for PRIMARY thread */ 1673 KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid, 1674 parent_team->t.t_id, parent_team->t.t_pkfn)); 1675 1676 if (!parent_team->t.t_invoke(gtid)) { 1677 KMP_ASSERT2(0, "cannot invoke microtask for PRIMARY thread"); 1678 } 1679 KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid, 1680 parent_team->t.t_id, parent_team->t.t_pkfn)); 1681 KMP_MB(); /* Flush all pending memory write invalidates. */ 1682 1683 KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid)); 1684 1685 return TRUE; 1686 } // Parallel closely nested in teams construct 1687 1688 #if KMP_DEBUG 1689 if (__kmp_tasking_mode != tskm_immediate_exec) { 1690 KMP_DEBUG_ASSERT(master_th->th.th_task_team == 1691 parent_team->t.t_task_team[master_th->th.th_task_state]); 1692 } 1693 #endif 1694 1695 // Need this to happen before we determine the number of threads, not while 1696 // we are allocating the team 1697 //__kmp_push_current_task_to_thread(master_th, parent_team, 0); 1698 int enter_teams = 0; 1699 if (parent_team->t.t_active_level >= 1700 master_th->th.th_current_task->td_icvs.max_active_levels) { 1701 nthreads = 1; 1702 } else { 1703 enter_teams = ((ap == NULL && active_level == 0) || 1704 (ap && teams_level > 0 && teams_level == level)); 1705 nthreads = master_set_numthreads 1706 ? master_set_numthreads 1707 // TODO: get nproc directly from current task 1708 : get__nproc_2(parent_team, master_tid); 1709 // Check if we need to take forkjoin lock? (no need for serialized 1710 // parallel out of teams construct). This code moved here from 1711 // __kmp_reserve_threads() to speedup nested serialized parallels. 1712 if (nthreads > 1) { 1713 if ((get__max_active_levels(master_th) == 1 && 1714 (root->r.r_in_parallel && !enter_teams)) || 1715 (__kmp_library == library_serial)) { 1716 KC_TRACE(10, ("__kmp_fork_call: T#%d serializing team; requested %d" 1717 " threads\n", 1718 gtid, nthreads)); 1719 nthreads = 1; 1720 } 1721 } 1722 if (nthreads > 1) { 1723 /* determine how many new threads we can use */ 1724 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock); 1725 /* AC: If we execute teams from parallel region (on host), then teams 1726 should be created but each can only have 1 thread if nesting is 1727 disabled. If teams called from serial region, then teams and their 1728 threads should be created regardless of the nesting setting. */ 1729 nthreads = __kmp_reserve_threads(root, parent_team, master_tid, 1730 nthreads, enter_teams); 1731 if (nthreads == 1) { 1732 // Free lock for single thread execution here; for multi-thread 1733 // execution it will be freed later after team of threads created 1734 // and initialized 1735 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); 1736 } 1737 } 1738 } 1739 KMP_DEBUG_ASSERT(nthreads > 0); 1740 1741 // If we temporarily changed the set number of threads then restore it now 1742 master_th->th.th_set_nproc = 0; 1743 1744 /* create a serialized parallel region? */ 1745 if (nthreads == 1) { 1746 /* josh todo: hypothetical question: what do we do for OS X*? 
*/ 1747 #if KMP_OS_LINUX && \ 1748 (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) 1749 void *args[argc]; 1750 #else 1751 void **args = (void **)KMP_ALLOCA(argc * sizeof(void *)); 1752 #endif /* KMP_OS_LINUX && ( KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || \ 1753 KMP_ARCH_AARCH64) */ 1754 1755 KA_TRACE(20, 1756 ("__kmp_fork_call: T#%d serializing parallel region\n", gtid)); 1757 1758 __kmpc_serialized_parallel(loc, gtid); 1759 1760 #if OMPD_SUPPORT 1761 master_th->th.th_serial_team->t.t_pkfn = microtask; 1762 #endif 1763 1764 if (call_context == fork_context_intel) { 1765 /* TODO this sucks, use the compiler itself to pass args! :) */ 1766 master_th->th.th_serial_team->t.t_ident = loc; 1767 if (!ap) { 1768 // revert change made in __kmpc_serialized_parallel() 1769 master_th->th.th_serial_team->t.t_level--; 1770 // Get args from parent team for teams construct 1771 1772 #if OMPT_SUPPORT 1773 void *dummy; 1774 void **exit_frame_p; 1775 ompt_task_info_t *task_info; 1776 1777 ompt_lw_taskteam_t lw_taskteam; 1778 1779 if (ompt_enabled.enabled) { 1780 __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid, 1781 &ompt_parallel_data, return_address); 1782 1783 __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0); 1784 // don't use lw_taskteam after linking. content was swaped 1785 1786 task_info = OMPT_CUR_TASK_INFO(master_th); 1787 exit_frame_p = &(task_info->frame.exit_frame.ptr); 1788 if (ompt_enabled.ompt_callback_implicit_task) { 1789 OMPT_CUR_TASK_INFO(master_th)->thread_num = 1790 __kmp_tid_from_gtid(gtid); 1791 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 1792 ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th), 1793 &(task_info->task_data), 1, 1794 OMPT_CUR_TASK_INFO(master_th)->thread_num, 1795 ompt_task_implicit); 1796 } 1797 1798 /* OMPT state */ 1799 master_th->th.ompt_thread_info.state = ompt_state_work_parallel; 1800 } else { 1801 exit_frame_p = &dummy; 1802 } 1803 #endif 1804 1805 { 1806 KMP_TIME_PARTITIONED_BLOCK(OMP_parallel); 1807 KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK); 1808 __kmp_invoke_microtask(microtask, gtid, 0, argc, 1809 parent_team->t.t_argv 1810 #if OMPT_SUPPORT 1811 , 1812 exit_frame_p 1813 #endif 1814 ); 1815 } 1816 1817 #if OMPT_SUPPORT 1818 if (ompt_enabled.enabled) { 1819 *exit_frame_p = NULL; 1820 if (ompt_enabled.ompt_callback_implicit_task) { 1821 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 1822 ompt_scope_end, NULL, &(task_info->task_data), 1, 1823 OMPT_CUR_TASK_INFO(master_th)->thread_num, 1824 ompt_task_implicit); 1825 } 1826 ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th); 1827 __ompt_lw_taskteam_unlink(master_th); 1828 if (ompt_enabled.ompt_callback_parallel_end) { 1829 ompt_callbacks.ompt_callback(ompt_callback_parallel_end)( 1830 &ompt_parallel_data, parent_task_data, 1831 OMPT_INVOKER(call_context) | ompt_parallel_team, 1832 return_address); 1833 } 1834 master_th->th.ompt_thread_info.state = ompt_state_overhead; 1835 } 1836 #endif 1837 } else if (microtask == (microtask_t)__kmp_teams_master) { 1838 KMP_DEBUG_ASSERT(master_th->th.th_team == 1839 master_th->th.th_serial_team); 1840 team = master_th->th.th_team; 1841 // team->t.t_pkfn = microtask; 1842 team->t.t_invoke = invoker; 1843 __kmp_alloc_argv_entries(argc, team, TRUE); 1844 team->t.t_argc = argc; 1845 argv = (void **)team->t.t_argv; 1846 if (ap) { 1847 for (i = argc - 1; i >= 0; --i) 1848 *argv++ = va_arg(kmp_va_deref(ap), void *); 1849 } else { 1850 for (i = 0; i < argc; ++i) 1851 // Get args from parent team for teams construct 1852 
argv[i] = parent_team->t.t_argv[i]; 1853 } 1854 // AC: revert change made in __kmpc_serialized_parallel() 1855 // because initial code in teams should have level=0 1856 team->t.t_level--; 1857 // AC: call special invoker for outer "parallel" of teams construct 1858 invoker(gtid); 1859 #if OMPT_SUPPORT 1860 if (ompt_enabled.enabled) { 1861 ompt_task_info_t *task_info = OMPT_CUR_TASK_INFO(master_th); 1862 if (ompt_enabled.ompt_callback_implicit_task) { 1863 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 1864 ompt_scope_end, NULL, &(task_info->task_data), 0, 1865 OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_initial); 1866 } 1867 if (ompt_enabled.ompt_callback_parallel_end) { 1868 ompt_callbacks.ompt_callback(ompt_callback_parallel_end)( 1869 &ompt_parallel_data, parent_task_data, 1870 OMPT_INVOKER(call_context) | ompt_parallel_league, 1871 return_address); 1872 } 1873 master_th->th.ompt_thread_info.state = ompt_state_overhead; 1874 } 1875 #endif 1876 } else { 1877 argv = args; 1878 for (i = argc - 1; i >= 0; --i) 1879 *argv++ = va_arg(kmp_va_deref(ap), void *); 1880 KMP_MB(); 1881 1882 #if OMPT_SUPPORT 1883 void *dummy; 1884 void **exit_frame_p; 1885 ompt_task_info_t *task_info; 1886 1887 ompt_lw_taskteam_t lw_taskteam; 1888 1889 if (ompt_enabled.enabled) { 1890 __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid, 1891 &ompt_parallel_data, return_address); 1892 __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0); 1893 // don't use lw_taskteam after linking. content was swaped 1894 task_info = OMPT_CUR_TASK_INFO(master_th); 1895 exit_frame_p = &(task_info->frame.exit_frame.ptr); 1896 1897 /* OMPT implicit task begin */ 1898 implicit_task_data = OMPT_CUR_TASK_DATA(master_th); 1899 if (ompt_enabled.ompt_callback_implicit_task) { 1900 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 1901 ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th), 1902 implicit_task_data, 1, __kmp_tid_from_gtid(gtid), 1903 ompt_task_implicit); 1904 OMPT_CUR_TASK_INFO(master_th)->thread_num = 1905 __kmp_tid_from_gtid(gtid); 1906 } 1907 1908 /* OMPT state */ 1909 master_th->th.ompt_thread_info.state = ompt_state_work_parallel; 1910 } else { 1911 exit_frame_p = &dummy; 1912 } 1913 #endif 1914 1915 { 1916 KMP_TIME_PARTITIONED_BLOCK(OMP_parallel); 1917 KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK); 1918 __kmp_invoke_microtask(microtask, gtid, 0, argc, args 1919 #if OMPT_SUPPORT 1920 , 1921 exit_frame_p 1922 #endif 1923 ); 1924 } 1925 1926 #if OMPT_SUPPORT 1927 if (ompt_enabled.enabled) { 1928 *exit_frame_p = NULL; 1929 if (ompt_enabled.ompt_callback_implicit_task) { 1930 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 1931 ompt_scope_end, NULL, &(task_info->task_data), 1, 1932 OMPT_CUR_TASK_INFO(master_th)->thread_num, 1933 ompt_task_implicit); 1934 } 1935 1936 ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th); 1937 __ompt_lw_taskteam_unlink(master_th); 1938 if (ompt_enabled.ompt_callback_parallel_end) { 1939 ompt_callbacks.ompt_callback(ompt_callback_parallel_end)( 1940 &ompt_parallel_data, parent_task_data, 1941 OMPT_INVOKER(call_context) | ompt_parallel_team, 1942 return_address); 1943 } 1944 master_th->th.ompt_thread_info.state = ompt_state_overhead; 1945 } 1946 #endif 1947 } 1948 } else if (call_context == fork_context_gnu) { 1949 #if OMPT_SUPPORT 1950 ompt_lw_taskteam_t lwt; 1951 __ompt_lw_taskteam_init(&lwt, master_th, gtid, &ompt_parallel_data, 1952 return_address); 1953 1954 lwt.ompt_task_info.frame.exit_frame = ompt_data_none; 1955 __ompt_lw_taskteam_link(&lwt, master_th, 1); 
        // don't use lw_taskteam after linking. content was swapped
#endif

        // we were called from GNU native code
        KA_TRACE(20, ("__kmp_fork_call: T#%d serial exit\n", gtid));
        return FALSE;
      } else {
        KMP_ASSERT2(call_context < fork_context_last,
                    "__kmp_fork_call: unknown fork_context parameter");
      }

      KA_TRACE(20, ("__kmp_fork_call: T#%d serial exit\n", gtid));
      KMP_MB();
      return FALSE;
    } // if (nthreads == 1)

    // GEH: only modify the executing flag in the case when not serialized
    //      serialized case is handled in kmpc_serialized_parallel
    KF_TRACE(10, ("__kmp_fork_call: parent_team_aclevel=%d, master_th=%p, "
                  "curtask=%p, curtask_max_aclevel=%d\n",
                  parent_team->t.t_active_level, master_th,
                  master_th->th.th_current_task,
                  master_th->th.th_current_task->td_icvs.max_active_levels));
    // TODO: GEH - cannot do this assertion because root thread not set up as
    //       executing
    // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 1 );
    master_th->th.th_current_task->td_flags.executing = 0;

    if (!master_th->th.th_teams_microtask || level > teams_level) {
      /* Increment our nested depth level */
      KMP_ATOMIC_INC(&root->r.r_in_parallel);
    }

    // See if we need to make a copy of the ICVs.
    int nthreads_icv = master_th->th.th_current_task->td_icvs.nproc;
    if ((level + 1 < __kmp_nested_nth.used) &&
        (__kmp_nested_nth.nth[level + 1] != nthreads_icv)) {
      nthreads_icv = __kmp_nested_nth.nth[level + 1];
    } else {
      nthreads_icv = 0; // don't update
    }

    // Figure out the proc_bind policy for the new team.
    kmp_proc_bind_t proc_bind = master_th->th.th_set_proc_bind;
    // proc_bind_default means don't update
    kmp_proc_bind_t proc_bind_icv = proc_bind_default;
    if (master_th->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
      proc_bind = proc_bind_false;
    } else {
      // No proc_bind clause specified; use current proc-bind-var for this
      // parallel region
      if (proc_bind == proc_bind_default) {
        proc_bind = master_th->th.th_current_task->td_icvs.proc_bind;
      }
      // Have teams construct take proc_bind value from KMP_TEAMS_PROC_BIND
      if (master_th->th.th_teams_microtask &&
          microtask == (microtask_t)__kmp_teams_master) {
        proc_bind = __kmp_teams_proc_bind;
      }
      /* else: The proc_bind policy was specified explicitly on parallel clause.
         This overrides proc-bind-var for this parallel region, but does not
         change proc-bind-var. */
      // Figure out the value of proc-bind-var for the child threads.
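      // For example (illustrative): with OMP_PROC_BIND=spread,close the list
      // __kmp_nested_proc_bind.bind_types is {spread, close}; the outer
      // parallel region is forked with proc_bind == spread, and the check
      // below installs "close" as the implicit tasks' own proc-bind-var so a
      // nested parallel binds its threads close to their parents.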
2019 if ((level + 1 < __kmp_nested_proc_bind.used) && 2020 (__kmp_nested_proc_bind.bind_types[level + 1] != 2021 master_th->th.th_current_task->td_icvs.proc_bind)) { 2022 // Do not modify the proc bind icv for the two teams construct forks 2023 // They just let the proc bind icv pass through 2024 if (!master_th->th.th_teams_microtask || 2025 !(microtask == (microtask_t)__kmp_teams_master || ap == NULL)) 2026 proc_bind_icv = __kmp_nested_proc_bind.bind_types[level + 1]; 2027 } 2028 } 2029 2030 // Reset for next parallel region 2031 master_th->th.th_set_proc_bind = proc_bind_default; 2032 2033 if ((nthreads_icv > 0) || (proc_bind_icv != proc_bind_default)) { 2034 kmp_internal_control_t new_icvs; 2035 copy_icvs(&new_icvs, &master_th->th.th_current_task->td_icvs); 2036 new_icvs.next = NULL; 2037 if (nthreads_icv > 0) { 2038 new_icvs.nproc = nthreads_icv; 2039 } 2040 if (proc_bind_icv != proc_bind_default) { 2041 new_icvs.proc_bind = proc_bind_icv; 2042 } 2043 2044 /* allocate a new parallel team */ 2045 KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n")); 2046 team = __kmp_allocate_team(root, nthreads, nthreads, 2047 #if OMPT_SUPPORT 2048 ompt_parallel_data, 2049 #endif 2050 proc_bind, &new_icvs, 2051 argc USE_NESTED_HOT_ARG(master_th)); 2052 if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) 2053 copy_icvs((kmp_internal_control_t *)team->t.b->team_icvs, &new_icvs); 2054 } else { 2055 /* allocate a new parallel team */ 2056 KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n")); 2057 team = __kmp_allocate_team(root, nthreads, nthreads, 2058 #if OMPT_SUPPORT 2059 ompt_parallel_data, 2060 #endif 2061 proc_bind, 2062 &master_th->th.th_current_task->td_icvs, 2063 argc USE_NESTED_HOT_ARG(master_th)); 2064 if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) 2065 copy_icvs((kmp_internal_control_t *)team->t.b->team_icvs, 2066 &master_th->th.th_current_task->td_icvs); 2067 } 2068 KF_TRACE( 2069 10, ("__kmp_fork_call: after __kmp_allocate_team - team = %p\n", team)); 2070 2071 /* setup the new team */ 2072 KMP_CHECK_UPDATE(team->t.t_master_tid, master_tid); 2073 KMP_CHECK_UPDATE(team->t.t_master_this_cons, master_this_cons); 2074 KMP_CHECK_UPDATE(team->t.t_ident, loc); 2075 KMP_CHECK_UPDATE(team->t.t_parent, parent_team); 2076 KMP_CHECK_UPDATE_SYNC(team->t.t_pkfn, microtask); 2077 #if OMPT_SUPPORT 2078 KMP_CHECK_UPDATE_SYNC(team->t.ompt_team_info.master_return_address, 2079 return_address); 2080 #endif 2081 KMP_CHECK_UPDATE(team->t.t_invoke, invoker); // TODO move to root, maybe 2082 // TODO: parent_team->t.t_level == INT_MAX ??? 
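    // Roughly: t_level counts every enclosing parallel region, including
    // serialized ones, while t_active_level counts only regions that actually
    // forked more than one thread; the else branch below leaves both values
    // unchanged at the start of a teams construct (see the AC note there).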
2083 if (!master_th->th.th_teams_microtask || level > teams_level) { 2084 int new_level = parent_team->t.t_level + 1; 2085 KMP_CHECK_UPDATE(team->t.t_level, new_level); 2086 new_level = parent_team->t.t_active_level + 1; 2087 KMP_CHECK_UPDATE(team->t.t_active_level, new_level); 2088 } else { 2089 // AC: Do not increase parallel level at start of the teams construct 2090 int new_level = parent_team->t.t_level; 2091 KMP_CHECK_UPDATE(team->t.t_level, new_level); 2092 new_level = parent_team->t.t_active_level; 2093 KMP_CHECK_UPDATE(team->t.t_active_level, new_level); 2094 } 2095 kmp_r_sched_t new_sched = get__sched_2(parent_team, master_tid); 2096 // set primary thread's schedule as new run-time schedule 2097 KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched); 2098 2099 KMP_CHECK_UPDATE(team->t.t_cancel_request, cancel_noreq); 2100 KMP_CHECK_UPDATE(team->t.t_def_allocator, master_th->th.th_def_allocator); 2101 2102 // Update the floating point rounding in the team if required. 2103 propagateFPControl(team); 2104 #if OMPD_SUPPORT 2105 if (ompd_state & OMPD_ENABLE_BP) 2106 ompd_bp_parallel_begin(); 2107 #endif 2108 2109 if (__kmp_tasking_mode != tskm_immediate_exec) { 2110 // Set primary thread's task team to team's task team. Unless this is hot 2111 // team, it should be NULL. 2112 KMP_DEBUG_ASSERT(master_th->th.th_task_team == 2113 parent_team->t.t_task_team[master_th->th.th_task_state]); 2114 KA_TRACE(20, ("__kmp_fork_call: Primary T#%d pushing task_team %p / team " 2115 "%p, new task_team %p / team %p\n", 2116 __kmp_gtid_from_thread(master_th), 2117 master_th->th.th_task_team, parent_team, 2118 team->t.t_task_team[master_th->th.th_task_state], team)); 2119 2120 if (active_level || master_th->th.th_task_team) { 2121 // Take a memo of primary thread's task_state 2122 KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack); 2123 if (master_th->th.th_task_state_top >= 2124 master_th->th.th_task_state_stack_sz) { // increase size 2125 kmp_uint32 new_size = 2 * master_th->th.th_task_state_stack_sz; 2126 kmp_uint8 *old_stack, *new_stack; 2127 kmp_uint32 i; 2128 new_stack = (kmp_uint8 *)__kmp_allocate(new_size); 2129 for (i = 0; i < master_th->th.th_task_state_stack_sz; ++i) { 2130 new_stack[i] = master_th->th.th_task_state_memo_stack[i]; 2131 } 2132 for (i = master_th->th.th_task_state_stack_sz; i < new_size; 2133 ++i) { // zero-init rest of stack 2134 new_stack[i] = 0; 2135 } 2136 old_stack = master_th->th.th_task_state_memo_stack; 2137 master_th->th.th_task_state_memo_stack = new_stack; 2138 master_th->th.th_task_state_stack_sz = new_size; 2139 __kmp_free(old_stack); 2140 } 2141 // Store primary thread's task_state on stack 2142 master_th->th 2143 .th_task_state_memo_stack[master_th->th.th_task_state_top] = 2144 master_th->th.th_task_state; 2145 master_th->th.th_task_state_top++; 2146 #if KMP_NESTED_HOT_TEAMS 2147 if (master_th->th.th_hot_teams && 2148 active_level < __kmp_hot_teams_max_level && 2149 team == master_th->th.th_hot_teams[active_level].hot_team) { 2150 // Restore primary thread's nested state if nested hot team 2151 master_th->th.th_task_state = 2152 master_th->th 2153 .th_task_state_memo_stack[master_th->th.th_task_state_top]; 2154 } else { 2155 #endif 2156 master_th->th.th_task_state = 0; 2157 #if KMP_NESTED_HOT_TEAMS 2158 } 2159 #endif 2160 } 2161 #if !KMP_NESTED_HOT_TEAMS 2162 KMP_DEBUG_ASSERT((master_th->th.th_task_team == NULL) || 2163 (team == root->r.r_hot_team)); 2164 #endif 2165 } 2166 2167 KA_TRACE( 2168 20, 2169 ("__kmp_fork_call: T#%d(%d:%d)->(%d:0) created a team 
of %d threads\n", 2170 gtid, parent_team->t.t_id, team->t.t_master_tid, team->t.t_id, 2171 team->t.t_nproc)); 2172 KMP_DEBUG_ASSERT(team != root->r.r_hot_team || 2173 (team->t.t_master_tid == 0 && 2174 (team->t.t_parent == root->r.r_root_team || 2175 team->t.t_parent->t.t_serialized))); 2176 KMP_MB(); 2177 2178 /* now, setup the arguments */ 2179 argv = (void **)team->t.t_argv; 2180 if (ap) { 2181 for (i = argc - 1; i >= 0; --i) { 2182 void *new_argv = va_arg(kmp_va_deref(ap), void *); 2183 KMP_CHECK_UPDATE(*argv, new_argv); 2184 argv++; 2185 } 2186 } else { 2187 for (i = 0; i < argc; ++i) { 2188 // Get args from parent team for teams construct 2189 KMP_CHECK_UPDATE(argv[i], team->t.t_parent->t.t_argv[i]); 2190 } 2191 } 2192 2193 /* now actually fork the threads */ 2194 KMP_CHECK_UPDATE(team->t.t_master_active, master_active); 2195 if (!root->r.r_active) // Only do assignment if it prevents cache ping-pong 2196 root->r.r_active = TRUE; 2197 2198 __kmp_fork_team_threads(root, team, master_th, gtid, !ap); 2199 __kmp_setup_icv_copy(team, nthreads, 2200 &master_th->th.th_current_task->td_icvs, loc); 2201 2202 #if OMPT_SUPPORT 2203 master_th->th.ompt_thread_info.state = ompt_state_work_parallel; 2204 #endif 2205 2206 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); 2207 2208 #if USE_ITT_BUILD 2209 if (team->t.t_active_level == 1 // only report frames at level 1 2210 && !master_th->th.th_teams_microtask) { // not in teams construct 2211 #if USE_ITT_NOTIFY 2212 if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) && 2213 (__kmp_forkjoin_frames_mode == 3 || 2214 __kmp_forkjoin_frames_mode == 1)) { 2215 kmp_uint64 tmp_time = 0; 2216 if (__itt_get_timestamp_ptr) 2217 tmp_time = __itt_get_timestamp(); 2218 // Internal fork - report frame begin 2219 master_th->th.th_frame_time = tmp_time; 2220 if (__kmp_forkjoin_frames_mode == 3) 2221 team->t.t_region_time = tmp_time; 2222 } else 2223 // only one notification scheme (either "submit" or "forking/joined", not both) 2224 #endif /* USE_ITT_NOTIFY */ 2225 if ((__itt_frame_begin_v3_ptr || KMP_ITT_DEBUG) && 2226 __kmp_forkjoin_frames && !__kmp_forkjoin_frames_mode) { 2227 // Mark start of "parallel" region for Intel(R) VTune(TM) analyzer. 
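        // (Only one scheme is active per run, as noted above: when
        // __kmp_forkjoin_frames_mode is 1 or 3 -- KMP_FORKJOIN_FRAMES_MODE --
        // the timestamps captured above are reported later through
        // __kmp_itt_frame_submit (mode 3 does so in __kmp_join_call), whereas
        // mode 0 with frames enabled uses the plain region forking/joined
        // notifications issued here.)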
2228 __kmp_itt_region_forking(gtid, team->t.t_nproc, 0); 2229 } 2230 } 2231 #endif /* USE_ITT_BUILD */ 2232 2233 /* now go on and do the work */ 2234 KMP_DEBUG_ASSERT(team == __kmp_threads[gtid]->th.th_team); 2235 KMP_MB(); 2236 KF_TRACE(10, 2237 ("__kmp_internal_fork : root=%p, team=%p, master_th=%p, gtid=%d\n", 2238 root, team, master_th, gtid)); 2239 2240 #if USE_ITT_BUILD 2241 if (__itt_stack_caller_create_ptr) { 2242 // create new stack stitching id before entering fork barrier 2243 if (!enter_teams) { 2244 KMP_DEBUG_ASSERT(team->t.t_stack_id == NULL); 2245 team->t.t_stack_id = __kmp_itt_stack_caller_create(); 2246 } else if (parent_team->t.t_serialized) { 2247 // keep stack stitching id in the serialized parent_team; 2248 // current team will be used for parallel inside the teams; 2249 // if parent_team is active, then it already keeps stack stitching id 2250 // for the league of teams 2251 KMP_DEBUG_ASSERT(parent_team->t.t_stack_id == NULL); 2252 parent_team->t.t_stack_id = __kmp_itt_stack_caller_create(); 2253 } 2254 } 2255 #endif /* USE_ITT_BUILD */ 2256 2257 // AC: skip __kmp_internal_fork at teams construct, let only primary 2258 // threads execute 2259 if (ap) { 2260 __kmp_internal_fork(loc, gtid, team); 2261 KF_TRACE(10, ("__kmp_internal_fork : after : root=%p, team=%p, " 2262 "master_th=%p, gtid=%d\n", 2263 root, team, master_th, gtid)); 2264 } 2265 2266 if (call_context == fork_context_gnu) { 2267 KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid)); 2268 return TRUE; 2269 } 2270 2271 /* Invoke microtask for PRIMARY thread */ 2272 KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid, 2273 team->t.t_id, team->t.t_pkfn)); 2274 } // END of timer KMP_fork_call block 2275 2276 #if KMP_STATS_ENABLED 2277 // If beginning a teams construct, then change thread state 2278 stats_state_e previous_state = KMP_GET_THREAD_STATE(); 2279 if (!ap) { 2280 KMP_SET_THREAD_STATE(stats_state_e::TEAMS_REGION); 2281 } 2282 #endif 2283 2284 if (!team->t.t_invoke(gtid)) { 2285 KMP_ASSERT2(0, "cannot invoke microtask for PRIMARY thread"); 2286 } 2287 2288 #if KMP_STATS_ENABLED 2289 // If was beginning of a teams construct, then reset thread state 2290 if (!ap) { 2291 KMP_SET_THREAD_STATE(previous_state); 2292 } 2293 #endif 2294 2295 KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid, 2296 team->t.t_id, team->t.t_pkfn)); 2297 KMP_MB(); /* Flush all pending memory write invalidates. */ 2298 2299 KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid)); 2300 #if OMPT_SUPPORT 2301 if (ompt_enabled.enabled) { 2302 master_th->th.ompt_thread_info.state = ompt_state_overhead; 2303 } 2304 #endif 2305 2306 return TRUE; 2307 } 2308 2309 #if OMPT_SUPPORT 2310 static inline void __kmp_join_restore_state(kmp_info_t *thread, 2311 kmp_team_t *team) { 2312 // restore state outside the region 2313 thread->th.ompt_thread_info.state = 2314 ((team->t.t_serialized) ? 
ompt_state_work_serial 2315 : ompt_state_work_parallel); 2316 } 2317 2318 static inline void __kmp_join_ompt(int gtid, kmp_info_t *thread, 2319 kmp_team_t *team, ompt_data_t *parallel_data, 2320 int flags, void *codeptr) { 2321 ompt_task_info_t *task_info = __ompt_get_task_info_object(0); 2322 if (ompt_enabled.ompt_callback_parallel_end) { 2323 ompt_callbacks.ompt_callback(ompt_callback_parallel_end)( 2324 parallel_data, &(task_info->task_data), flags, codeptr); 2325 } 2326 2327 task_info->frame.enter_frame = ompt_data_none; 2328 __kmp_join_restore_state(thread, team); 2329 } 2330 #endif 2331 2332 void __kmp_join_call(ident_t *loc, int gtid 2333 #if OMPT_SUPPORT 2334 , 2335 enum fork_context_e fork_context 2336 #endif 2337 , 2338 int exit_teams) { 2339 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_join_call); 2340 kmp_team_t *team; 2341 kmp_team_t *parent_team; 2342 kmp_info_t *master_th; 2343 kmp_root_t *root; 2344 int master_active; 2345 2346 KA_TRACE(20, ("__kmp_join_call: enter T#%d\n", gtid)); 2347 2348 /* setup current data */ 2349 master_th = __kmp_threads[gtid]; 2350 root = master_th->th.th_root; 2351 team = master_th->th.th_team; 2352 parent_team = team->t.t_parent; 2353 2354 master_th->th.th_ident = loc; 2355 2356 #if OMPT_SUPPORT 2357 void *team_microtask = (void *)team->t.t_pkfn; 2358 // For GOMP interface with serialized parallel, need the 2359 // __kmpc_end_serialized_parallel to call hooks for OMPT end-implicit-task 2360 // and end-parallel events. 2361 if (ompt_enabled.enabled && 2362 !(team->t.t_serialized && fork_context == fork_context_gnu)) { 2363 master_th->th.ompt_thread_info.state = ompt_state_overhead; 2364 } 2365 #endif 2366 2367 #if KMP_DEBUG 2368 if (__kmp_tasking_mode != tskm_immediate_exec && !exit_teams) { 2369 KA_TRACE(20, ("__kmp_join_call: T#%d, old team = %p old task_team = %p, " 2370 "th_task_team = %p\n", 2371 __kmp_gtid_from_thread(master_th), team, 2372 team->t.t_task_team[master_th->th.th_task_state], 2373 master_th->th.th_task_team)); 2374 KMP_DEBUG_ASSERT(master_th->th.th_task_team == 2375 team->t.t_task_team[master_th->th.th_task_state]); 2376 } 2377 #endif 2378 2379 if (team->t.t_serialized) { 2380 if (master_th->th.th_teams_microtask) { 2381 // We are in teams construct 2382 int level = team->t.t_level; 2383 int tlevel = master_th->th.th_teams_level; 2384 if (level == tlevel) { 2385 // AC: we haven't incremented it earlier at start of teams construct, 2386 // so do it here - at the end of teams construct 2387 team->t.t_level++; 2388 } else if (level == tlevel + 1) { 2389 // AC: we are exiting parallel inside teams, need to increment 2390 // serialization in order to restore it in the next call to 2391 // __kmpc_end_serialized_parallel 2392 team->t.t_serialized++; 2393 } 2394 } 2395 __kmpc_end_serialized_parallel(loc, gtid); 2396 2397 #if OMPT_SUPPORT 2398 if (ompt_enabled.enabled) { 2399 __kmp_join_restore_state(master_th, parent_team); 2400 } 2401 #endif 2402 2403 return; 2404 } 2405 2406 master_active = team->t.t_master_active; 2407 2408 if (!exit_teams) { 2409 // AC: No barrier for internal teams at exit from teams construct. 2410 // But there is barrier for external team (league). 
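    // __kmp_internal_join waits at the join barrier until all workers of
    // `team` have finished the microtask; the ITT stack-stitching id created
    // at fork can only be destroyed after that barrier (below).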
2411 __kmp_internal_join(loc, gtid, team); 2412 #if USE_ITT_BUILD 2413 if (__itt_stack_caller_create_ptr) { 2414 KMP_DEBUG_ASSERT(team->t.t_stack_id != NULL); 2415 // destroy the stack stitching id after join barrier 2416 __kmp_itt_stack_caller_destroy((__itt_caller)team->t.t_stack_id); 2417 team->t.t_stack_id = NULL; 2418 } 2419 #endif 2420 } else { 2421 master_th->th.th_task_state = 2422 0; // AC: no tasking in teams (out of any parallel) 2423 #if USE_ITT_BUILD 2424 if (__itt_stack_caller_create_ptr && parent_team->t.t_serialized) { 2425 KMP_DEBUG_ASSERT(parent_team->t.t_stack_id != NULL); 2426 // destroy the stack stitching id on exit from the teams construct 2427 // if parent_team is active, then the id will be destroyed later on 2428 // by master of the league of teams 2429 __kmp_itt_stack_caller_destroy((__itt_caller)parent_team->t.t_stack_id); 2430 parent_team->t.t_stack_id = NULL; 2431 } 2432 #endif 2433 2434 if (team->t.t_nproc > 1 && 2435 __kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) { 2436 team->t.b->update_num_threads(team->t.t_nproc); 2437 __kmp_add_threads_to_team(team, team->t.t_nproc); 2438 } 2439 } 2440 2441 KMP_MB(); 2442 2443 #if OMPT_SUPPORT 2444 ompt_data_t *parallel_data = &(team->t.ompt_team_info.parallel_data); 2445 void *codeptr = team->t.ompt_team_info.master_return_address; 2446 #endif 2447 2448 #if USE_ITT_BUILD 2449 // Mark end of "parallel" region for Intel(R) VTune(TM) analyzer. 2450 if (team->t.t_active_level == 1 && 2451 (!master_th->th.th_teams_microtask || /* not in teams construct */ 2452 master_th->th.th_teams_size.nteams == 1)) { 2453 master_th->th.th_ident = loc; 2454 // only one notification scheme (either "submit" or "forking/joined", not 2455 // both) 2456 if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) && 2457 __kmp_forkjoin_frames_mode == 3) 2458 __kmp_itt_frame_submit(gtid, team->t.t_region_time, 2459 master_th->th.th_frame_time, 0, loc, 2460 master_th->th.th_team_nproc, 1); 2461 else if ((__itt_frame_end_v3_ptr || KMP_ITT_DEBUG) && 2462 !__kmp_forkjoin_frames_mode && __kmp_forkjoin_frames) 2463 __kmp_itt_region_joined(gtid); 2464 } // active_level == 1 2465 #endif /* USE_ITT_BUILD */ 2466 2467 #if KMP_AFFINITY_SUPPORTED 2468 if (!exit_teams) { 2469 // Restore master thread's partition. 
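    // Roughly: team->t.t_first_place / t_last_place hold the place partition
    // recorded for this team at fork time (__kmp_partition_places), so copying
    // them back undoes any narrowing done for the inner region. On a teams
    // exit the league keeps its own partition, hence the !exit_teams guard.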
2470 master_th->th.th_first_place = team->t.t_first_place; 2471 master_th->th.th_last_place = team->t.t_last_place; 2472 } 2473 #endif // KMP_AFFINITY_SUPPORTED 2474 2475 if (master_th->th.th_teams_microtask && !exit_teams && 2476 team->t.t_pkfn != (microtask_t)__kmp_teams_master && 2477 team->t.t_level == master_th->th.th_teams_level + 1) { 2478 // AC: We need to leave the team structure intact at the end of parallel 2479 // inside the teams construct, so that at the next parallel same (hot) team 2480 // works, only adjust nesting levels 2481 #if OMPT_SUPPORT 2482 ompt_data_t ompt_parallel_data = ompt_data_none; 2483 if (ompt_enabled.enabled) { 2484 ompt_task_info_t *task_info = __ompt_get_task_info_object(0); 2485 if (ompt_enabled.ompt_callback_implicit_task) { 2486 int ompt_team_size = team->t.t_nproc; 2487 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 2488 ompt_scope_end, NULL, &(task_info->task_data), ompt_team_size, 2489 OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit); 2490 } 2491 task_info->frame.exit_frame = ompt_data_none; 2492 task_info->task_data = ompt_data_none; 2493 ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th); 2494 __ompt_lw_taskteam_unlink(master_th); 2495 } 2496 #endif 2497 /* Decrement our nested depth level */ 2498 team->t.t_level--; 2499 team->t.t_active_level--; 2500 KMP_ATOMIC_DEC(&root->r.r_in_parallel); 2501 2502 // Restore number of threads in the team if needed. This code relies on 2503 // the proper adjustment of th_teams_size.nth after the fork in 2504 // __kmp_teams_master on each teams primary thread in the case that 2505 // __kmp_reserve_threads reduced it. 2506 if (master_th->th.th_team_nproc < master_th->th.th_teams_size.nth) { 2507 int old_num = master_th->th.th_team_nproc; 2508 int new_num = master_th->th.th_teams_size.nth; 2509 kmp_info_t **other_threads = team->t.t_threads; 2510 team->t.t_nproc = new_num; 2511 for (int i = 0; i < old_num; ++i) { 2512 other_threads[i]->th.th_team_nproc = new_num; 2513 } 2514 // Adjust states of non-used threads of the team 2515 for (int i = old_num; i < new_num; ++i) { 2516 // Re-initialize thread's barrier data. 2517 KMP_DEBUG_ASSERT(other_threads[i]); 2518 kmp_balign_t *balign = other_threads[i]->th.th_bar; 2519 for (int b = 0; b < bs_last_barrier; ++b) { 2520 balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived; 2521 KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG); 2522 #if USE_DEBUGGER 2523 balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived; 2524 #endif 2525 } 2526 if (__kmp_tasking_mode != tskm_immediate_exec) { 2527 // Synchronize thread's task state 2528 other_threads[i]->th.th_task_state = master_th->th.th_task_state; 2529 } 2530 } 2531 } 2532 2533 #if OMPT_SUPPORT 2534 if (ompt_enabled.enabled) { 2535 __kmp_join_ompt(gtid, master_th, parent_team, &ompt_parallel_data, 2536 OMPT_INVOKER(fork_context) | ompt_parallel_team, codeptr); 2537 } 2538 #endif 2539 2540 return; 2541 } 2542 2543 /* do cleanup and restore the parent team */ 2544 master_th->th.th_info.ds.ds_tid = team->t.t_master_tid; 2545 master_th->th.th_local.this_construct = team->t.t_master_this_cons; 2546 2547 master_th->th.th_dispatch = &parent_team->t.t_dispatch[team->t.t_master_tid]; 2548 2549 /* jc: The following lock has instructions with REL and ACQ semantics, 2550 separating the parallel user code called in this parallel region 2551 from the serial user code called after this function returns. 
*/ 2552 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock); 2553 2554 if (!master_th->th.th_teams_microtask || 2555 team->t.t_level > master_th->th.th_teams_level) { 2556 /* Decrement our nested depth level */ 2557 KMP_ATOMIC_DEC(&root->r.r_in_parallel); 2558 } 2559 KMP_DEBUG_ASSERT(root->r.r_in_parallel >= 0); 2560 2561 #if OMPT_SUPPORT 2562 if (ompt_enabled.enabled) { 2563 ompt_task_info_t *task_info = __ompt_get_task_info_object(0); 2564 if (ompt_enabled.ompt_callback_implicit_task) { 2565 int flags = (team_microtask == (void *)__kmp_teams_master) 2566 ? ompt_task_initial 2567 : ompt_task_implicit; 2568 int ompt_team_size = (flags == ompt_task_initial) ? 0 : team->t.t_nproc; 2569 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 2570 ompt_scope_end, NULL, &(task_info->task_data), ompt_team_size, 2571 OMPT_CUR_TASK_INFO(master_th)->thread_num, flags); 2572 } 2573 task_info->frame.exit_frame = ompt_data_none; 2574 task_info->task_data = ompt_data_none; 2575 } 2576 #endif 2577 2578 KF_TRACE(10, ("__kmp_join_call1: T#%d, this_thread=%p team=%p\n", 0, 2579 master_th, team)); 2580 __kmp_pop_current_task_from_thread(master_th); 2581 2582 master_th->th.th_def_allocator = team->t.t_def_allocator; 2583 2584 #if OMPD_SUPPORT 2585 if (ompd_state & OMPD_ENABLE_BP) 2586 ompd_bp_parallel_end(); 2587 #endif 2588 updateHWFPControl(team); 2589 2590 if (root->r.r_active != master_active) 2591 root->r.r_active = master_active; 2592 2593 __kmp_free_team(root, team USE_NESTED_HOT_ARG( 2594 master_th)); // this will free worker threads 2595 2596 /* this race was fun to find. make sure the following is in the critical 2597 region otherwise assertions may fail occasionally since the old team may be 2598 reallocated and the hierarchy appears inconsistent. it is actually safe to 2599 run and won't cause any bugs, but will cause those assertion failures. 
it's 2600 only one deref&assign so might as well put this in the critical region */ 2601 master_th->th.th_team = parent_team; 2602 master_th->th.th_team_nproc = parent_team->t.t_nproc; 2603 master_th->th.th_team_master = parent_team->t.t_threads[0]; 2604 master_th->th.th_team_serialized = parent_team->t.t_serialized; 2605 2606 /* restore serialized team, if need be */ 2607 if (parent_team->t.t_serialized && 2608 parent_team != master_th->th.th_serial_team && 2609 parent_team != root->r.r_root_team) { 2610 __kmp_free_team(root, 2611 master_th->th.th_serial_team USE_NESTED_HOT_ARG(NULL)); 2612 master_th->th.th_serial_team = parent_team; 2613 } 2614 2615 if (__kmp_tasking_mode != tskm_immediate_exec) { 2616 if (master_th->th.th_task_state_top > 2617 0) { // Restore task state from memo stack 2618 KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack); 2619 // Remember primary thread's state if we re-use this nested hot team 2620 master_th->th.th_task_state_memo_stack[master_th->th.th_task_state_top] = 2621 master_th->th.th_task_state; 2622 --master_th->th.th_task_state_top; // pop 2623 // Now restore state at this level 2624 master_th->th.th_task_state = 2625 master_th->th 2626 .th_task_state_memo_stack[master_th->th.th_task_state_top]; 2627 } 2628 // Copy the task team from the parent team to the primary thread 2629 master_th->th.th_task_team = 2630 parent_team->t.t_task_team[master_th->th.th_task_state]; 2631 KA_TRACE(20, 2632 ("__kmp_join_call: Primary T#%d restoring task_team %p, team %p\n", 2633 __kmp_gtid_from_thread(master_th), master_th->th.th_task_team, 2634 parent_team)); 2635 } 2636 2637 // TODO: GEH - cannot do this assertion because root thread not set up as 2638 // executing 2639 // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 0 ); 2640 master_th->th.th_current_task->td_flags.executing = 1; 2641 2642 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); 2643 2644 #if OMPT_SUPPORT 2645 int flags = 2646 OMPT_INVOKER(fork_context) | 2647 ((team_microtask == (void *)__kmp_teams_master) ? ompt_parallel_league 2648 : ompt_parallel_team); 2649 if (ompt_enabled.enabled) { 2650 __kmp_join_ompt(gtid, master_th, parent_team, parallel_data, flags, 2651 codeptr); 2652 } 2653 #endif 2654 2655 KMP_MB(); 2656 KA_TRACE(20, ("__kmp_join_call: exit T#%d\n", gtid)); 2657 } 2658 2659 /* Check whether we should push an internal control record onto the 2660 serial team stack. If so, do it. 
*/ 2661 void __kmp_save_internal_controls(kmp_info_t *thread) { 2662 2663 if (thread->th.th_team != thread->th.th_serial_team) { 2664 return; 2665 } 2666 if (thread->th.th_team->t.t_serialized > 1) { 2667 int push = 0; 2668 2669 if (thread->th.th_team->t.t_control_stack_top == NULL) { 2670 push = 1; 2671 } else { 2672 if (thread->th.th_team->t.t_control_stack_top->serial_nesting_level != 2673 thread->th.th_team->t.t_serialized) { 2674 push = 1; 2675 } 2676 } 2677 if (push) { /* push a record on the serial team's stack */ 2678 kmp_internal_control_t *control = 2679 (kmp_internal_control_t *)__kmp_allocate( 2680 sizeof(kmp_internal_control_t)); 2681 2682 copy_icvs(control, &thread->th.th_current_task->td_icvs); 2683 2684 control->serial_nesting_level = thread->th.th_team->t.t_serialized; 2685 2686 control->next = thread->th.th_team->t.t_control_stack_top; 2687 thread->th.th_team->t.t_control_stack_top = control; 2688 } 2689 } 2690 } 2691 2692 /* Changes set_nproc */ 2693 void __kmp_set_num_threads(int new_nth, int gtid) { 2694 kmp_info_t *thread; 2695 kmp_root_t *root; 2696 2697 KF_TRACE(10, ("__kmp_set_num_threads: new __kmp_nth = %d\n", new_nth)); 2698 KMP_DEBUG_ASSERT(__kmp_init_serial); 2699 2700 if (new_nth < 1) 2701 new_nth = 1; 2702 else if (new_nth > __kmp_max_nth) 2703 new_nth = __kmp_max_nth; 2704 2705 KMP_COUNT_VALUE(OMP_set_numthreads, new_nth); 2706 thread = __kmp_threads[gtid]; 2707 if (thread->th.th_current_task->td_icvs.nproc == new_nth) 2708 return; // nothing to do 2709 2710 __kmp_save_internal_controls(thread); 2711 2712 set__nproc(thread, new_nth); 2713 2714 // If this omp_set_num_threads() call will cause the hot team size to be 2715 // reduced (in the absence of a num_threads clause), then reduce it now, 2716 // rather than waiting for the next parallel region. 2717 root = thread->th.th_root; 2718 if (__kmp_init_parallel && (!root->r.r_active) && 2719 (root->r.r_hot_team->t.t_nproc > new_nth) 2720 #if KMP_NESTED_HOT_TEAMS 2721 && __kmp_hot_teams_max_level && !__kmp_hot_teams_mode 2722 #endif 2723 ) { 2724 kmp_team_t *hot_team = root->r.r_hot_team; 2725 int f; 2726 2727 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock); 2728 2729 if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) { 2730 __kmp_resize_dist_barrier(hot_team, hot_team->t.t_nproc, new_nth); 2731 } 2732 // Release the extra threads we don't need any more. 2733 for (f = new_nth; f < hot_team->t.t_nproc; f++) { 2734 KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL); 2735 if (__kmp_tasking_mode != tskm_immediate_exec) { 2736 // When decreasing team size, threads no longer in the team should unref 2737 // task team. 2738 hot_team->t.t_threads[f]->th.th_task_team = NULL; 2739 } 2740 __kmp_free_thread(hot_team->t.t_threads[f]); 2741 hot_team->t.t_threads[f] = NULL; 2742 } 2743 hot_team->t.t_nproc = new_nth; 2744 #if KMP_NESTED_HOT_TEAMS 2745 if (thread->th.th_hot_teams) { 2746 KMP_DEBUG_ASSERT(hot_team == thread->th.th_hot_teams[0].hot_team); 2747 thread->th.th_hot_teams[0].hot_team_nth = new_nth; 2748 } 2749 #endif 2750 2751 if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) { 2752 hot_team->t.b->update_num_threads(new_nth); 2753 __kmp_add_threads_to_team(hot_team, new_nth); 2754 } 2755 2756 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); 2757 2758 // Update the t_nproc field in the threads that are still active. 
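  // The shrink is observable immediately; illustrative user code:
  //   omp_set_num_threads(2);   // hot team trimmed right here
  //   #pragma omp parallel      // next region forks only 2 threads
  // The released workers have already been returned to the thread pool above.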
    for (f = 0; f < new_nth; f++) {
      KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL);
      hot_team->t.t_threads[f]->th.th_team_nproc = new_nth;
    }
    // Special flag in case of an omp_set_num_threads() call
    hot_team->t.t_size_changed = -1;
  }
}

/* Changes max_active_levels */
void __kmp_set_max_active_levels(int gtid, int max_active_levels) {
  kmp_info_t *thread;

  KF_TRACE(10, ("__kmp_set_max_active_levels: new max_active_levels for thread "
                "%d = (%d)\n",
                gtid, max_active_levels));
  KMP_DEBUG_ASSERT(__kmp_init_serial);

  // validate max_active_levels
  if (max_active_levels < 0) {
    KMP_WARNING(ActiveLevelsNegative, max_active_levels);
    // We ignore this call if the user has specified a negative value.
    // The current setting won't be changed. The last valid setting will be
    // used. A warning will be issued (if warnings are allowed as controlled by
    // the KMP_WARNINGS env var).
    KF_TRACE(10, ("__kmp_set_max_active_levels: the call is ignored: new "
                  "max_active_levels for thread %d = (%d)\n",
                  gtid, max_active_levels));
    return;
  }
  if (max_active_levels <= KMP_MAX_ACTIVE_LEVELS_LIMIT) {
    // it's OK, the max_active_levels is within the valid range: [ 0;
    // KMP_MAX_ACTIVE_LEVELS_LIMIT ]
    // We allow a zero value. (implementation defined behavior)
  } else {
    KMP_WARNING(ActiveLevelsExceedLimit, max_active_levels,
                KMP_MAX_ACTIVE_LEVELS_LIMIT);
    max_active_levels = KMP_MAX_ACTIVE_LEVELS_LIMIT;
    // Current upper limit is MAX_INT. (implementation defined behavior)
    // If the input exceeds the upper limit, we correct the input to be the
    // upper limit. (implementation defined behavior)
    // Actually, the flow should never get here until we use MAX_INT limit.
2801 } 2802 KF_TRACE(10, ("__kmp_set_max_active_levels: after validation: new " 2803 "max_active_levels for thread %d = (%d)\n", 2804 gtid, max_active_levels)); 2805 2806 thread = __kmp_threads[gtid]; 2807 2808 __kmp_save_internal_controls(thread); 2809 2810 set__max_active_levels(thread, max_active_levels); 2811 } 2812 2813 /* Gets max_active_levels */ 2814 int __kmp_get_max_active_levels(int gtid) { 2815 kmp_info_t *thread; 2816 2817 KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d\n", gtid)); 2818 KMP_DEBUG_ASSERT(__kmp_init_serial); 2819 2820 thread = __kmp_threads[gtid]; 2821 KMP_DEBUG_ASSERT(thread->th.th_current_task); 2822 KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d, curtask=%p, " 2823 "curtask_maxaclevel=%d\n", 2824 gtid, thread->th.th_current_task, 2825 thread->th.th_current_task->td_icvs.max_active_levels)); 2826 return thread->th.th_current_task->td_icvs.max_active_levels; 2827 } 2828 2829 // nteams-var per-device ICV 2830 void __kmp_set_num_teams(int num_teams) { 2831 if (num_teams > 0) 2832 __kmp_nteams = num_teams; 2833 } 2834 int __kmp_get_max_teams(void) { return __kmp_nteams; } 2835 // teams-thread-limit-var per-device ICV 2836 void __kmp_set_teams_thread_limit(int limit) { 2837 if (limit > 0) 2838 __kmp_teams_thread_limit = limit; 2839 } 2840 int __kmp_get_teams_thread_limit(void) { return __kmp_teams_thread_limit; } 2841 2842 KMP_BUILD_ASSERT(sizeof(kmp_sched_t) == sizeof(int)); 2843 KMP_BUILD_ASSERT(sizeof(enum sched_type) == sizeof(int)); 2844 2845 /* Changes def_sched_var ICV values (run-time schedule kind and chunk) */ 2846 void __kmp_set_schedule(int gtid, kmp_sched_t kind, int chunk) { 2847 kmp_info_t *thread; 2848 kmp_sched_t orig_kind; 2849 // kmp_team_t *team; 2850 2851 KF_TRACE(10, ("__kmp_set_schedule: new schedule for thread %d = (%d, %d)\n", 2852 gtid, (int)kind, chunk)); 2853 KMP_DEBUG_ASSERT(__kmp_init_serial); 2854 2855 // Check if the kind parameter is valid, correct if needed. 2856 // Valid parameters should fit in one of two intervals - standard or extended: 2857 // <lower>, <valid>, <upper_std>, <lower_ext>, <valid>, <upper> 2858 // 2008-01-25: 0, 1 - 4, 5, 100, 101 - 102, 103 2859 orig_kind = kind; 2860 kind = __kmp_sched_without_mods(kind); 2861 2862 if (kind <= kmp_sched_lower || kind >= kmp_sched_upper || 2863 (kind <= kmp_sched_lower_ext && kind >= kmp_sched_upper_std)) { 2864 // TODO: Hint needs attention in case we change the default schedule. 2865 __kmp_msg(kmp_ms_warning, KMP_MSG(ScheduleKindOutOfRange, kind), 2866 KMP_HNT(DefaultScheduleKindUsed, "static, no chunk"), 2867 __kmp_msg_null); 2868 kind = kmp_sched_default; 2869 chunk = 0; // ignore chunk value in case of bad kind 2870 } 2871 2872 thread = __kmp_threads[gtid]; 2873 2874 __kmp_save_internal_controls(thread); 2875 2876 if (kind < kmp_sched_upper_std) { 2877 if (kind == kmp_sched_static && chunk < KMP_DEFAULT_CHUNK) { 2878 // differ static chunked vs. 
unchunked: chunk should be invalid to 2879 // indicate unchunked schedule (which is the default) 2880 thread->th.th_current_task->td_icvs.sched.r_sched_type = kmp_sch_static; 2881 } else { 2882 thread->th.th_current_task->td_icvs.sched.r_sched_type = 2883 __kmp_sch_map[kind - kmp_sched_lower - 1]; 2884 } 2885 } else { 2886 // __kmp_sch_map[ kind - kmp_sched_lower_ext + kmp_sched_upper_std - 2887 // kmp_sched_lower - 2 ]; 2888 thread->th.th_current_task->td_icvs.sched.r_sched_type = 2889 __kmp_sch_map[kind - kmp_sched_lower_ext + kmp_sched_upper_std - 2890 kmp_sched_lower - 2]; 2891 } 2892 __kmp_sched_apply_mods_intkind( 2893 orig_kind, &(thread->th.th_current_task->td_icvs.sched.r_sched_type)); 2894 if (kind == kmp_sched_auto || chunk < 1) { 2895 // ignore parameter chunk for schedule auto 2896 thread->th.th_current_task->td_icvs.sched.chunk = KMP_DEFAULT_CHUNK; 2897 } else { 2898 thread->th.th_current_task->td_icvs.sched.chunk = chunk; 2899 } 2900 } 2901 2902 /* Gets def_sched_var ICV values */ 2903 void __kmp_get_schedule(int gtid, kmp_sched_t *kind, int *chunk) { 2904 kmp_info_t *thread; 2905 enum sched_type th_type; 2906 2907 KF_TRACE(10, ("__kmp_get_schedule: thread %d\n", gtid)); 2908 KMP_DEBUG_ASSERT(__kmp_init_serial); 2909 2910 thread = __kmp_threads[gtid]; 2911 2912 th_type = thread->th.th_current_task->td_icvs.sched.r_sched_type; 2913 switch (SCHEDULE_WITHOUT_MODIFIERS(th_type)) { 2914 case kmp_sch_static: 2915 case kmp_sch_static_greedy: 2916 case kmp_sch_static_balanced: 2917 *kind = kmp_sched_static; 2918 __kmp_sched_apply_mods_stdkind(kind, th_type); 2919 *chunk = 0; // chunk was not set, try to show this fact via zero value 2920 return; 2921 case kmp_sch_static_chunked: 2922 *kind = kmp_sched_static; 2923 break; 2924 case kmp_sch_dynamic_chunked: 2925 *kind = kmp_sched_dynamic; 2926 break; 2927 case kmp_sch_guided_chunked: 2928 case kmp_sch_guided_iterative_chunked: 2929 case kmp_sch_guided_analytical_chunked: 2930 *kind = kmp_sched_guided; 2931 break; 2932 case kmp_sch_auto: 2933 *kind = kmp_sched_auto; 2934 break; 2935 case kmp_sch_trapezoidal: 2936 *kind = kmp_sched_trapezoidal; 2937 break; 2938 #if KMP_STATIC_STEAL_ENABLED 2939 case kmp_sch_static_steal: 2940 *kind = kmp_sched_static_steal; 2941 break; 2942 #endif 2943 default: 2944 KMP_FATAL(UnknownSchedulingType, th_type); 2945 } 2946 2947 __kmp_sched_apply_mods_stdkind(kind, th_type); 2948 *chunk = thread->th.th_current_task->td_icvs.sched.chunk; 2949 } 2950 2951 int __kmp_get_ancestor_thread_num(int gtid, int level) { 2952 2953 int ii, dd; 2954 kmp_team_t *team; 2955 kmp_info_t *thr; 2956 2957 KF_TRACE(10, ("__kmp_get_ancestor_thread_num: thread %d %d\n", gtid, level)); 2958 KMP_DEBUG_ASSERT(__kmp_init_serial); 2959 2960 // validate level 2961 if (level == 0) 2962 return 0; 2963 if (level < 0) 2964 return -1; 2965 thr = __kmp_threads[gtid]; 2966 team = thr->th.th_team; 2967 ii = team->t.t_level; 2968 if (level > ii) 2969 return -1; 2970 2971 if (thr->th.th_teams_microtask) { 2972 // AC: we are in teams region where multiple nested teams have same level 2973 int tlevel = thr->th.th_teams_level; // the level of the teams construct 2974 if (level <= 2975 tlevel) { // otherwise usual algorithm works (will not touch the teams) 2976 KMP_DEBUG_ASSERT(ii >= tlevel); 2977 // AC: As we need to pass by the teams league, we need to artificially 2978 // increase ii 2979 if (ii == tlevel) { 2980 ii += 2; // three teams have same level 2981 } else { 2982 ii++; // two teams have same level 2983 } 2984 } 2985 } 2986 2987 if (ii == 
level) 2988 return __kmp_tid_from_gtid(gtid); 2989 2990 dd = team->t.t_serialized; 2991 level++; 2992 while (ii > level) { 2993 for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) { 2994 } 2995 if ((team->t.t_serialized) && (!dd)) { 2996 team = team->t.t_parent; 2997 continue; 2998 } 2999 if (ii > level) { 3000 team = team->t.t_parent; 3001 dd = team->t.t_serialized; 3002 ii--; 3003 } 3004 } 3005 3006 return (dd > 1) ? (0) : (team->t.t_master_tid); 3007 } 3008 3009 int __kmp_get_team_size(int gtid, int level) { 3010 3011 int ii, dd; 3012 kmp_team_t *team; 3013 kmp_info_t *thr; 3014 3015 KF_TRACE(10, ("__kmp_get_team_size: thread %d %d\n", gtid, level)); 3016 KMP_DEBUG_ASSERT(__kmp_init_serial); 3017 3018 // validate level 3019 if (level == 0) 3020 return 1; 3021 if (level < 0) 3022 return -1; 3023 thr = __kmp_threads[gtid]; 3024 team = thr->th.th_team; 3025 ii = team->t.t_level; 3026 if (level > ii) 3027 return -1; 3028 3029 if (thr->th.th_teams_microtask) { 3030 // AC: we are in teams region where multiple nested teams have same level 3031 int tlevel = thr->th.th_teams_level; // the level of the teams construct 3032 if (level <= 3033 tlevel) { // otherwise usual algorithm works (will not touch the teams) 3034 KMP_DEBUG_ASSERT(ii >= tlevel); 3035 // AC: As we need to pass by the teams league, we need to artificially 3036 // increase ii 3037 if (ii == tlevel) { 3038 ii += 2; // three teams have same level 3039 } else { 3040 ii++; // two teams have same level 3041 } 3042 } 3043 } 3044 3045 while (ii > level) { 3046 for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) { 3047 } 3048 if (team->t.t_serialized && (!dd)) { 3049 team = team->t.t_parent; 3050 continue; 3051 } 3052 if (ii > level) { 3053 team = team->t.t_parent; 3054 ii--; 3055 } 3056 } 3057 3058 return team->t.t_nproc; 3059 } 3060 3061 kmp_r_sched_t __kmp_get_schedule_global() { 3062 // This routine created because pairs (__kmp_sched, __kmp_chunk) and 3063 // (__kmp_static, __kmp_guided) may be changed by kmp_set_defaults 3064 // independently. So one can get the updated schedule here. 3065 3066 kmp_r_sched_t r_sched; 3067 3068 // create schedule from 4 globals: __kmp_sched, __kmp_chunk, __kmp_static, 3069 // __kmp_guided. __kmp_sched should keep original value, so that user can set 3070 // KMP_SCHEDULE multiple times, and thus have different run-time schedules in 3071 // different roots (even in OMP 2.5) 3072 enum sched_type s = SCHEDULE_WITHOUT_MODIFIERS(__kmp_sched); 3073 enum sched_type sched_modifiers = SCHEDULE_GET_MODIFIERS(__kmp_sched); 3074 if (s == kmp_sch_static) { 3075 // replace STATIC with more detailed schedule (balanced or greedy) 3076 r_sched.r_sched_type = __kmp_static; 3077 } else if (s == kmp_sch_guided_chunked) { 3078 // replace GUIDED with more detailed schedule (iterative or analytical) 3079 r_sched.r_sched_type = __kmp_guided; 3080 } else { // (STATIC_CHUNKED), or (DYNAMIC_CHUNKED), or other 3081 r_sched.r_sched_type = __kmp_sched; 3082 } 3083 SCHEDULE_SET_MODIFIERS(r_sched.r_sched_type, sched_modifiers); 3084 3085 if (__kmp_chunk < KMP_DEFAULT_CHUNK) { 3086 // __kmp_chunk may be wrong here (if it was not ever set) 3087 r_sched.chunk = KMP_DEFAULT_CHUNK; 3088 } else { 3089 r_sched.chunk = __kmp_chunk; 3090 } 3091 3092 return r_sched; 3093 } 3094 3095 /* Allocate (realloc == FALSE) * or reallocate (realloc == TRUE) 3096 at least argc number of *t_argv entries for the requested team. 
*/ 3097 static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team, int realloc) { 3098 3099 KMP_DEBUG_ASSERT(team); 3100 if (!realloc || argc > team->t.t_max_argc) { 3101 3102 KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: needed entries=%d, " 3103 "current entries=%d\n", 3104 team->t.t_id, argc, (realloc) ? team->t.t_max_argc : 0)); 3105 /* if previously allocated heap space for args, free them */ 3106 if (realloc && team->t.t_argv != &team->t.t_inline_argv[0]) 3107 __kmp_free((void *)team->t.t_argv); 3108 3109 if (argc <= KMP_INLINE_ARGV_ENTRIES) { 3110 /* use unused space in the cache line for arguments */ 3111 team->t.t_max_argc = KMP_INLINE_ARGV_ENTRIES; 3112 KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: inline allocate %d " 3113 "argv entries\n", 3114 team->t.t_id, team->t.t_max_argc)); 3115 team->t.t_argv = &team->t.t_inline_argv[0]; 3116 if (__kmp_storage_map) { 3117 __kmp_print_storage_map_gtid( 3118 -1, &team->t.t_inline_argv[0], 3119 &team->t.t_inline_argv[KMP_INLINE_ARGV_ENTRIES], 3120 (sizeof(void *) * KMP_INLINE_ARGV_ENTRIES), "team_%d.t_inline_argv", 3121 team->t.t_id); 3122 } 3123 } else { 3124 /* allocate space for arguments in the heap */ 3125 team->t.t_max_argc = (argc <= (KMP_MIN_MALLOC_ARGV_ENTRIES >> 1)) 3126 ? KMP_MIN_MALLOC_ARGV_ENTRIES 3127 : 2 * argc; 3128 KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: dynamic allocate %d " 3129 "argv entries\n", 3130 team->t.t_id, team->t.t_max_argc)); 3131 team->t.t_argv = 3132 (void **)__kmp_page_allocate(sizeof(void *) * team->t.t_max_argc); 3133 if (__kmp_storage_map) { 3134 __kmp_print_storage_map_gtid(-1, &team->t.t_argv[0], 3135 &team->t.t_argv[team->t.t_max_argc], 3136 sizeof(void *) * team->t.t_max_argc, 3137 "team_%d.t_argv", team->t.t_id); 3138 } 3139 } 3140 } 3141 } 3142 3143 static void __kmp_allocate_team_arrays(kmp_team_t *team, int max_nth) { 3144 int i; 3145 int num_disp_buff = max_nth > 1 ? 
__kmp_dispatch_num_buffers : 2; 3146 team->t.t_threads = 3147 (kmp_info_t **)__kmp_allocate(sizeof(kmp_info_t *) * max_nth); 3148 team->t.t_disp_buffer = (dispatch_shared_info_t *)__kmp_allocate( 3149 sizeof(dispatch_shared_info_t) * num_disp_buff); 3150 team->t.t_dispatch = 3151 (kmp_disp_t *)__kmp_allocate(sizeof(kmp_disp_t) * max_nth); 3152 team->t.t_implicit_task_taskdata = 3153 (kmp_taskdata_t *)__kmp_allocate(sizeof(kmp_taskdata_t) * max_nth); 3154 team->t.t_max_nproc = max_nth; 3155 3156 /* setup dispatch buffers */ 3157 for (i = 0; i < num_disp_buff; ++i) { 3158 team->t.t_disp_buffer[i].buffer_index = i; 3159 team->t.t_disp_buffer[i].doacross_buf_idx = i; 3160 } 3161 } 3162 3163 static void __kmp_free_team_arrays(kmp_team_t *team) { 3164 /* Note: this does not free the threads in t_threads (__kmp_free_threads) */ 3165 int i; 3166 for (i = 0; i < team->t.t_max_nproc; ++i) { 3167 if (team->t.t_dispatch[i].th_disp_buffer != NULL) { 3168 __kmp_free(team->t.t_dispatch[i].th_disp_buffer); 3169 team->t.t_dispatch[i].th_disp_buffer = NULL; 3170 } 3171 } 3172 #if KMP_USE_HIER_SCHED 3173 __kmp_dispatch_free_hierarchies(team); 3174 #endif 3175 __kmp_free(team->t.t_threads); 3176 __kmp_free(team->t.t_disp_buffer); 3177 __kmp_free(team->t.t_dispatch); 3178 __kmp_free(team->t.t_implicit_task_taskdata); 3179 team->t.t_threads = NULL; 3180 team->t.t_disp_buffer = NULL; 3181 team->t.t_dispatch = NULL; 3182 team->t.t_implicit_task_taskdata = 0; 3183 } 3184 3185 static void __kmp_reallocate_team_arrays(kmp_team_t *team, int max_nth) { 3186 kmp_info_t **oldThreads = team->t.t_threads; 3187 3188 __kmp_free(team->t.t_disp_buffer); 3189 __kmp_free(team->t.t_dispatch); 3190 __kmp_free(team->t.t_implicit_task_taskdata); 3191 __kmp_allocate_team_arrays(team, max_nth); 3192 3193 KMP_MEMCPY(team->t.t_threads, oldThreads, 3194 team->t.t_nproc * sizeof(kmp_info_t *)); 3195 3196 __kmp_free(oldThreads); 3197 } 3198 3199 static kmp_internal_control_t __kmp_get_global_icvs(void) { 3200 3201 kmp_r_sched_t r_sched = 3202 __kmp_get_schedule_global(); // get current state of scheduling globals 3203 3204 KMP_DEBUG_ASSERT(__kmp_nested_proc_bind.used > 0); 3205 3206 kmp_internal_control_t g_icvs = { 3207 0, // int serial_nesting_level; //corresponds to value of th_team_serialized 3208 (kmp_int8)__kmp_global.g.g_dynamic, // internal control for dynamic 3209 // adjustment of threads (per thread) 3210 (kmp_int8)__kmp_env_blocktime, // int bt_set; //internal control for 3211 // whether blocktime is explicitly set 3212 __kmp_dflt_blocktime, // int blocktime; //internal control for blocktime 3213 #if KMP_USE_MONITOR 3214 __kmp_bt_intervals, // int bt_intervals; //internal control for blocktime 3215 // intervals 3216 #endif 3217 __kmp_dflt_team_nth, // int nproc; //internal control for # of threads for 3218 // next parallel region (per thread) 3219 // (use a max ub on value if __kmp_parallel_initialize not called yet) 3220 __kmp_cg_max_nth, // int thread_limit; 3221 __kmp_dflt_max_active_levels, // int max_active_levels; //internal control 3222 // for max_active_levels 3223 r_sched, // kmp_r_sched_t sched; //internal control for runtime schedule 3224 // {sched,chunk} pair 3225 __kmp_nested_proc_bind.bind_types[0], 3226 __kmp_default_device, 3227 NULL // struct kmp_internal_control *next; 3228 }; 3229 3230 return g_icvs; 3231 } 3232 3233 static kmp_internal_control_t __kmp_get_x_global_icvs(const kmp_team_t *team) { 3234 3235 kmp_internal_control_t gx_icvs; 3236 gx_icvs.serial_nesting_level = 3237 0; // probably =team->t.t_serial 
like in save_inter_controls 3238 copy_icvs(&gx_icvs, &team->t.t_threads[0]->th.th_current_task->td_icvs); 3239 gx_icvs.next = NULL; 3240 3241 return gx_icvs; 3242 } 3243 3244 static void __kmp_initialize_root(kmp_root_t *root) { 3245 int f; 3246 kmp_team_t *root_team; 3247 kmp_team_t *hot_team; 3248 int hot_team_max_nth; 3249 kmp_r_sched_t r_sched = 3250 __kmp_get_schedule_global(); // get current state of scheduling globals 3251 kmp_internal_control_t r_icvs = __kmp_get_global_icvs(); 3252 KMP_DEBUG_ASSERT(root); 3253 KMP_ASSERT(!root->r.r_begin); 3254 3255 /* setup the root state structure */ 3256 __kmp_init_lock(&root->r.r_begin_lock); 3257 root->r.r_begin = FALSE; 3258 root->r.r_active = FALSE; 3259 root->r.r_in_parallel = 0; 3260 root->r.r_blocktime = __kmp_dflt_blocktime; 3261 #if KMP_AFFINITY_SUPPORTED 3262 root->r.r_affinity_assigned = FALSE; 3263 #endif 3264 3265 /* setup the root team for this task */ 3266 /* allocate the root team structure */ 3267 KF_TRACE(10, ("__kmp_initialize_root: before root_team\n")); 3268 3269 root_team = 3270 __kmp_allocate_team(root, 3271 1, // new_nproc 3272 1, // max_nproc 3273 #if OMPT_SUPPORT 3274 ompt_data_none, // root parallel id 3275 #endif 3276 __kmp_nested_proc_bind.bind_types[0], &r_icvs, 3277 0 // argc 3278 USE_NESTED_HOT_ARG(NULL) // primary thread is unknown 3279 ); 3280 #if USE_DEBUGGER 3281 // Non-NULL value should be assigned to make the debugger display the root 3282 // team. 3283 TCW_SYNC_PTR(root_team->t.t_pkfn, (microtask_t)(~0)); 3284 #endif 3285 3286 KF_TRACE(10, ("__kmp_initialize_root: after root_team = %p\n", root_team)); 3287 3288 root->r.r_root_team = root_team; 3289 root_team->t.t_control_stack_top = NULL; 3290 3291 /* initialize root team */ 3292 root_team->t.t_threads[0] = NULL; 3293 root_team->t.t_nproc = 1; 3294 root_team->t.t_serialized = 1; 3295 // TODO???: root_team->t.t_max_active_levels = __kmp_dflt_max_active_levels; 3296 root_team->t.t_sched.sched = r_sched.sched; 3297 KA_TRACE( 3298 20, 3299 ("__kmp_initialize_root: init root team %d arrived: join=%u, plain=%u\n", 3300 root_team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE)); 3301 3302 /* setup the hot team for this task */ 3303 /* allocate the hot team structure */ 3304 KF_TRACE(10, ("__kmp_initialize_root: before hot_team\n")); 3305 3306 hot_team = 3307 __kmp_allocate_team(root, 3308 1, // new_nproc 3309 __kmp_dflt_team_nth_ub * 2, // max_nproc 3310 #if OMPT_SUPPORT 3311 ompt_data_none, // root parallel id 3312 #endif 3313 __kmp_nested_proc_bind.bind_types[0], &r_icvs, 3314 0 // argc 3315 USE_NESTED_HOT_ARG(NULL) // primary thread is unknown 3316 ); 3317 KF_TRACE(10, ("__kmp_initialize_root: after hot_team = %p\n", hot_team)); 3318 3319 root->r.r_hot_team = hot_team; 3320 root_team->t.t_control_stack_top = NULL; 3321 3322 /* first-time initialization */ 3323 hot_team->t.t_parent = root_team; 3324 3325 /* initialize hot team */ 3326 hot_team_max_nth = hot_team->t.t_max_nproc; 3327 for (f = 0; f < hot_team_max_nth; ++f) { 3328 hot_team->t.t_threads[f] = NULL; 3329 } 3330 hot_team->t.t_nproc = 1; 3331 // TODO???: hot_team->t.t_max_active_levels = __kmp_dflt_max_active_levels; 3332 hot_team->t.t_sched.sched = r_sched.sched; 3333 hot_team->t.t_size_changed = 0; 3334 } 3335 3336 #ifdef KMP_DEBUG 3337 3338 typedef struct kmp_team_list_item { 3339 kmp_team_p const *entry; 3340 struct kmp_team_list_item *next; 3341 } kmp_team_list_item_t; 3342 typedef kmp_team_list_item_t *kmp_team_list_t; 3343 3344 static void __kmp_print_structure_team_accum( // Add team 
to list of teams. 3345 kmp_team_list_t list, // List of teams. 3346 kmp_team_p const *team // Team to add. 3347 ) { 3348 3349 // List must terminate with item where both entry and next are NULL. 3350 // Team is added to the list only once. 3351 // List is sorted in ascending order by team id. 3352 // Team id is *not* a key. 3353 3354 kmp_team_list_t l; 3355 3356 KMP_DEBUG_ASSERT(list != NULL); 3357 if (team == NULL) { 3358 return; 3359 } 3360 3361 __kmp_print_structure_team_accum(list, team->t.t_parent); 3362 __kmp_print_structure_team_accum(list, team->t.t_next_pool); 3363 3364 // Search list for the team. 3365 l = list; 3366 while (l->next != NULL && l->entry != team) { 3367 l = l->next; 3368 } 3369 if (l->next != NULL) { 3370 return; // Team has been added before, exit. 3371 } 3372 3373 // Team is not found. Search list again for insertion point. 3374 l = list; 3375 while (l->next != NULL && l->entry->t.t_id <= team->t.t_id) { 3376 l = l->next; 3377 } 3378 3379 // Insert team. 3380 { 3381 kmp_team_list_item_t *item = (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC( 3382 sizeof(kmp_team_list_item_t)); 3383 *item = *l; 3384 l->entry = team; 3385 l->next = item; 3386 } 3387 } 3388 3389 static void __kmp_print_structure_team(char const *title, kmp_team_p const *team 3390 3391 ) { 3392 __kmp_printf("%s", title); 3393 if (team != NULL) { 3394 __kmp_printf("%2x %p\n", team->t.t_id, team); 3395 } else { 3396 __kmp_printf(" - (nil)\n"); 3397 } 3398 } 3399 3400 static void __kmp_print_structure_thread(char const *title, 3401 kmp_info_p const *thread) { 3402 __kmp_printf("%s", title); 3403 if (thread != NULL) { 3404 __kmp_printf("%2d %p\n", thread->th.th_info.ds.ds_gtid, thread); 3405 } else { 3406 __kmp_printf(" - (nil)\n"); 3407 } 3408 } 3409 3410 void __kmp_print_structure(void) { 3411 3412 kmp_team_list_t list; 3413 3414 // Initialize list of teams. 3415 list = 3416 (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(sizeof(kmp_team_list_item_t)); 3417 list->entry = NULL; 3418 list->next = NULL; 3419 3420 __kmp_printf("\n------------------------------\nGlobal Thread " 3421 "Table\n------------------------------\n"); 3422 { 3423 int gtid; 3424 for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) { 3425 __kmp_printf("%2d", gtid); 3426 if (__kmp_threads != NULL) { 3427 __kmp_printf(" %p", __kmp_threads[gtid]); 3428 } 3429 if (__kmp_root != NULL) { 3430 __kmp_printf(" %p", __kmp_root[gtid]); 3431 } 3432 __kmp_printf("\n"); 3433 } 3434 } 3435 3436 // Print out __kmp_threads array. 
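// While printing each thread below, its current team and its serial team are
// also accumulated into the local team list via __kmp_print_structure_team_accum,
// so the "Teams" section printed later covers every team reachable from here.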
3437 __kmp_printf("\n------------------------------\nThreads\n--------------------" 3438 "----------\n"); 3439 if (__kmp_threads != NULL) { 3440 int gtid; 3441 for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) { 3442 kmp_info_t const *thread = __kmp_threads[gtid]; 3443 if (thread != NULL) { 3444 __kmp_printf("GTID %2d %p:\n", gtid, thread); 3445 __kmp_printf(" Our Root: %p\n", thread->th.th_root); 3446 __kmp_print_structure_team(" Our Team: ", thread->th.th_team); 3447 __kmp_print_structure_team(" Serial Team: ", 3448 thread->th.th_serial_team); 3449 __kmp_printf(" Threads: %2d\n", thread->th.th_team_nproc); 3450 __kmp_print_structure_thread(" Primary: ", 3451 thread->th.th_team_master); 3452 __kmp_printf(" Serialized?: %2d\n", thread->th.th_team_serialized); 3453 __kmp_printf(" Set NProc: %2d\n", thread->th.th_set_nproc); 3454 __kmp_printf(" Set Proc Bind: %2d\n", thread->th.th_set_proc_bind); 3455 __kmp_print_structure_thread(" Next in pool: ", 3456 thread->th.th_next_pool); 3457 __kmp_printf("\n"); 3458 __kmp_print_structure_team_accum(list, thread->th.th_team); 3459 __kmp_print_structure_team_accum(list, thread->th.th_serial_team); 3460 } 3461 } 3462 } else { 3463 __kmp_printf("Threads array is not allocated.\n"); 3464 } 3465 3466 // Print out __kmp_root array. 3467 __kmp_printf("\n------------------------------\nUbers\n----------------------" 3468 "--------\n"); 3469 if (__kmp_root != NULL) { 3470 int gtid; 3471 for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) { 3472 kmp_root_t const *root = __kmp_root[gtid]; 3473 if (root != NULL) { 3474 __kmp_printf("GTID %2d %p:\n", gtid, root); 3475 __kmp_print_structure_team(" Root Team: ", root->r.r_root_team); 3476 __kmp_print_structure_team(" Hot Team: ", root->r.r_hot_team); 3477 __kmp_print_structure_thread(" Uber Thread: ", 3478 root->r.r_uber_thread); 3479 __kmp_printf(" Active?: %2d\n", root->r.r_active); 3480 __kmp_printf(" In Parallel: %2d\n", 3481 KMP_ATOMIC_LD_RLX(&root->r.r_in_parallel)); 3482 __kmp_printf("\n"); 3483 __kmp_print_structure_team_accum(list, root->r.r_root_team); 3484 __kmp_print_structure_team_accum(list, root->r.r_hot_team); 3485 } 3486 } 3487 } else { 3488 __kmp_printf("Ubers array is not allocated.\n"); 3489 } 3490 3491 __kmp_printf("\n------------------------------\nTeams\n----------------------" 3492 "--------\n"); 3493 while (list->next != NULL) { 3494 kmp_team_p const *team = list->entry; 3495 int i; 3496 __kmp_printf("Team %2x %p:\n", team->t.t_id, team); 3497 __kmp_print_structure_team(" Parent Team: ", team->t.t_parent); 3498 __kmp_printf(" Primary TID: %2d\n", team->t.t_master_tid); 3499 __kmp_printf(" Max threads: %2d\n", team->t.t_max_nproc); 3500 __kmp_printf(" Levels of serial: %2d\n", team->t.t_serialized); 3501 __kmp_printf(" Number threads: %2d\n", team->t.t_nproc); 3502 for (i = 0; i < team->t.t_nproc; ++i) { 3503 __kmp_printf(" Thread %2d: ", i); 3504 __kmp_print_structure_thread("", team->t.t_threads[i]); 3505 } 3506 __kmp_print_structure_team(" Next in pool: ", team->t.t_next_pool); 3507 __kmp_printf("\n"); 3508 list = list->next; 3509 } 3510 3511 // Print out __kmp_thread_pool and __kmp_team_pool. 3512 __kmp_printf("\n------------------------------\nPools\n----------------------" 3513 "--------\n"); 3514 __kmp_print_structure_thread("Thread pool: ", 3515 CCAST(kmp_info_t *, __kmp_thread_pool)); 3516 __kmp_print_structure_team("Team pool: ", 3517 CCAST(kmp_team_t *, __kmp_team_pool)); 3518 __kmp_printf("\n"); 3519 3520 // Free team list. 
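// Every list item allocated above is released here, including the terminating
// sentinel item (the one with both entry and next equal to NULL).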
3521 while (list != NULL) { 3522 kmp_team_list_item_t *item = list; 3523 list = list->next; 3524 KMP_INTERNAL_FREE(item); 3525 } 3526 } 3527 3528 #endif 3529 3530 //--------------------------------------------------------------------------- 3531 // Stuff for per-thread fast random number generator 3532 // Table of primes 3533 static const unsigned __kmp_primes[] = { 3534 0x9e3779b1, 0xffe6cc59, 0x2109f6dd, 0x43977ab5, 0xba5703f5, 0xb495a877, 3535 0xe1626741, 0x79695e6b, 0xbc98c09f, 0xd5bee2b3, 0x287488f9, 0x3af18231, 3536 0x9677cd4d, 0xbe3a6929, 0xadc6a877, 0xdcf0674b, 0xbe4d6fe9, 0x5f15e201, 3537 0x99afc3fd, 0xf3f16801, 0xe222cfff, 0x24ba5fdb, 0x0620452d, 0x79f149e3, 3538 0xc8b93f49, 0x972702cd, 0xb07dd827, 0x6c97d5ed, 0x085a3d61, 0x46eb5ea7, 3539 0x3d9910ed, 0x2e687b5b, 0x29609227, 0x6eb081f1, 0x0954c4e1, 0x9d114db9, 3540 0x542acfa9, 0xb3e6bd7b, 0x0742d917, 0xe9f3ffa7, 0x54581edb, 0xf2480f45, 3541 0x0bb9288f, 0xef1affc7, 0x85fa0ca7, 0x3ccc14db, 0xe6baf34b, 0x343377f7, 3542 0x5ca19031, 0xe6d9293b, 0xf0a9f391, 0x5d2e980b, 0xfc411073, 0xc3749363, 3543 0xb892d829, 0x3549366b, 0x629750ad, 0xb98294e5, 0x892d9483, 0xc235baf3, 3544 0x3d2402a3, 0x6bdef3c9, 0xbec333cd, 0x40c9520f}; 3545 3546 //--------------------------------------------------------------------------- 3547 // __kmp_get_random: Get a random number using a linear congruential method. 3548 unsigned short __kmp_get_random(kmp_info_t *thread) { 3549 unsigned x = thread->th.th_x; 3550 unsigned short r = (unsigned short)(x >> 16); 3551 3552 thread->th.th_x = x * thread->th.th_a + 1; 3553 3554 KA_TRACE(30, ("__kmp_get_random: THREAD: %d, RETURN: %u\n", 3555 thread->th.th_info.ds.ds_tid, r)); 3556 3557 return r; 3558 } 3559 //-------------------------------------------------------- 3560 // __kmp_init_random: Initialize a random number generator 3561 void __kmp_init_random(kmp_info_t *thread) { 3562 unsigned seed = thread->th.th_info.ds.ds_tid; 3563 3564 thread->th.th_a = 3565 __kmp_primes[seed % (sizeof(__kmp_primes) / sizeof(__kmp_primes[0]))]; 3566 thread->th.th_x = (seed + 1) * thread->th.th_a + 1; 3567 KA_TRACE(30, 3568 ("__kmp_init_random: THREAD: %u; A: %u\n", seed, thread->th.th_a)); 3569 } 3570 3571 #if KMP_OS_WINDOWS 3572 /* reclaim array entries for root threads that are already dead, returns number 3573 * reclaimed */ 3574 static int __kmp_reclaim_dead_roots(void) { 3575 int i, r = 0; 3576 3577 for (i = 0; i < __kmp_threads_capacity; ++i) { 3578 if (KMP_UBER_GTID(i) && 3579 !__kmp_still_running((kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[i])) && 3580 !__kmp_root[i] 3581 ->r.r_active) { // AC: reclaim only roots died in non-active state 3582 r += __kmp_unregister_root_other_thread(i); 3583 } 3584 } 3585 return r; 3586 } 3587 #endif 3588 3589 /* This function attempts to create free entries in __kmp_threads and 3590 __kmp_root, and returns the number of free entries generated. 3591 3592 For Windows* OS static library, the first mechanism used is to reclaim array 3593 entries for root threads that are already dead. 3594 3595 On all platforms, expansion is attempted on the arrays __kmp_threads_ and 3596 __kmp_root, with appropriate update to __kmp_threads_capacity. Array 3597 capacity is increased by doubling with clipping to __kmp_tp_capacity, if 3598 threadprivate cache array has been created. Synchronization with 3599 __kmpc_threadprivate_cached is done using __kmp_tp_cached_lock. 
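   As an illustrative example (the numbers are hypothetical): if
   __kmp_sys_max_nth is 32768, the current capacity is 64, and nNeed is 100,
   the capacity is doubled 64 -> 128 -> 256 and stops at the first value that
   is >= 64 + 100 = 164, so the arrays are reallocated to hold 256 entries.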
3600 3601 After any dead root reclamation, if the clipping value allows array expansion 3602 to result in the generation of a total of nNeed free slots, the function does 3603 that expansion. If not, nothing is done beyond the possible initial root 3604 thread reclamation. 3605 3606 If any argument is negative, the behavior is undefined. */ 3607 static int __kmp_expand_threads(int nNeed) { 3608 int added = 0; 3609 int minimumRequiredCapacity; 3610 int newCapacity; 3611 kmp_info_t **newThreads; 3612 kmp_root_t **newRoot; 3613 3614 // All calls to __kmp_expand_threads should be under __kmp_forkjoin_lock, so 3615 // resizing __kmp_threads does not need additional protection if foreign 3616 // threads are present 3617 3618 #if KMP_OS_WINDOWS && !KMP_DYNAMIC_LIB 3619 /* only for Windows static library */ 3620 /* reclaim array entries for root threads that are already dead */ 3621 added = __kmp_reclaim_dead_roots(); 3622 3623 if (nNeed) { 3624 nNeed -= added; 3625 if (nNeed < 0) 3626 nNeed = 0; 3627 } 3628 #endif 3629 if (nNeed <= 0) 3630 return added; 3631 3632 // Note that __kmp_threads_capacity is not bounded by __kmp_max_nth. If 3633 // __kmp_max_nth is set to some value less than __kmp_sys_max_nth by the 3634 // user via KMP_DEVICE_THREAD_LIMIT, then __kmp_threads_capacity may become 3635 // > __kmp_max_nth in one of two ways: 3636 // 3637 // 1) The initialization thread (gtid = 0) exits. __kmp_threads[0] 3638 // may not be reused by another thread, so we may need to increase 3639 // __kmp_threads_capacity to __kmp_max_nth + 1. 3640 // 3641 // 2) New foreign root(s) are encountered. We always register new foreign 3642 // roots. This may cause a smaller # of threads to be allocated at 3643 // subsequent parallel regions, but the worker threads hang around (and 3644 // eventually go to sleep) and need slots in the __kmp_threads[] array. 3645 // 3646 // Anyway, that is the reason for moving the check to see if 3647 // __kmp_max_nth was exceeded into __kmp_reserve_threads() 3648 // instead of having it performed here. -BB 3649 3650 KMP_DEBUG_ASSERT(__kmp_sys_max_nth >= __kmp_threads_capacity); 3651 3652 /* compute expansion headroom to check if we can expand */ 3653 if (__kmp_sys_max_nth - __kmp_threads_capacity < nNeed) { 3654 /* possible expansion too small -- give up */ 3655 return added; 3656 } 3657 minimumRequiredCapacity = __kmp_threads_capacity + nNeed; 3658 3659 newCapacity = __kmp_threads_capacity; 3660 do { 3661 newCapacity = newCapacity <= (__kmp_sys_max_nth >> 1) ? (newCapacity << 1) 3662 : __kmp_sys_max_nth; 3663 } while (newCapacity < minimumRequiredCapacity); 3664 newThreads = (kmp_info_t **)__kmp_allocate( 3665 (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * newCapacity + CACHE_LINE); 3666 newRoot = 3667 (kmp_root_t **)((char *)newThreads + sizeof(kmp_info_t *) * newCapacity); 3668 KMP_MEMCPY(newThreads, __kmp_threads, 3669 __kmp_threads_capacity * sizeof(kmp_info_t *)); 3670 KMP_MEMCPY(newRoot, __kmp_root, 3671 __kmp_threads_capacity * sizeof(kmp_root_t *)); 3672 // Put old __kmp_threads array on a list. Any ongoing references to the old 3673 // list will be valid. This list is cleaned up at library shutdown. 
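// Only the array of kmp_info_t pointers is retained on that list; the
// kmp_info_t objects themselves are shared by the old and new arrays, so a
// reader still holding the old array sees valid entries at the old indices.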
3674 kmp_old_threads_list_t *node =
3675 (kmp_old_threads_list_t *)__kmp_allocate(sizeof(kmp_old_threads_list_t));
3676 node->threads = __kmp_threads;
3677 node->next = __kmp_old_threads_list;
3678 __kmp_old_threads_list = node;
3679
3680 *(kmp_info_t * *volatile *)&__kmp_threads = newThreads;
3681 *(kmp_root_t * *volatile *)&__kmp_root = newRoot;
3682 added += newCapacity - __kmp_threads_capacity;
3683 *(volatile int *)&__kmp_threads_capacity = newCapacity;
3684
3685 if (newCapacity > __kmp_tp_capacity) {
3686 __kmp_acquire_bootstrap_lock(&__kmp_tp_cached_lock);
3687 if (__kmp_tp_cached && newCapacity > __kmp_tp_capacity) {
3688 __kmp_threadprivate_resize_cache(newCapacity);
3689 } else { // increase __kmp_tp_capacity to correspond with kmp_threads size
3690 *(volatile int *)&__kmp_tp_capacity = newCapacity;
3691 }
3692 __kmp_release_bootstrap_lock(&__kmp_tp_cached_lock);
3693 }
3694
3695 return added;
3696 }
3697
3698 /* Register the current thread as a root thread and obtain our gtid. We must
3699 have the __kmp_initz_lock held at this point. Argument TRUE only if we are the
3700 thread that calls from __kmp_do_serial_initialize() */
3701 int __kmp_register_root(int initial_thread) {
3702 kmp_info_t *root_thread;
3703 kmp_root_t *root;
3704 int gtid;
3705 int capacity;
3706 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
3707 KA_TRACE(20, ("__kmp_register_root: entered\n"));
3708 KMP_MB();
3709
3710 /* 2007-03-02:
3711 If the initial thread did not invoke the OpenMP RTL yet, and this thread is
3712 not an initial one, the "__kmp_all_nth >= __kmp_threads_capacity" condition
3713 does not work as expected -- it may return false (meaning there is at least
3714 one empty slot in the __kmp_threads array), but it is possible that the only
3715 free slot is #0, which is reserved for the initial thread and so cannot be
3716 used for this one. The following code works around this bug.
3717
3718 However, the right solution seems to be not to reserve slot #0 for the
3719 initial thread, because:
3720 (1) there is no magic in slot #0,
3721 (2) we cannot detect the initial thread reliably (the first thread that does
3722 serial initialization may not be a real initial thread).
3723 */
3724 capacity = __kmp_threads_capacity;
3725 if (!initial_thread && TCR_PTR(__kmp_threads[0]) == NULL) {
3726 --capacity;
3727 }
3728
3729 // If it is not for initializing the hidden helper team, we need to take
3730 // __kmp_hidden_helper_threads_num out of the capacity because it is included
3731 // in __kmp_threads_capacity.
3732 if (__kmp_enable_hidden_helper && !TCR_4(__kmp_init_hidden_helper_threads)) {
3733 capacity -= __kmp_hidden_helper_threads_num;
3734 }
3735
3736 /* see if there are too many threads */
3737 if (__kmp_all_nth >= capacity && !__kmp_expand_threads(1)) {
3738 if (__kmp_tp_cached) {
3739 __kmp_fatal(KMP_MSG(CantRegisterNewThread),
3740 KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity),
3741 KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null);
3742 } else {
3743 __kmp_fatal(KMP_MSG(CantRegisterNewThread), KMP_HNT(SystemLimitOnThreads),
3744 __kmp_msg_null);
3745 }
3746 }
3747
3748 // When hidden helper task is enabled, __kmp_threads is organized as follows:
3749 // 0: initial thread, also a regular OpenMP thread.
3750 // [1, __kmp_hidden_helper_threads_num]: slots for hidden helper threads.
3751 // [__kmp_hidden_helper_threads_num + 1, __kmp_threads_capacity): slots for
3752 // regular OpenMP threads.
3753 if (TCR_4(__kmp_init_hidden_helper_threads)) {
3754 // Find an available thread slot for hidden helper thread.
Slots for hidden 3755 // helper threads start from 1 to __kmp_hidden_helper_threads_num. 3756 for (gtid = 1; TCR_PTR(__kmp_threads[gtid]) != NULL && 3757 gtid <= __kmp_hidden_helper_threads_num; 3758 gtid++) 3759 ; 3760 KMP_ASSERT(gtid <= __kmp_hidden_helper_threads_num); 3761 KA_TRACE(1, ("__kmp_register_root: found slot in threads array for " 3762 "hidden helper thread: T#%d\n", 3763 gtid)); 3764 } else { 3765 /* find an available thread slot */ 3766 // Don't reassign the zero slot since we need that to only be used by 3767 // initial thread. Slots for hidden helper threads should also be skipped. 3768 if (initial_thread && TCR_PTR(__kmp_threads[0]) == NULL) { 3769 gtid = 0; 3770 } else { 3771 for (gtid = __kmp_hidden_helper_threads_num + 1; 3772 TCR_PTR(__kmp_threads[gtid]) != NULL; gtid++) 3773 ; 3774 } 3775 KA_TRACE( 3776 1, ("__kmp_register_root: found slot in threads array: T#%d\n", gtid)); 3777 KMP_ASSERT(gtid < __kmp_threads_capacity); 3778 } 3779 3780 /* update global accounting */ 3781 __kmp_all_nth++; 3782 TCW_4(__kmp_nth, __kmp_nth + 1); 3783 3784 // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low 3785 // numbers of procs, and method #2 (keyed API call) for higher numbers. 3786 if (__kmp_adjust_gtid_mode) { 3787 if (__kmp_all_nth >= __kmp_tls_gtid_min) { 3788 if (TCR_4(__kmp_gtid_mode) != 2) { 3789 TCW_4(__kmp_gtid_mode, 2); 3790 } 3791 } else { 3792 if (TCR_4(__kmp_gtid_mode) != 1) { 3793 TCW_4(__kmp_gtid_mode, 1); 3794 } 3795 } 3796 } 3797 3798 #ifdef KMP_ADJUST_BLOCKTIME 3799 /* Adjust blocktime to zero if necessary */ 3800 /* Middle initialization might not have occurred yet */ 3801 if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) { 3802 if (__kmp_nth > __kmp_avail_proc) { 3803 __kmp_zero_bt = TRUE; 3804 } 3805 } 3806 #endif /* KMP_ADJUST_BLOCKTIME */ 3807 3808 /* setup this new hierarchy */ 3809 if (!(root = __kmp_root[gtid])) { 3810 root = __kmp_root[gtid] = (kmp_root_t *)__kmp_allocate(sizeof(kmp_root_t)); 3811 KMP_DEBUG_ASSERT(!root->r.r_root_team); 3812 } 3813 3814 #if KMP_STATS_ENABLED 3815 // Initialize stats as soon as possible (right after gtid assignment). 
3816 __kmp_stats_thread_ptr = __kmp_stats_list->push_back(gtid); 3817 __kmp_stats_thread_ptr->startLife(); 3818 KMP_SET_THREAD_STATE(SERIAL_REGION); 3819 KMP_INIT_PARTITIONED_TIMERS(OMP_serial); 3820 #endif 3821 __kmp_initialize_root(root); 3822 3823 /* setup new root thread structure */ 3824 if (root->r.r_uber_thread) { 3825 root_thread = root->r.r_uber_thread; 3826 } else { 3827 root_thread = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t)); 3828 if (__kmp_storage_map) { 3829 __kmp_print_thread_storage_map(root_thread, gtid); 3830 } 3831 root_thread->th.th_info.ds.ds_gtid = gtid; 3832 #if OMPT_SUPPORT 3833 root_thread->th.ompt_thread_info.thread_data = ompt_data_none; 3834 #endif 3835 root_thread->th.th_root = root; 3836 if (__kmp_env_consistency_check) { 3837 root_thread->th.th_cons = __kmp_allocate_cons_stack(gtid); 3838 } 3839 #if USE_FAST_MEMORY 3840 __kmp_initialize_fast_memory(root_thread); 3841 #endif /* USE_FAST_MEMORY */ 3842 3843 #if KMP_USE_BGET 3844 KMP_DEBUG_ASSERT(root_thread->th.th_local.bget_data == NULL); 3845 __kmp_initialize_bget(root_thread); 3846 #endif 3847 __kmp_init_random(root_thread); // Initialize random number generator 3848 } 3849 3850 /* setup the serial team held in reserve by the root thread */ 3851 if (!root_thread->th.th_serial_team) { 3852 kmp_internal_control_t r_icvs = __kmp_get_global_icvs(); 3853 KF_TRACE(10, ("__kmp_register_root: before serial_team\n")); 3854 root_thread->th.th_serial_team = __kmp_allocate_team( 3855 root, 1, 1, 3856 #if OMPT_SUPPORT 3857 ompt_data_none, // root parallel id 3858 #endif 3859 proc_bind_default, &r_icvs, 0 USE_NESTED_HOT_ARG(NULL)); 3860 } 3861 KMP_ASSERT(root_thread->th.th_serial_team); 3862 KF_TRACE(10, ("__kmp_register_root: after serial_team = %p\n", 3863 root_thread->th.th_serial_team)); 3864 3865 /* drop root_thread into place */ 3866 TCW_SYNC_PTR(__kmp_threads[gtid], root_thread); 3867 3868 root->r.r_root_team->t.t_threads[0] = root_thread; 3869 root->r.r_hot_team->t.t_threads[0] = root_thread; 3870 root_thread->th.th_serial_team->t.t_threads[0] = root_thread; 3871 // AC: the team created in reserve, not for execution (it is unused for now). 3872 root_thread->th.th_serial_team->t.t_serialized = 0; 3873 root->r.r_uber_thread = root_thread; 3874 3875 /* initialize the thread, get it ready to go */ 3876 __kmp_initialize_info(root_thread, root->r.r_root_team, 0, gtid); 3877 TCW_4(__kmp_init_gtid, TRUE); 3878 3879 /* prepare the primary thread for get_gtid() */ 3880 __kmp_gtid_set_specific(gtid); 3881 3882 #if USE_ITT_BUILD 3883 __kmp_itt_thread_name(gtid); 3884 #endif /* USE_ITT_BUILD */ 3885 3886 #ifdef KMP_TDATA_GTID 3887 __kmp_gtid = gtid; 3888 #endif 3889 __kmp_create_worker(gtid, root_thread, __kmp_stksize); 3890 KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == gtid); 3891 3892 KA_TRACE(20, ("__kmp_register_root: T#%d init T#%d(%d:%d) arrived: join=%u, " 3893 "plain=%u\n", 3894 gtid, __kmp_gtid_from_tid(0, root->r.r_hot_team), 3895 root->r.r_hot_team->t.t_id, 0, KMP_INIT_BARRIER_STATE, 3896 KMP_INIT_BARRIER_STATE)); 3897 { // Initialize barrier data. 
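// Each barrier type of the new root thread starts from KMP_INIT_BARRIER_STATE;
// the assertion after this block checks that the hot team's fork/join barrier
// starts from the same value.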
3898 int b; 3899 for (b = 0; b < bs_last_barrier; ++b) { 3900 root_thread->th.th_bar[b].bb.b_arrived = KMP_INIT_BARRIER_STATE; 3901 #if USE_DEBUGGER 3902 root_thread->th.th_bar[b].bb.b_worker_arrived = 0; 3903 #endif 3904 } 3905 } 3906 KMP_DEBUG_ASSERT(root->r.r_hot_team->t.t_bar[bs_forkjoin_barrier].b_arrived == 3907 KMP_INIT_BARRIER_STATE); 3908 3909 #if KMP_AFFINITY_SUPPORTED 3910 root_thread->th.th_current_place = KMP_PLACE_UNDEFINED; 3911 root_thread->th.th_new_place = KMP_PLACE_UNDEFINED; 3912 root_thread->th.th_first_place = KMP_PLACE_UNDEFINED; 3913 root_thread->th.th_last_place = KMP_PLACE_UNDEFINED; 3914 #endif /* KMP_AFFINITY_SUPPORTED */ 3915 root_thread->th.th_def_allocator = __kmp_def_allocator; 3916 root_thread->th.th_prev_level = 0; 3917 root_thread->th.th_prev_num_threads = 1; 3918 3919 kmp_cg_root_t *tmp = (kmp_cg_root_t *)__kmp_allocate(sizeof(kmp_cg_root_t)); 3920 tmp->cg_root = root_thread; 3921 tmp->cg_thread_limit = __kmp_cg_max_nth; 3922 tmp->cg_nthreads = 1; 3923 KA_TRACE(100, ("__kmp_register_root: Thread %p created node %p with" 3924 " cg_nthreads init to 1\n", 3925 root_thread, tmp)); 3926 tmp->up = NULL; 3927 root_thread->th.th_cg_roots = tmp; 3928 3929 __kmp_root_counter++; 3930 3931 #if OMPT_SUPPORT 3932 if (!initial_thread && ompt_enabled.enabled) { 3933 3934 kmp_info_t *root_thread = ompt_get_thread(); 3935 3936 ompt_set_thread_state(root_thread, ompt_state_overhead); 3937 3938 if (ompt_enabled.ompt_callback_thread_begin) { 3939 ompt_callbacks.ompt_callback(ompt_callback_thread_begin)( 3940 ompt_thread_initial, __ompt_get_thread_data_internal()); 3941 } 3942 ompt_data_t *task_data; 3943 ompt_data_t *parallel_data; 3944 __ompt_get_task_info_internal(0, NULL, &task_data, NULL, ¶llel_data, 3945 NULL); 3946 if (ompt_enabled.ompt_callback_implicit_task) { 3947 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 3948 ompt_scope_begin, parallel_data, task_data, 1, 1, ompt_task_initial); 3949 } 3950 3951 ompt_set_thread_state(root_thread, ompt_state_work_serial); 3952 } 3953 #endif 3954 #if OMPD_SUPPORT 3955 if (ompd_state & OMPD_ENABLE_BP) 3956 ompd_bp_thread_begin(); 3957 #endif 3958 3959 KMP_MB(); 3960 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); 3961 3962 return gtid; 3963 } 3964 3965 #if KMP_NESTED_HOT_TEAMS 3966 static int __kmp_free_hot_teams(kmp_root_t *root, kmp_info_t *thr, int level, 3967 const int max_level) { 3968 int i, n, nth; 3969 kmp_hot_team_ptr_t *hot_teams = thr->th.th_hot_teams; 3970 if (!hot_teams || !hot_teams[level].hot_team) { 3971 return 0; 3972 } 3973 KMP_DEBUG_ASSERT(level < max_level); 3974 kmp_team_t *team = hot_teams[level].hot_team; 3975 nth = hot_teams[level].hot_team_nth; 3976 n = nth - 1; // primary thread is not freed 3977 if (level < max_level - 1) { 3978 for (i = 0; i < nth; ++i) { 3979 kmp_info_t *th = team->t.t_threads[i]; 3980 n += __kmp_free_hot_teams(root, th, level + 1, max_level); 3981 if (i > 0 && th->th.th_hot_teams) { 3982 __kmp_free(th->th.th_hot_teams); 3983 th->th.th_hot_teams = NULL; 3984 } 3985 } 3986 } 3987 __kmp_free_team(root, team, NULL); 3988 return n; 3989 } 3990 #endif 3991 3992 // Resets a root thread and clear its root and hot teams. 3993 // Returns the number of __kmp_threads entries directly and indirectly freed. 
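// When nested hot teams are in use (__kmp_hot_teams_max_level > 1), the
// threads of those nested teams are released via __kmp_free_hot_teams and are
// included in the returned count.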
3994 static int __kmp_reset_root(int gtid, kmp_root_t *root) { 3995 kmp_team_t *root_team = root->r.r_root_team; 3996 kmp_team_t *hot_team = root->r.r_hot_team; 3997 int n = hot_team->t.t_nproc; 3998 int i; 3999 4000 KMP_DEBUG_ASSERT(!root->r.r_active); 4001 4002 root->r.r_root_team = NULL; 4003 root->r.r_hot_team = NULL; 4004 // __kmp_free_team() does not free hot teams, so we have to clear r_hot_team 4005 // before call to __kmp_free_team(). 4006 __kmp_free_team(root, root_team USE_NESTED_HOT_ARG(NULL)); 4007 #if KMP_NESTED_HOT_TEAMS 4008 if (__kmp_hot_teams_max_level > 4009 0) { // need to free nested hot teams and their threads if any 4010 for (i = 0; i < hot_team->t.t_nproc; ++i) { 4011 kmp_info_t *th = hot_team->t.t_threads[i]; 4012 if (__kmp_hot_teams_max_level > 1) { 4013 n += __kmp_free_hot_teams(root, th, 1, __kmp_hot_teams_max_level); 4014 } 4015 if (th->th.th_hot_teams) { 4016 __kmp_free(th->th.th_hot_teams); 4017 th->th.th_hot_teams = NULL; 4018 } 4019 } 4020 } 4021 #endif 4022 __kmp_free_team(root, hot_team USE_NESTED_HOT_ARG(NULL)); 4023 4024 // Before we can reap the thread, we need to make certain that all other 4025 // threads in the teams that had this root as ancestor have stopped trying to 4026 // steal tasks. 4027 if (__kmp_tasking_mode != tskm_immediate_exec) { 4028 __kmp_wait_to_unref_task_teams(); 4029 } 4030 4031 #if KMP_OS_WINDOWS 4032 /* Close Handle of root duplicated in __kmp_create_worker (tr #62919) */ 4033 KA_TRACE( 4034 10, ("__kmp_reset_root: free handle, th = %p, handle = %" KMP_UINTPTR_SPEC 4035 "\n", 4036 (LPVOID) & (root->r.r_uber_thread->th), 4037 root->r.r_uber_thread->th.th_info.ds.ds_thread)); 4038 __kmp_free_handle(root->r.r_uber_thread->th.th_info.ds.ds_thread); 4039 #endif /* KMP_OS_WINDOWS */ 4040 4041 #if OMPD_SUPPORT 4042 if (ompd_state & OMPD_ENABLE_BP) 4043 ompd_bp_thread_end(); 4044 #endif 4045 4046 #if OMPT_SUPPORT 4047 ompt_data_t *task_data; 4048 ompt_data_t *parallel_data; 4049 __ompt_get_task_info_internal(0, NULL, &task_data, NULL, ¶llel_data, 4050 NULL); 4051 if (ompt_enabled.ompt_callback_implicit_task) { 4052 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 4053 ompt_scope_end, parallel_data, task_data, 0, 1, ompt_task_initial); 4054 } 4055 if (ompt_enabled.ompt_callback_thread_end) { 4056 ompt_callbacks.ompt_callback(ompt_callback_thread_end)( 4057 &(root->r.r_uber_thread->th.ompt_thread_info.thread_data)); 4058 } 4059 #endif 4060 4061 TCW_4(__kmp_nth, 4062 __kmp_nth - 1); // __kmp_reap_thread will decrement __kmp_all_nth. 4063 i = root->r.r_uber_thread->th.th_cg_roots->cg_nthreads--; 4064 KA_TRACE(100, ("__kmp_reset_root: Thread %p decrement cg_nthreads on node %p" 4065 " to %d\n", 4066 root->r.r_uber_thread, root->r.r_uber_thread->th.th_cg_roots, 4067 root->r.r_uber_thread->th.th_cg_roots->cg_nthreads)); 4068 if (i == 1) { 4069 // need to free contention group structure 4070 KMP_DEBUG_ASSERT(root->r.r_uber_thread == 4071 root->r.r_uber_thread->th.th_cg_roots->cg_root); 4072 KMP_DEBUG_ASSERT(root->r.r_uber_thread->th.th_cg_roots->up == NULL); 4073 __kmp_free(root->r.r_uber_thread->th.th_cg_roots); 4074 root->r.r_uber_thread->th.th_cg_roots = NULL; 4075 } 4076 __kmp_reap_thread(root->r.r_uber_thread, 1); 4077 4078 // We canot put root thread to __kmp_thread_pool, so we have to reap it 4079 // instead of freeing. 
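// (The uber thread is the user's own thread rather than one created by the
// runtime, so it cannot be recycled through __kmp_thread_pool as a worker;
// its descriptor is reaped with is_root set instead.)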
4080 root->r.r_uber_thread = NULL; 4081 /* mark root as no longer in use */ 4082 root->r.r_begin = FALSE; 4083 4084 return n; 4085 } 4086 4087 void __kmp_unregister_root_current_thread(int gtid) { 4088 KA_TRACE(1, ("__kmp_unregister_root_current_thread: enter T#%d\n", gtid)); 4089 /* this lock should be ok, since unregister_root_current_thread is never 4090 called during an abort, only during a normal close. furthermore, if you 4091 have the forkjoin lock, you should never try to get the initz lock */ 4092 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock); 4093 if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) { 4094 KC_TRACE(10, ("__kmp_unregister_root_current_thread: already finished, " 4095 "exiting T#%d\n", 4096 gtid)); 4097 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); 4098 return; 4099 } 4100 kmp_root_t *root = __kmp_root[gtid]; 4101 4102 KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]); 4103 KMP_ASSERT(KMP_UBER_GTID(gtid)); 4104 KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root); 4105 KMP_ASSERT(root->r.r_active == FALSE); 4106 4107 KMP_MB(); 4108 4109 kmp_info_t *thread = __kmp_threads[gtid]; 4110 kmp_team_t *team = thread->th.th_team; 4111 kmp_task_team_t *task_team = thread->th.th_task_team; 4112 4113 // we need to wait for the proxy tasks before finishing the thread 4114 if (task_team != NULL && (task_team->tt.tt_found_proxy_tasks || 4115 task_team->tt.tt_hidden_helper_task_encountered)) { 4116 #if OMPT_SUPPORT 4117 // the runtime is shutting down so we won't report any events 4118 thread->th.ompt_thread_info.state = ompt_state_undefined; 4119 #endif 4120 __kmp_task_team_wait(thread, team USE_ITT_BUILD_ARG(NULL)); 4121 } 4122 4123 __kmp_reset_root(gtid, root); 4124 4125 KMP_MB(); 4126 KC_TRACE(10, 4127 ("__kmp_unregister_root_current_thread: T#%d unregistered\n", gtid)); 4128 4129 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); 4130 } 4131 4132 #if KMP_OS_WINDOWS 4133 /* __kmp_forkjoin_lock must be already held 4134 Unregisters a root thread that is not the current thread. Returns the number 4135 of __kmp_threads entries freed as a result. 
*/ 4136 static int __kmp_unregister_root_other_thread(int gtid) { 4137 kmp_root_t *root = __kmp_root[gtid]; 4138 int r; 4139 4140 KA_TRACE(1, ("__kmp_unregister_root_other_thread: enter T#%d\n", gtid)); 4141 KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]); 4142 KMP_ASSERT(KMP_UBER_GTID(gtid)); 4143 KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root); 4144 KMP_ASSERT(root->r.r_active == FALSE); 4145 4146 r = __kmp_reset_root(gtid, root); 4147 KC_TRACE(10, 4148 ("__kmp_unregister_root_other_thread: T#%d unregistered\n", gtid)); 4149 return r; 4150 } 4151 #endif 4152 4153 #if KMP_DEBUG 4154 void __kmp_task_info() { 4155 4156 kmp_int32 gtid = __kmp_entry_gtid(); 4157 kmp_int32 tid = __kmp_tid_from_gtid(gtid); 4158 kmp_info_t *this_thr = __kmp_threads[gtid]; 4159 kmp_team_t *steam = this_thr->th.th_serial_team; 4160 kmp_team_t *team = this_thr->th.th_team; 4161 4162 __kmp_printf( 4163 "__kmp_task_info: gtid=%d tid=%d t_thread=%p team=%p steam=%p curtask=%p " 4164 "ptask=%p\n", 4165 gtid, tid, this_thr, team, steam, this_thr->th.th_current_task, 4166 team->t.t_implicit_task_taskdata[tid].td_parent); 4167 } 4168 #endif // KMP_DEBUG 4169 4170 /* TODO optimize with one big memclr, take out what isn't needed, split 4171 responsibility to workers as much as possible, and delay initialization of 4172 features as much as possible */ 4173 static void __kmp_initialize_info(kmp_info_t *this_thr, kmp_team_t *team, 4174 int tid, int gtid) { 4175 /* this_thr->th.th_info.ds.ds_gtid is setup in 4176 kmp_allocate_thread/create_worker. 4177 this_thr->th.th_serial_team is setup in __kmp_allocate_thread */ 4178 KMP_DEBUG_ASSERT(this_thr != NULL); 4179 KMP_DEBUG_ASSERT(this_thr->th.th_serial_team); 4180 KMP_DEBUG_ASSERT(team); 4181 KMP_DEBUG_ASSERT(team->t.t_threads); 4182 KMP_DEBUG_ASSERT(team->t.t_dispatch); 4183 kmp_info_t *master = team->t.t_threads[0]; 4184 KMP_DEBUG_ASSERT(master); 4185 KMP_DEBUG_ASSERT(master->th.th_root); 4186 4187 KMP_MB(); 4188 4189 TCW_SYNC_PTR(this_thr->th.th_team, team); 4190 4191 this_thr->th.th_info.ds.ds_tid = tid; 4192 this_thr->th.th_set_nproc = 0; 4193 if (__kmp_tasking_mode != tskm_immediate_exec) 4194 // When tasking is possible, threads are not safe to reap until they are 4195 // done tasking; this will be set when tasking code is exited in wait 4196 this_thr->th.th_reap_state = KMP_NOT_SAFE_TO_REAP; 4197 else // no tasking --> always safe to reap 4198 this_thr->th.th_reap_state = KMP_SAFE_TO_REAP; 4199 this_thr->th.th_set_proc_bind = proc_bind_default; 4200 #if KMP_AFFINITY_SUPPORTED 4201 this_thr->th.th_new_place = this_thr->th.th_current_place; 4202 #endif 4203 this_thr->th.th_root = master->th.th_root; 4204 4205 /* setup the thread's cache of the team structure */ 4206 this_thr->th.th_team_nproc = team->t.t_nproc; 4207 this_thr->th.th_team_master = master; 4208 this_thr->th.th_team_serialized = team->t.t_serialized; 4209 4210 KMP_DEBUG_ASSERT(team->t.t_implicit_task_taskdata); 4211 4212 KF_TRACE(10, ("__kmp_initialize_info1: T#%d:%d this_thread=%p curtask=%p\n", 4213 tid, gtid, this_thr, this_thr->th.th_current_task)); 4214 4215 __kmp_init_implicit_task(this_thr->th.th_team_master->th.th_ident, this_thr, 4216 team, tid, TRUE); 4217 4218 KF_TRACE(10, ("__kmp_initialize_info2: T#%d:%d this_thread=%p curtask=%p\n", 4219 tid, gtid, this_thr, this_thr->th.th_current_task)); 4220 // TODO: Initialize ICVs from parent; GEH - isn't that already done in 4221 // __kmp_initialize_team()? 
4222 4223 /* TODO no worksharing in speculative threads */ 4224 this_thr->th.th_dispatch = &team->t.t_dispatch[tid]; 4225 4226 this_thr->th.th_local.this_construct = 0; 4227 4228 if (!this_thr->th.th_pri_common) { 4229 this_thr->th.th_pri_common = 4230 (struct common_table *)__kmp_allocate(sizeof(struct common_table)); 4231 if (__kmp_storage_map) { 4232 __kmp_print_storage_map_gtid( 4233 gtid, this_thr->th.th_pri_common, this_thr->th.th_pri_common + 1, 4234 sizeof(struct common_table), "th_%d.th_pri_common\n", gtid); 4235 } 4236 this_thr->th.th_pri_head = NULL; 4237 } 4238 4239 if (this_thr != master && // Primary thread's CG root is initialized elsewhere 4240 this_thr->th.th_cg_roots != master->th.th_cg_roots) { // CG root not set 4241 // Make new thread's CG root same as primary thread's 4242 KMP_DEBUG_ASSERT(master->th.th_cg_roots); 4243 kmp_cg_root_t *tmp = this_thr->th.th_cg_roots; 4244 if (tmp) { 4245 // worker changes CG, need to check if old CG should be freed 4246 int i = tmp->cg_nthreads--; 4247 KA_TRACE(100, ("__kmp_initialize_info: Thread %p decrement cg_nthreads" 4248 " on node %p of thread %p to %d\n", 4249 this_thr, tmp, tmp->cg_root, tmp->cg_nthreads)); 4250 if (i == 1) { 4251 __kmp_free(tmp); // last thread left CG --> free it 4252 } 4253 } 4254 this_thr->th.th_cg_roots = master->th.th_cg_roots; 4255 // Increment new thread's CG root's counter to add the new thread 4256 this_thr->th.th_cg_roots->cg_nthreads++; 4257 KA_TRACE(100, ("__kmp_initialize_info: Thread %p increment cg_nthreads on" 4258 " node %p of thread %p to %d\n", 4259 this_thr, this_thr->th.th_cg_roots, 4260 this_thr->th.th_cg_roots->cg_root, 4261 this_thr->th.th_cg_roots->cg_nthreads)); 4262 this_thr->th.th_current_task->td_icvs.thread_limit = 4263 this_thr->th.th_cg_roots->cg_thread_limit; 4264 } 4265 4266 /* Initialize dynamic dispatch */ 4267 { 4268 volatile kmp_disp_t *dispatch = this_thr->th.th_dispatch; 4269 // Use team max_nproc since this will never change for the team. 4270 size_t disp_size = 4271 sizeof(dispatch_private_info_t) * 4272 (team->t.t_max_nproc == 1 ? 1 : __kmp_dispatch_num_buffers); 4273 KD_TRACE(10, ("__kmp_initialize_info: T#%d max_nproc: %d\n", gtid, 4274 team->t.t_max_nproc)); 4275 KMP_ASSERT(dispatch); 4276 KMP_DEBUG_ASSERT(team->t.t_dispatch); 4277 KMP_DEBUG_ASSERT(dispatch == &team->t.t_dispatch[tid]); 4278 4279 dispatch->th_disp_index = 0; 4280 dispatch->th_doacross_buf_idx = 0; 4281 if (!dispatch->th_disp_buffer) { 4282 dispatch->th_disp_buffer = 4283 (dispatch_private_info_t *)__kmp_allocate(disp_size); 4284 4285 if (__kmp_storage_map) { 4286 __kmp_print_storage_map_gtid( 4287 gtid, &dispatch->th_disp_buffer[0], 4288 &dispatch->th_disp_buffer[team->t.t_max_nproc == 1 4289 ? 
1 4290 : __kmp_dispatch_num_buffers], 4291 disp_size, 4292 "th_%d.th_dispatch.th_disp_buffer " 4293 "(team_%d.t_dispatch[%d].th_disp_buffer)", 4294 gtid, team->t.t_id, gtid); 4295 } 4296 } else { 4297 memset(&dispatch->th_disp_buffer[0], '\0', disp_size); 4298 } 4299 4300 dispatch->th_dispatch_pr_current = 0; 4301 dispatch->th_dispatch_sh_current = 0; 4302 4303 dispatch->th_deo_fcn = 0; /* ORDERED */ 4304 dispatch->th_dxo_fcn = 0; /* END ORDERED */ 4305 } 4306 4307 this_thr->th.th_next_pool = NULL; 4308 4309 if (!this_thr->th.th_task_state_memo_stack) { 4310 size_t i; 4311 this_thr->th.th_task_state_memo_stack = 4312 (kmp_uint8 *)__kmp_allocate(4 * sizeof(kmp_uint8)); 4313 this_thr->th.th_task_state_top = 0; 4314 this_thr->th.th_task_state_stack_sz = 4; 4315 for (i = 0; i < this_thr->th.th_task_state_stack_sz; 4316 ++i) // zero init the stack 4317 this_thr->th.th_task_state_memo_stack[i] = 0; 4318 } 4319 4320 KMP_DEBUG_ASSERT(!this_thr->th.th_spin_here); 4321 KMP_DEBUG_ASSERT(this_thr->th.th_next_waiting == 0); 4322 4323 KMP_MB(); 4324 } 4325 4326 /* allocate a new thread for the requesting team. this is only called from 4327 within a forkjoin critical section. we will first try to get an available 4328 thread from the thread pool. if none is available, we will fork a new one 4329 assuming we are able to create a new one. this should be assured, as the 4330 caller should check on this first. */ 4331 kmp_info_t *__kmp_allocate_thread(kmp_root_t *root, kmp_team_t *team, 4332 int new_tid) { 4333 kmp_team_t *serial_team; 4334 kmp_info_t *new_thr; 4335 int new_gtid; 4336 4337 KA_TRACE(20, ("__kmp_allocate_thread: T#%d\n", __kmp_get_gtid())); 4338 KMP_DEBUG_ASSERT(root && team); 4339 #if !KMP_NESTED_HOT_TEAMS 4340 KMP_DEBUG_ASSERT(KMP_MASTER_GTID(__kmp_get_gtid())); 4341 #endif 4342 KMP_MB(); 4343 4344 /* first, try to get one from the thread pool */ 4345 if (__kmp_thread_pool) { 4346 new_thr = CCAST(kmp_info_t *, __kmp_thread_pool); 4347 __kmp_thread_pool = (volatile kmp_info_t *)new_thr->th.th_next_pool; 4348 if (new_thr == __kmp_thread_pool_insert_pt) { 4349 __kmp_thread_pool_insert_pt = NULL; 4350 } 4351 TCW_4(new_thr->th.th_in_pool, FALSE); 4352 __kmp_suspend_initialize_thread(new_thr); 4353 __kmp_lock_suspend_mx(new_thr); 4354 if (new_thr->th.th_active_in_pool == TRUE) { 4355 KMP_DEBUG_ASSERT(new_thr->th.th_active == TRUE); 4356 KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth); 4357 new_thr->th.th_active_in_pool = FALSE; 4358 } 4359 __kmp_unlock_suspend_mx(new_thr); 4360 4361 KA_TRACE(20, ("__kmp_allocate_thread: T#%d using thread T#%d\n", 4362 __kmp_get_gtid(), new_thr->th.th_info.ds.ds_gtid)); 4363 KMP_ASSERT(!new_thr->th.th_team); 4364 KMP_DEBUG_ASSERT(__kmp_nth < __kmp_threads_capacity); 4365 4366 /* setup the thread structure */ 4367 __kmp_initialize_info(new_thr, team, new_tid, 4368 new_thr->th.th_info.ds.ds_gtid); 4369 KMP_DEBUG_ASSERT(new_thr->th.th_serial_team); 4370 4371 TCW_4(__kmp_nth, __kmp_nth + 1); 4372 4373 new_thr->th.th_task_state = 0; 4374 new_thr->th.th_task_state_top = 0; 4375 new_thr->th.th_task_state_stack_sz = 4; 4376 4377 if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) { 4378 // Make sure pool thread has transitioned to waiting on own thread struct 4379 KMP_DEBUG_ASSERT(new_thr->th.th_used_in_team.load() == 0); 4380 // Thread activated in __kmp_allocate_team when increasing team size 4381 } 4382 4383 #ifdef KMP_ADJUST_BLOCKTIME 4384 /* Adjust blocktime back to zero if necessary */ 4385 /* Middle initialization might not have occurred yet */ 4386 
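// If there are now more OpenMP threads than available processors, busy-waiting
// at barriers would oversubscribe the machine, so the effective blocktime is
// forced to zero (threads suspend instead of spinning) unless the user set
// KMP_BLOCKTIME explicitly.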
if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) { 4387 if (__kmp_nth > __kmp_avail_proc) { 4388 __kmp_zero_bt = TRUE; 4389 } 4390 } 4391 #endif /* KMP_ADJUST_BLOCKTIME */ 4392 4393 #if KMP_DEBUG 4394 // If thread entered pool via __kmp_free_thread, wait_flag should != 4395 // KMP_BARRIER_PARENT_FLAG. 4396 int b; 4397 kmp_balign_t *balign = new_thr->th.th_bar; 4398 for (b = 0; b < bs_last_barrier; ++b) 4399 KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG); 4400 #endif 4401 4402 KF_TRACE(10, ("__kmp_allocate_thread: T#%d using thread %p T#%d\n", 4403 __kmp_get_gtid(), new_thr, new_thr->th.th_info.ds.ds_gtid)); 4404 4405 KMP_MB(); 4406 return new_thr; 4407 } 4408 4409 /* no, well fork a new one */ 4410 KMP_ASSERT(__kmp_nth == __kmp_all_nth); 4411 KMP_ASSERT(__kmp_all_nth < __kmp_threads_capacity); 4412 4413 #if KMP_USE_MONITOR 4414 // If this is the first worker thread the RTL is creating, then also 4415 // launch the monitor thread. We try to do this as early as possible. 4416 if (!TCR_4(__kmp_init_monitor)) { 4417 __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock); 4418 if (!TCR_4(__kmp_init_monitor)) { 4419 KF_TRACE(10, ("before __kmp_create_monitor\n")); 4420 TCW_4(__kmp_init_monitor, 1); 4421 __kmp_create_monitor(&__kmp_monitor); 4422 KF_TRACE(10, ("after __kmp_create_monitor\n")); 4423 #if KMP_OS_WINDOWS 4424 // AC: wait until monitor has started. This is a fix for CQ232808. 4425 // The reason is that if the library is loaded/unloaded in a loop with 4426 // small (parallel) work in between, then there is high probability that 4427 // monitor thread started after the library shutdown. At shutdown it is 4428 // too late to cope with the problem, because when the primary thread is 4429 // in DllMain (process detach) the monitor has no chances to start (it is 4430 // blocked), and primary thread has no means to inform the monitor that 4431 // the library has gone, because all the memory which the monitor can 4432 // access is going to be released/reset. 4433 while (TCR_4(__kmp_init_monitor) < 2) { 4434 KMP_YIELD(TRUE); 4435 } 4436 KF_TRACE(10, ("after monitor thread has started\n")); 4437 #endif 4438 } 4439 __kmp_release_bootstrap_lock(&__kmp_monitor_lock); 4440 } 4441 #endif 4442 4443 KMP_MB(); 4444 4445 { 4446 int new_start_gtid = TCR_4(__kmp_init_hidden_helper_threads) 4447 ? 1 4448 : __kmp_hidden_helper_threads_num + 1; 4449 4450 for (new_gtid = new_start_gtid; TCR_PTR(__kmp_threads[new_gtid]) != NULL; 4451 ++new_gtid) { 4452 KMP_DEBUG_ASSERT(new_gtid < __kmp_threads_capacity); 4453 } 4454 4455 if (TCR_4(__kmp_init_hidden_helper_threads)) { 4456 KMP_DEBUG_ASSERT(new_gtid <= __kmp_hidden_helper_threads_num); 4457 } 4458 } 4459 4460 /* allocate space for it. 
*/ 4461 new_thr = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t)); 4462 4463 TCW_SYNC_PTR(__kmp_threads[new_gtid], new_thr); 4464 4465 #if USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG 4466 // suppress race conditions detection on synchronization flags in debug mode 4467 // this helps to analyze library internals eliminating false positives 4468 __itt_suppress_mark_range( 4469 __itt_suppress_range, __itt_suppress_threading_errors, 4470 &new_thr->th.th_sleep_loc, sizeof(new_thr->th.th_sleep_loc)); 4471 __itt_suppress_mark_range( 4472 __itt_suppress_range, __itt_suppress_threading_errors, 4473 &new_thr->th.th_reap_state, sizeof(new_thr->th.th_reap_state)); 4474 #if KMP_OS_WINDOWS 4475 __itt_suppress_mark_range( 4476 __itt_suppress_range, __itt_suppress_threading_errors, 4477 &new_thr->th.th_suspend_init, sizeof(new_thr->th.th_suspend_init)); 4478 #else 4479 __itt_suppress_mark_range(__itt_suppress_range, 4480 __itt_suppress_threading_errors, 4481 &new_thr->th.th_suspend_init_count, 4482 sizeof(new_thr->th.th_suspend_init_count)); 4483 #endif 4484 // TODO: check if we need to also suppress b_arrived flags 4485 __itt_suppress_mark_range(__itt_suppress_range, 4486 __itt_suppress_threading_errors, 4487 CCAST(kmp_uint64 *, &new_thr->th.th_bar[0].bb.b_go), 4488 sizeof(new_thr->th.th_bar[0].bb.b_go)); 4489 __itt_suppress_mark_range(__itt_suppress_range, 4490 __itt_suppress_threading_errors, 4491 CCAST(kmp_uint64 *, &new_thr->th.th_bar[1].bb.b_go), 4492 sizeof(new_thr->th.th_bar[1].bb.b_go)); 4493 __itt_suppress_mark_range(__itt_suppress_range, 4494 __itt_suppress_threading_errors, 4495 CCAST(kmp_uint64 *, &new_thr->th.th_bar[2].bb.b_go), 4496 sizeof(new_thr->th.th_bar[2].bb.b_go)); 4497 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG */ 4498 if (__kmp_storage_map) { 4499 __kmp_print_thread_storage_map(new_thr, new_gtid); 4500 } 4501 4502 // add the reserve serialized team, initialized from the team's primary thread 4503 { 4504 kmp_internal_control_t r_icvs = __kmp_get_x_global_icvs(team); 4505 KF_TRACE(10, ("__kmp_allocate_thread: before th_serial/serial_team\n")); 4506 new_thr->th.th_serial_team = serial_team = 4507 (kmp_team_t *)__kmp_allocate_team(root, 1, 1, 4508 #if OMPT_SUPPORT 4509 ompt_data_none, // root parallel id 4510 #endif 4511 proc_bind_default, &r_icvs, 4512 0 USE_NESTED_HOT_ARG(NULL)); 4513 } 4514 KMP_ASSERT(serial_team); 4515 serial_team->t.t_serialized = 0; // AC: the team created in reserve, not for 4516 // execution (it is unused for now). 
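// As in the root-registration path above, the reserve serial team's single
// thread slot is pointed at the newly created thread.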
4517 serial_team->t.t_threads[0] = new_thr; 4518 KF_TRACE(10, 4519 ("__kmp_allocate_thread: after th_serial/serial_team : new_thr=%p\n", 4520 new_thr)); 4521 4522 /* setup the thread structures */ 4523 __kmp_initialize_info(new_thr, team, new_tid, new_gtid); 4524 4525 #if USE_FAST_MEMORY 4526 __kmp_initialize_fast_memory(new_thr); 4527 #endif /* USE_FAST_MEMORY */ 4528 4529 #if KMP_USE_BGET 4530 KMP_DEBUG_ASSERT(new_thr->th.th_local.bget_data == NULL); 4531 __kmp_initialize_bget(new_thr); 4532 #endif 4533 4534 __kmp_init_random(new_thr); // Initialize random number generator 4535 4536 /* Initialize these only once when thread is grabbed for a team allocation */ 4537 KA_TRACE(20, 4538 ("__kmp_allocate_thread: T#%d init go fork=%u, plain=%u\n", 4539 __kmp_get_gtid(), KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE)); 4540 4541 int b; 4542 kmp_balign_t *balign = new_thr->th.th_bar; 4543 for (b = 0; b < bs_last_barrier; ++b) { 4544 balign[b].bb.b_go = KMP_INIT_BARRIER_STATE; 4545 balign[b].bb.team = NULL; 4546 balign[b].bb.wait_flag = KMP_BARRIER_NOT_WAITING; 4547 balign[b].bb.use_oncore_barrier = 0; 4548 } 4549 4550 TCW_PTR(new_thr->th.th_sleep_loc, NULL); 4551 new_thr->th.th_sleep_loc_type = flag_unset; 4552 4553 new_thr->th.th_spin_here = FALSE; 4554 new_thr->th.th_next_waiting = 0; 4555 #if KMP_OS_UNIX 4556 new_thr->th.th_blocking = false; 4557 #endif 4558 4559 #if KMP_AFFINITY_SUPPORTED 4560 new_thr->th.th_current_place = KMP_PLACE_UNDEFINED; 4561 new_thr->th.th_new_place = KMP_PLACE_UNDEFINED; 4562 new_thr->th.th_first_place = KMP_PLACE_UNDEFINED; 4563 new_thr->th.th_last_place = KMP_PLACE_UNDEFINED; 4564 #endif 4565 new_thr->th.th_def_allocator = __kmp_def_allocator; 4566 new_thr->th.th_prev_level = 0; 4567 new_thr->th.th_prev_num_threads = 1; 4568 4569 TCW_4(new_thr->th.th_in_pool, FALSE); 4570 new_thr->th.th_active_in_pool = FALSE; 4571 TCW_4(new_thr->th.th_active, TRUE); 4572 4573 /* adjust the global counters */ 4574 __kmp_all_nth++; 4575 __kmp_nth++; 4576 4577 // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low 4578 // numbers of procs, and method #2 (keyed API call) for higher numbers. 4579 if (__kmp_adjust_gtid_mode) { 4580 if (__kmp_all_nth >= __kmp_tls_gtid_min) { 4581 if (TCR_4(__kmp_gtid_mode) != 2) { 4582 TCW_4(__kmp_gtid_mode, 2); 4583 } 4584 } else { 4585 if (TCR_4(__kmp_gtid_mode) != 1) { 4586 TCW_4(__kmp_gtid_mode, 1); 4587 } 4588 } 4589 } 4590 4591 #ifdef KMP_ADJUST_BLOCKTIME 4592 /* Adjust blocktime back to zero if necessary */ 4593 /* Middle initialization might not have occurred yet */ 4594 if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) { 4595 if (__kmp_nth > __kmp_avail_proc) { 4596 __kmp_zero_bt = TRUE; 4597 } 4598 } 4599 #endif /* KMP_ADJUST_BLOCKTIME */ 4600 4601 /* actually fork it and create the new worker thread */ 4602 KF_TRACE( 4603 10, ("__kmp_allocate_thread: before __kmp_create_worker: %p\n", new_thr)); 4604 __kmp_create_worker(new_gtid, new_thr, __kmp_stksize); 4605 KF_TRACE(10, 4606 ("__kmp_allocate_thread: after __kmp_create_worker: %p\n", new_thr)); 4607 4608 KA_TRACE(20, ("__kmp_allocate_thread: T#%d forked T#%d\n", __kmp_get_gtid(), 4609 new_gtid)); 4610 KMP_MB(); 4611 return new_thr; 4612 } 4613 4614 /* Reinitialize team for reuse. 4615 The hot team code calls this case at every fork barrier, so EPCC barrier 4616 test are extremely sensitive to changes in it, esp. writes to the team 4617 struct, which cause a cache invalidation in all threads. 
4618 IF YOU TOUCH THIS ROUTINE, RUN EPCC C SYNCBENCH ON A BIG-IRON MACHINE!!! */ 4619 static void __kmp_reinitialize_team(kmp_team_t *team, 4620 kmp_internal_control_t *new_icvs, 4621 ident_t *loc) { 4622 KF_TRACE(10, ("__kmp_reinitialize_team: enter this_thread=%p team=%p\n", 4623 team->t.t_threads[0], team)); 4624 KMP_DEBUG_ASSERT(team && new_icvs); 4625 KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc); 4626 KMP_CHECK_UPDATE(team->t.t_ident, loc); 4627 4628 KMP_CHECK_UPDATE(team->t.t_id, KMP_GEN_TEAM_ID()); 4629 // Copy ICVs to the primary thread's implicit taskdata 4630 __kmp_init_implicit_task(loc, team->t.t_threads[0], team, 0, FALSE); 4631 copy_icvs(&team->t.t_implicit_task_taskdata[0].td_icvs, new_icvs); 4632 4633 KF_TRACE(10, ("__kmp_reinitialize_team: exit this_thread=%p team=%p\n", 4634 team->t.t_threads[0], team)); 4635 } 4636 4637 /* Initialize the team data structure. 4638 This assumes the t_threads and t_max_nproc are already set. 4639 Also, we don't touch the arguments */ 4640 static void __kmp_initialize_team(kmp_team_t *team, int new_nproc, 4641 kmp_internal_control_t *new_icvs, 4642 ident_t *loc) { 4643 KF_TRACE(10, ("__kmp_initialize_team: enter: team=%p\n", team)); 4644 4645 /* verify */ 4646 KMP_DEBUG_ASSERT(team); 4647 KMP_DEBUG_ASSERT(new_nproc <= team->t.t_max_nproc); 4648 KMP_DEBUG_ASSERT(team->t.t_threads); 4649 KMP_MB(); 4650 4651 team->t.t_master_tid = 0; /* not needed */ 4652 /* team->t.t_master_bar; not needed */ 4653 team->t.t_serialized = new_nproc > 1 ? 0 : 1; 4654 team->t.t_nproc = new_nproc; 4655 4656 /* team->t.t_parent = NULL; TODO not needed & would mess up hot team */ 4657 team->t.t_next_pool = NULL; 4658 /* memset( team->t.t_threads, 0, sizeof(kmp_info_t*)*new_nproc ); would mess 4659 * up hot team */ 4660 4661 TCW_SYNC_PTR(team->t.t_pkfn, NULL); /* not needed */ 4662 team->t.t_invoke = NULL; /* not needed */ 4663 4664 // TODO???: team->t.t_max_active_levels = new_max_active_levels; 4665 team->t.t_sched.sched = new_icvs->sched.sched; 4666 4667 #if KMP_ARCH_X86 || KMP_ARCH_X86_64 4668 team->t.t_fp_control_saved = FALSE; /* not needed */ 4669 team->t.t_x87_fpu_control_word = 0; /* not needed */ 4670 team->t.t_mxcsr = 0; /* not needed */ 4671 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */ 4672 4673 team->t.t_construct = 0; 4674 4675 team->t.t_ordered.dt.t_value = 0; 4676 team->t.t_master_active = FALSE; 4677 4678 #ifdef KMP_DEBUG 4679 team->t.t_copypriv_data = NULL; /* not necessary, but nice for debugging */ 4680 #endif 4681 #if KMP_OS_WINDOWS 4682 team->t.t_copyin_counter = 0; /* for barrier-free copyin implementation */ 4683 #endif 4684 4685 team->t.t_control_stack_top = NULL; 4686 4687 __kmp_reinitialize_team(team, new_icvs, loc); 4688 4689 KMP_MB(); 4690 KF_TRACE(10, ("__kmp_initialize_team: exit: team=%p\n", team)); 4691 } 4692 4693 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED 4694 /* Sets full mask for thread and returns old mask, no changes to structures. 
*/ 4695 static void 4696 __kmp_set_thread_affinity_mask_full_tmp(kmp_affin_mask_t *old_mask) { 4697 if (KMP_AFFINITY_CAPABLE()) { 4698 int status; 4699 if (old_mask != NULL) { 4700 status = __kmp_get_system_affinity(old_mask, TRUE); 4701 int error = errno; 4702 if (status != 0) { 4703 __kmp_fatal(KMP_MSG(ChangeThreadAffMaskError), KMP_ERR(error), 4704 __kmp_msg_null); 4705 } 4706 } 4707 __kmp_set_system_affinity(__kmp_affin_fullMask, TRUE); 4708 } 4709 } 4710 #endif 4711 4712 #if KMP_AFFINITY_SUPPORTED 4713 4714 // __kmp_partition_places() is the heart of the OpenMP 4.0 affinity mechanism. 4715 // It calculates the worker + primary thread's partition based upon the parent 4716 // thread's partition, and binds each worker to a thread in their partition. 4717 // The primary thread's partition should already include its current binding. 4718 static void __kmp_partition_places(kmp_team_t *team, int update_master_only) { 4719 // Do not partition places for the hidden helper team 4720 if (KMP_HIDDEN_HELPER_TEAM(team)) 4721 return; 4722 // Copy the primary thread's place partition to the team struct 4723 kmp_info_t *master_th = team->t.t_threads[0]; 4724 KMP_DEBUG_ASSERT(master_th != NULL); 4725 kmp_proc_bind_t proc_bind = team->t.t_proc_bind; 4726 int first_place = master_th->th.th_first_place; 4727 int last_place = master_th->th.th_last_place; 4728 int masters_place = master_th->th.th_current_place; 4729 team->t.t_first_place = first_place; 4730 team->t.t_last_place = last_place; 4731 4732 KA_TRACE(20, ("__kmp_partition_places: enter: proc_bind = %d T#%d(%d:0) " 4733 "bound to place %d partition = [%d,%d]\n", 4734 proc_bind, __kmp_gtid_from_thread(team->t.t_threads[0]), 4735 team->t.t_id, masters_place, first_place, last_place)); 4736 4737 switch (proc_bind) { 4738 4739 case proc_bind_default: 4740 // Serial teams might have the proc_bind policy set to proc_bind_default. 4741 // Not an issue -- we don't rebind primary thread for any proc_bind policy. 
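// A serialized team has exactly one thread (the primary), so there is nothing
// to partition here; the assertion below documents that expectation.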
4742 KMP_DEBUG_ASSERT(team->t.t_nproc == 1); 4743 break; 4744 4745 case proc_bind_primary: { 4746 int f; 4747 int n_th = team->t.t_nproc; 4748 for (f = 1; f < n_th; f++) { 4749 kmp_info_t *th = team->t.t_threads[f]; 4750 KMP_DEBUG_ASSERT(th != NULL); 4751 th->th.th_first_place = first_place; 4752 th->th.th_last_place = last_place; 4753 th->th.th_new_place = masters_place; 4754 if (__kmp_display_affinity && masters_place != th->th.th_current_place && 4755 team->t.t_display_affinity != 1) { 4756 team->t.t_display_affinity = 1; 4757 } 4758 4759 KA_TRACE(100, ("__kmp_partition_places: primary: T#%d(%d:%d) place %d " 4760 "partition = [%d,%d]\n", 4761 __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id, 4762 f, masters_place, first_place, last_place)); 4763 } 4764 } break; 4765 4766 case proc_bind_close: { 4767 int f; 4768 int n_th = team->t.t_nproc; 4769 int n_places; 4770 if (first_place <= last_place) { 4771 n_places = last_place - first_place + 1; 4772 } else { 4773 n_places = __kmp_affinity_num_masks - first_place + last_place + 1; 4774 } 4775 if (n_th <= n_places) { 4776 int place = masters_place; 4777 for (f = 1; f < n_th; f++) { 4778 kmp_info_t *th = team->t.t_threads[f]; 4779 KMP_DEBUG_ASSERT(th != NULL); 4780 4781 if (place == last_place) { 4782 place = first_place; 4783 } else if (place == (int)(__kmp_affinity_num_masks - 1)) { 4784 place = 0; 4785 } else { 4786 place++; 4787 } 4788 th->th.th_first_place = first_place; 4789 th->th.th_last_place = last_place; 4790 th->th.th_new_place = place; 4791 if (__kmp_display_affinity && place != th->th.th_current_place && 4792 team->t.t_display_affinity != 1) { 4793 team->t.t_display_affinity = 1; 4794 } 4795 4796 KA_TRACE(100, ("__kmp_partition_places: close: T#%d(%d:%d) place %d " 4797 "partition = [%d,%d]\n", 4798 __kmp_gtid_from_thread(team->t.t_threads[f]), 4799 team->t.t_id, f, place, first_place, last_place)); 4800 } 4801 } else { 4802 int S, rem, gap, s_count; 4803 S = n_th / n_places; 4804 s_count = 0; 4805 rem = n_th - (S * n_places); 4806 gap = rem > 0 ? 
n_places / rem : n_places; 4807 int place = masters_place; 4808 int gap_ct = gap; 4809 for (f = 0; f < n_th; f++) { 4810 kmp_info_t *th = team->t.t_threads[f]; 4811 KMP_DEBUG_ASSERT(th != NULL); 4812 4813 th->th.th_first_place = first_place; 4814 th->th.th_last_place = last_place; 4815 th->th.th_new_place = place; 4816 if (__kmp_display_affinity && place != th->th.th_current_place && 4817 team->t.t_display_affinity != 1) { 4818 team->t.t_display_affinity = 1; 4819 } 4820 s_count++; 4821 4822 if ((s_count == S) && rem && (gap_ct == gap)) { 4823 // do nothing, add an extra thread to place on next iteration 4824 } else if ((s_count == S + 1) && rem && (gap_ct == gap)) { 4825 // we added an extra thread to this place; move to next place 4826 if (place == last_place) { 4827 place = first_place; 4828 } else if (place == (int)(__kmp_affinity_num_masks - 1)) { 4829 place = 0; 4830 } else { 4831 place++; 4832 } 4833 s_count = 0; 4834 gap_ct = 1; 4835 rem--; 4836 } else if (s_count == S) { // place full; don't add extra 4837 if (place == last_place) { 4838 place = first_place; 4839 } else if (place == (int)(__kmp_affinity_num_masks - 1)) { 4840 place = 0; 4841 } else { 4842 place++; 4843 } 4844 gap_ct++; 4845 s_count = 0; 4846 } 4847 4848 KA_TRACE(100, 4849 ("__kmp_partition_places: close: T#%d(%d:%d) place %d " 4850 "partition = [%d,%d]\n", 4851 __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id, f, 4852 th->th.th_new_place, first_place, last_place)); 4853 } 4854 KMP_DEBUG_ASSERT(place == masters_place); 4855 } 4856 } break; 4857 4858 case proc_bind_spread: { 4859 int f; 4860 int n_th = team->t.t_nproc; 4861 int n_places; 4862 int thidx; 4863 if (first_place <= last_place) { 4864 n_places = last_place - first_place + 1; 4865 } else { 4866 n_places = __kmp_affinity_num_masks - first_place + last_place + 1; 4867 } 4868 if (n_th <= n_places) { 4869 int place = -1; 4870 4871 if (n_places != static_cast<int>(__kmp_affinity_num_masks)) { 4872 int S = n_places / n_th; 4873 int s_count, rem, gap, gap_ct; 4874 4875 place = masters_place; 4876 rem = n_places - n_th * S; 4877 gap = rem ? 
n_th / rem : 1; 4878 gap_ct = gap; 4879 thidx = n_th; 4880 if (update_master_only == 1) 4881 thidx = 1; 4882 for (f = 0; f < thidx; f++) { 4883 kmp_info_t *th = team->t.t_threads[f]; 4884 KMP_DEBUG_ASSERT(th != NULL); 4885 4886 th->th.th_first_place = place; 4887 th->th.th_new_place = place; 4888 if (__kmp_display_affinity && place != th->th.th_current_place && 4889 team->t.t_display_affinity != 1) { 4890 team->t.t_display_affinity = 1; 4891 } 4892 s_count = 1; 4893 while (s_count < S) { 4894 if (place == last_place) { 4895 place = first_place; 4896 } else if (place == (int)(__kmp_affinity_num_masks - 1)) { 4897 place = 0; 4898 } else { 4899 place++; 4900 } 4901 s_count++; 4902 } 4903 if (rem && (gap_ct == gap)) { 4904 if (place == last_place) { 4905 place = first_place; 4906 } else if (place == (int)(__kmp_affinity_num_masks - 1)) { 4907 place = 0; 4908 } else { 4909 place++; 4910 } 4911 rem--; 4912 gap_ct = 0; 4913 } 4914 th->th.th_last_place = place; 4915 gap_ct++; 4916 4917 if (place == last_place) { 4918 place = first_place; 4919 } else if (place == (int)(__kmp_affinity_num_masks - 1)) { 4920 place = 0; 4921 } else { 4922 place++; 4923 } 4924 4925 KA_TRACE(100, 4926 ("__kmp_partition_places: spread: T#%d(%d:%d) place %d " 4927 "partition = [%d,%d], __kmp_affinity_num_masks: %u\n", 4928 __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id, 4929 f, th->th.th_new_place, th->th.th_first_place, 4930 th->th.th_last_place, __kmp_affinity_num_masks)); 4931 } 4932 } else { 4933 /* Having uniform space of available computation places I can create 4934 T partitions of round(P/T) size and put threads into the first 4935 place of each partition. */ 4936 double current = static_cast<double>(masters_place); 4937 double spacing = 4938 (static_cast<double>(n_places + 1) / static_cast<double>(n_th)); 4939 int first, last; 4940 kmp_info_t *th; 4941 4942 thidx = n_th + 1; 4943 if (update_master_only == 1) 4944 thidx = 1; 4945 for (f = 0; f < thidx; f++) { 4946 first = static_cast<int>(current); 4947 last = static_cast<int>(current + spacing) - 1; 4948 KMP_DEBUG_ASSERT(last >= first); 4949 if (first >= n_places) { 4950 if (masters_place) { 4951 first -= n_places; 4952 last -= n_places; 4953 if (first == (masters_place + 1)) { 4954 KMP_DEBUG_ASSERT(f == n_th); 4955 first--; 4956 } 4957 if (last == masters_place) { 4958 KMP_DEBUG_ASSERT(f == (n_th - 1)); 4959 last--; 4960 } 4961 } else { 4962 KMP_DEBUG_ASSERT(f == n_th); 4963 first = 0; 4964 last = 0; 4965 } 4966 } 4967 if (last >= n_places) { 4968 last = (n_places - 1); 4969 } 4970 place = first; 4971 current += spacing; 4972 if (f < n_th) { 4973 KMP_DEBUG_ASSERT(0 <= first); 4974 KMP_DEBUG_ASSERT(n_places > first); 4975 KMP_DEBUG_ASSERT(0 <= last); 4976 KMP_DEBUG_ASSERT(n_places > last); 4977 KMP_DEBUG_ASSERT(last_place >= first_place); 4978 th = team->t.t_threads[f]; 4979 KMP_DEBUG_ASSERT(th); 4980 th->th.th_first_place = first; 4981 th->th.th_new_place = place; 4982 th->th.th_last_place = last; 4983 if (__kmp_display_affinity && place != th->th.th_current_place && 4984 team->t.t_display_affinity != 1) { 4985 team->t.t_display_affinity = 1; 4986 } 4987 KA_TRACE(100, 4988 ("__kmp_partition_places: spread: T#%d(%d:%d) place %d " 4989 "partition = [%d,%d], spacing = %.4f\n", 4990 __kmp_gtid_from_thread(team->t.t_threads[f]), 4991 team->t.t_id, f, th->th.th_new_place, 4992 th->th.th_first_place, th->th.th_last_place, spacing)); 4993 } 4994 } 4995 } 4996 KMP_DEBUG_ASSERT(update_master_only || place == masters_place); 4997 } else { 4998 int S, rem, gap, 
s_count; 4999 S = n_th / n_places; 5000 s_count = 0; 5001 rem = n_th - (S * n_places); 5002 gap = rem > 0 ? n_places / rem : n_places; 5003 int place = masters_place; 5004 int gap_ct = gap; 5005 thidx = n_th; 5006 if (update_master_only == 1) 5007 thidx = 1; 5008 for (f = 0; f < thidx; f++) { 5009 kmp_info_t *th = team->t.t_threads[f]; 5010 KMP_DEBUG_ASSERT(th != NULL); 5011 5012 th->th.th_first_place = place; 5013 th->th.th_last_place = place; 5014 th->th.th_new_place = place; 5015 if (__kmp_display_affinity && place != th->th.th_current_place && 5016 team->t.t_display_affinity != 1) { 5017 team->t.t_display_affinity = 1; 5018 } 5019 s_count++; 5020 5021 if ((s_count == S) && rem && (gap_ct == gap)) { 5022 // do nothing, add an extra thread to place on next iteration 5023 } else if ((s_count == S + 1) && rem && (gap_ct == gap)) { 5024 // we added an extra thread to this place; move on to next place 5025 if (place == last_place) { 5026 place = first_place; 5027 } else if (place == (int)(__kmp_affinity_num_masks - 1)) { 5028 place = 0; 5029 } else { 5030 place++; 5031 } 5032 s_count = 0; 5033 gap_ct = 1; 5034 rem--; 5035 } else if (s_count == S) { // place is full; don't add extra thread 5036 if (place == last_place) { 5037 place = first_place; 5038 } else if (place == (int)(__kmp_affinity_num_masks - 1)) { 5039 place = 0; 5040 } else { 5041 place++; 5042 } 5043 gap_ct++; 5044 s_count = 0; 5045 } 5046 5047 KA_TRACE(100, ("__kmp_partition_places: spread: T#%d(%d:%d) place %d " 5048 "partition = [%d,%d]\n", 5049 __kmp_gtid_from_thread(team->t.t_threads[f]), 5050 team->t.t_id, f, th->th.th_new_place, 5051 th->th.th_first_place, th->th.th_last_place)); 5052 } 5053 KMP_DEBUG_ASSERT(update_master_only || place == masters_place); 5054 } 5055 } break; 5056 5057 default: 5058 break; 5059 } 5060 5061 KA_TRACE(20, ("__kmp_partition_places: exit T#%d\n", team->t.t_id)); 5062 } 5063 5064 #endif // KMP_AFFINITY_SUPPORTED 5065 5066 /* allocate a new team data structure to use. take one off of the free pool if 5067 available */ 5068 kmp_team_t * 5069 __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc, 5070 #if OMPT_SUPPORT 5071 ompt_data_t ompt_parallel_data, 5072 #endif 5073 kmp_proc_bind_t new_proc_bind, 5074 kmp_internal_control_t *new_icvs, 5075 int argc USE_NESTED_HOT_ARG(kmp_info_t *master)) { 5076 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_allocate_team); 5077 int f; 5078 kmp_team_t *team; 5079 int use_hot_team = !root->r.r_active; 5080 int level = 0; 5081 int do_place_partition = 1; 5082 5083 KA_TRACE(20, ("__kmp_allocate_team: called\n")); 5084 KMP_DEBUG_ASSERT(new_nproc >= 1 && argc >= 0); 5085 KMP_DEBUG_ASSERT(max_nproc >= new_nproc); 5086 KMP_MB(); 5087 5088 #if KMP_NESTED_HOT_TEAMS 5089 kmp_hot_team_ptr_t *hot_teams; 5090 if (master) { 5091 team = master->th.th_team; 5092 level = team->t.t_active_level; 5093 if (master->th.th_teams_microtask) { // in teams construct? 
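// (The checks that follow sort out forks related to the teams construct: the
// nesting level is only bumped when there is more than one team, and place
// partitioning is deferred until a parallel region nested inside the teams
// construct is encountered.)
//
// Editorial aside -- a compiled-out, user-level example of the situation being
// distinguished here, assuming a host `teams` construct (OpenMP 5.0); it is an
// illustration only, not runtime code.
#if 0
#include <omp.h>
#include <stdio.h>

int main(void) {
  // Fork that creates the league of teams.
  #pragma omp teams num_teams(2)
  {
    // Parallel region nested inside the teams construct; per the comments
    // below, this is where the place partition is finally applied.
    #pragma omp parallel num_threads(4)
    printf("team %d thread %d\n", omp_get_team_num(), omp_get_thread_num());
  }
  return 0;
}
#endif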
5094 if (master->th.th_teams_size.nteams > 1 && 5095 ( // #teams > 1 5096 team->t.t_pkfn == 5097 (microtask_t)__kmp_teams_master || // inner fork of the teams 5098 master->th.th_teams_level < 5099 team->t.t_level)) { // or nested parallel inside the teams 5100 ++level; // not increment if #teams==1, or for outer fork of the teams; 5101 // increment otherwise 5102 } 5103 // Do not perform the place partition if inner fork of the teams 5104 // Wait until nested parallel region encountered inside teams construct 5105 if ((master->th.th_teams_size.nteams == 1 && 5106 master->th.th_teams_level >= team->t.t_level) || 5107 (team->t.t_pkfn == (microtask_t)__kmp_teams_master)) 5108 do_place_partition = 0; 5109 } 5110 hot_teams = master->th.th_hot_teams; 5111 if (level < __kmp_hot_teams_max_level && hot_teams && 5112 hot_teams[level].hot_team) { 5113 // hot team has already been allocated for given level 5114 use_hot_team = 1; 5115 } else { 5116 use_hot_team = 0; 5117 } 5118 } else { 5119 // check we won't access uninitialized hot_teams, just in case 5120 KMP_DEBUG_ASSERT(new_nproc == 1); 5121 } 5122 #endif 5123 // Optimization to use a "hot" team 5124 if (use_hot_team && new_nproc > 1) { 5125 KMP_DEBUG_ASSERT(new_nproc <= max_nproc); 5126 #if KMP_NESTED_HOT_TEAMS 5127 team = hot_teams[level].hot_team; 5128 #else 5129 team = root->r.r_hot_team; 5130 #endif 5131 #if KMP_DEBUG 5132 if (__kmp_tasking_mode != tskm_immediate_exec) { 5133 KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p " 5134 "task_team[1] = %p before reinit\n", 5135 team->t.t_task_team[0], team->t.t_task_team[1])); 5136 } 5137 #endif 5138 5139 if (team->t.t_nproc != new_nproc && 5140 __kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) { 5141 // Distributed barrier may need a resize 5142 int old_nthr = team->t.t_nproc; 5143 __kmp_resize_dist_barrier(team, old_nthr, new_nproc); 5144 } 5145 5146 // If not doing the place partition, then reset the team's proc bind 5147 // to indicate that partitioning of all threads still needs to take place 5148 if (do_place_partition == 0) 5149 team->t.t_proc_bind = proc_bind_default; 5150 // Has the number of threads changed? 5151 /* Let's assume the most common case is that the number of threads is 5152 unchanged, and put that case first. 
*/ 5153 if (team->t.t_nproc == new_nproc) { // Check changes in number of threads 5154 KA_TRACE(20, ("__kmp_allocate_team: reusing hot team\n")); 5155 // This case can mean that omp_set_num_threads() was called and the hot 5156 // team size was already reduced, so we check the special flag 5157 if (team->t.t_size_changed == -1) { 5158 team->t.t_size_changed = 1; 5159 } else { 5160 KMP_CHECK_UPDATE(team->t.t_size_changed, 0); 5161 } 5162 5163 // TODO???: team->t.t_max_active_levels = new_max_active_levels; 5164 kmp_r_sched_t new_sched = new_icvs->sched; 5165 // set primary thread's schedule as new run-time schedule 5166 KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched); 5167 5168 __kmp_reinitialize_team(team, new_icvs, 5169 root->r.r_uber_thread->th.th_ident); 5170 5171 KF_TRACE(10, ("__kmp_allocate_team2: T#%d, this_thread=%p team=%p\n", 0, 5172 team->t.t_threads[0], team)); 5173 __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0); 5174 5175 #if KMP_AFFINITY_SUPPORTED 5176 if ((team->t.t_size_changed == 0) && 5177 (team->t.t_proc_bind == new_proc_bind)) { 5178 if (new_proc_bind == proc_bind_spread) { 5179 if (do_place_partition) { 5180 // add flag to update only master for spread 5181 __kmp_partition_places(team, 1); 5182 } 5183 } 5184 KA_TRACE(200, ("__kmp_allocate_team: reusing hot team #%d bindings: " 5185 "proc_bind = %d, partition = [%d,%d]\n", 5186 team->t.t_id, new_proc_bind, team->t.t_first_place, 5187 team->t.t_last_place)); 5188 } else { 5189 if (do_place_partition) { 5190 KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind); 5191 __kmp_partition_places(team); 5192 } 5193 } 5194 #else 5195 KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind); 5196 #endif /* KMP_AFFINITY_SUPPORTED */ 5197 } else if (team->t.t_nproc > new_nproc) { 5198 KA_TRACE(20, 5199 ("__kmp_allocate_team: decreasing hot team thread count to %d\n", 5200 new_nproc)); 5201 5202 team->t.t_size_changed = 1; 5203 if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) { 5204 // Barrier size already reduced earlier in this function 5205 // Activate team threads via th_used_in_team 5206 __kmp_add_threads_to_team(team, new_nproc); 5207 } 5208 #if KMP_NESTED_HOT_TEAMS 5209 if (__kmp_hot_teams_mode == 0) { 5210 // AC: saved number of threads should correspond to team's value in this 5211 // mode, can be bigger in mode 1, when hot team has threads in reserve 5212 KMP_DEBUG_ASSERT(hot_teams[level].hot_team_nth == team->t.t_nproc); 5213 hot_teams[level].hot_team_nth = new_nproc; 5214 #endif // KMP_NESTED_HOT_TEAMS 5215 /* release the extra threads we don't need any more */ 5216 for (f = new_nproc; f < team->t.t_nproc; f++) { 5217 KMP_DEBUG_ASSERT(team->t.t_threads[f]); 5218 if (__kmp_tasking_mode != tskm_immediate_exec) { 5219 // When decreasing team size, threads no longer in the team should 5220 // unref task team. 
5221 team->t.t_threads[f]->th.th_task_team = NULL; 5222 } 5223 __kmp_free_thread(team->t.t_threads[f]); 5224 team->t.t_threads[f] = NULL; 5225 } 5226 #if KMP_NESTED_HOT_TEAMS 5227 } // (__kmp_hot_teams_mode == 0) 5228 else { 5229 // When keeping extra threads in team, switch threads to wait on own 5230 // b_go flag 5231 for (f = new_nproc; f < team->t.t_nproc; ++f) { 5232 KMP_DEBUG_ASSERT(team->t.t_threads[f]); 5233 kmp_balign_t *balign = team->t.t_threads[f]->th.th_bar; 5234 for (int b = 0; b < bs_last_barrier; ++b) { 5235 if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG) { 5236 balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG; 5237 } 5238 KMP_CHECK_UPDATE(balign[b].bb.leaf_kids, 0); 5239 } 5240 } 5241 } 5242 #endif // KMP_NESTED_HOT_TEAMS 5243 team->t.t_nproc = new_nproc; 5244 // TODO???: team->t.t_max_active_levels = new_max_active_levels; 5245 KMP_CHECK_UPDATE(team->t.t_sched.sched, new_icvs->sched.sched); 5246 __kmp_reinitialize_team(team, new_icvs, 5247 root->r.r_uber_thread->th.th_ident); 5248 5249 // Update remaining threads 5250 for (f = 0; f < new_nproc; ++f) { 5251 team->t.t_threads[f]->th.th_team_nproc = new_nproc; 5252 } 5253 5254 // restore the current task state of the primary thread: should be the 5255 // implicit task 5256 KF_TRACE(10, ("__kmp_allocate_team: T#%d, this_thread=%p team=%p\n", 0, 5257 team->t.t_threads[0], team)); 5258 5259 __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0); 5260 5261 #ifdef KMP_DEBUG 5262 for (f = 0; f < team->t.t_nproc; f++) { 5263 KMP_DEBUG_ASSERT(team->t.t_threads[f] && 5264 team->t.t_threads[f]->th.th_team_nproc == 5265 team->t.t_nproc); 5266 } 5267 #endif 5268 5269 if (do_place_partition) { 5270 KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind); 5271 #if KMP_AFFINITY_SUPPORTED 5272 __kmp_partition_places(team); 5273 #endif 5274 } 5275 } else { // team->t.t_nproc < new_nproc 5276 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED 5277 kmp_affin_mask_t *old_mask; 5278 if (KMP_AFFINITY_CAPABLE()) { 5279 KMP_CPU_ALLOC(old_mask); 5280 } 5281 #endif 5282 5283 KA_TRACE(20, 5284 ("__kmp_allocate_team: increasing hot team thread count to %d\n", 5285 new_nproc)); 5286 int old_nproc = team->t.t_nproc; // save old value and use to update only 5287 team->t.t_size_changed = 1; 5288 5289 #if KMP_NESTED_HOT_TEAMS 5290 int avail_threads = hot_teams[level].hot_team_nth; 5291 if (new_nproc < avail_threads) 5292 avail_threads = new_nproc; 5293 kmp_info_t **other_threads = team->t.t_threads; 5294 for (f = team->t.t_nproc; f < avail_threads; ++f) { 5295 // Adjust barrier data of reserved threads (if any) of the team 5296 // Other data will be set in __kmp_initialize_info() below. 5297 int b; 5298 kmp_balign_t *balign = other_threads[f]->th.th_bar; 5299 for (b = 0; b < bs_last_barrier; ++b) { 5300 balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived; 5301 KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG); 5302 #if USE_DEBUGGER 5303 balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived; 5304 #endif 5305 } 5306 } 5307 if (hot_teams[level].hot_team_nth >= new_nproc) { 5308 // we have all needed threads in reserve, no need to allocate any 5309 // this only possible in mode 1, cannot have reserved threads in mode 0 5310 KMP_DEBUG_ASSERT(__kmp_hot_teams_mode == 1); 5311 team->t.t_nproc = new_nproc; // just get reserved threads involved 5312 } else { 5313 // We may have some threads in reserve, but not enough; 5314 // get reserved threads involved if any. 
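// (The two assignments just below put the whole reserve to use and record the
// new hot-team maximum.)
//
// Editorial aside -- compiled-out sketch of the "reuse reserved threads first,
// then allocate the rest" accounting used when __kmp_hot_teams_mode == 1. It
// assumes reserved >= current and requested > current; example_* names are
// hypothetical.
#if 0
#include <algorithm>
#include <cstdio>

static void example_grow_hot_team(int current, int reserved, int requested) {
  int from_reserve = std::min(reserved, requested) - current;
  int newly_allocated = requested > reserved ? requested - reserved : 0;
  std::printf("reuse %d reserved thread(s), allocate %d new\n", from_reserve,
              newly_allocated);
}

// e.g. example_grow_hot_team(2, 4, 6): reuse the 2 reserved threads, then
// allocate 2 brand-new workers and raise the recorded maximum to 6.
#endif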
5315 team->t.t_nproc = hot_teams[level].hot_team_nth; 5316 hot_teams[level].hot_team_nth = new_nproc; // adjust hot team max size 5317 #endif // KMP_NESTED_HOT_TEAMS 5318 if (team->t.t_max_nproc < new_nproc) { 5319 /* reallocate larger arrays */ 5320 __kmp_reallocate_team_arrays(team, new_nproc); 5321 __kmp_reinitialize_team(team, new_icvs, NULL); 5322 } 5323 5324 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED 5325 /* Temporarily set full mask for primary thread before creation of 5326 workers. The reason is that workers inherit the affinity from the 5327 primary thread, so if a lot of workers are created on the single 5328 core quickly, they don't get a chance to set their own affinity for 5329 a long time. */ 5330 __kmp_set_thread_affinity_mask_full_tmp(old_mask); 5331 #endif 5332 5333 /* allocate new threads for the hot team */ 5334 for (f = team->t.t_nproc; f < new_nproc; f++) { 5335 kmp_info_t *new_worker = __kmp_allocate_thread(root, team, f); 5336 KMP_DEBUG_ASSERT(new_worker); 5337 team->t.t_threads[f] = new_worker; 5338 5339 KA_TRACE(20, 5340 ("__kmp_allocate_team: team %d init T#%d arrived: " 5341 "join=%llu, plain=%llu\n", 5342 team->t.t_id, __kmp_gtid_from_tid(f, team), team->t.t_id, f, 5343 team->t.t_bar[bs_forkjoin_barrier].b_arrived, 5344 team->t.t_bar[bs_plain_barrier].b_arrived)); 5345 5346 { // Initialize barrier data for new threads. 5347 int b; 5348 kmp_balign_t *balign = new_worker->th.th_bar; 5349 for (b = 0; b < bs_last_barrier; ++b) { 5350 balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived; 5351 KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != 5352 KMP_BARRIER_PARENT_FLAG); 5353 #if USE_DEBUGGER 5354 balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived; 5355 #endif 5356 } 5357 } 5358 } 5359 5360 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED 5361 if (KMP_AFFINITY_CAPABLE()) { 5362 /* Restore initial primary thread's affinity mask */ 5363 __kmp_set_system_affinity(old_mask, TRUE); 5364 KMP_CPU_FREE(old_mask); 5365 } 5366 #endif 5367 #if KMP_NESTED_HOT_TEAMS 5368 } // end of check of t_nproc vs. new_nproc vs. hot_team_nth 5369 #endif // KMP_NESTED_HOT_TEAMS 5370 if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) { 5371 // Barrier size already increased earlier in this function 5372 // Activate team threads via th_used_in_team 5373 __kmp_add_threads_to_team(team, new_nproc); 5374 } 5375 /* make sure everyone is syncronized */ 5376 // new threads below 5377 __kmp_initialize_team(team, new_nproc, new_icvs, 5378 root->r.r_uber_thread->th.th_ident); 5379 5380 /* reinitialize the threads */ 5381 KMP_DEBUG_ASSERT(team->t.t_nproc == new_nproc); 5382 for (f = 0; f < team->t.t_nproc; ++f) 5383 __kmp_initialize_info(team->t.t_threads[f], team, f, 5384 __kmp_gtid_from_tid(f, team)); 5385 5386 if (level) { // set th_task_state for new threads in nested hot team 5387 // __kmp_initialize_info() no longer zeroes th_task_state, so we should 5388 // only need to set the th_task_state for the new threads. th_task_state 5389 // for primary thread will not be accurate until after this in 5390 // __kmp_fork_call(), so we look to the primary thread's memo_stack to 5391 // get the correct value. 
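// (The loop just below applies th_task_state to the new threads as described
// in the preceding comment.)
//
// Editorial aside -- compiled-out sketch of the affinity save / widen / spawn /
// restore pattern used earlier in this grow path: workers inherit the
// creator's mask, so the primary temporarily widens its own mask before
// spawning. Written against the plain Linux API rather than the runtime's own
// wrappers; it assumes ncpus <= CPU_SETSIZE and example_* names are
// hypothetical.
#if 0
#define _GNU_SOURCE
#include <pthread.h>
#include <sched.h>

static void *example_worker(void *arg) {
  // A freshly created thread starts with its creator's affinity mask.
  (void)arg;
  return NULL;
}

static void example_spawn_with_wide_mask(int ncpus) {
  cpu_set_t saved, full;
  CPU_ZERO(&full);
  for (int c = 0; c < ncpus; ++c)
    CPU_SET(c, &full);

  sched_getaffinity(0, sizeof(saved), &saved); // save current binding
  sched_setaffinity(0, sizeof(full), &full);   // temporarily widen it

  pthread_t tid;
  pthread_create(&tid, NULL, example_worker, NULL); // worker inherits 'full'
  pthread_join(tid, NULL);

  sched_setaffinity(0, sizeof(saved), &saved); // restore original binding
}
#endif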
5392 for (f = old_nproc; f < team->t.t_nproc; ++f) 5393 team->t.t_threads[f]->th.th_task_state = 5394 team->t.t_threads[0]->th.th_task_state_memo_stack[level]; 5395 } else { // set th_task_state for new threads in non-nested hot team 5396 // copy primary thread's state 5397 kmp_uint8 old_state = team->t.t_threads[0]->th.th_task_state; 5398 for (f = old_nproc; f < team->t.t_nproc; ++f) 5399 team->t.t_threads[f]->th.th_task_state = old_state; 5400 } 5401 5402 #ifdef KMP_DEBUG 5403 for (f = 0; f < team->t.t_nproc; ++f) { 5404 KMP_DEBUG_ASSERT(team->t.t_threads[f] && 5405 team->t.t_threads[f]->th.th_team_nproc == 5406 team->t.t_nproc); 5407 } 5408 #endif 5409 5410 if (do_place_partition) { 5411 KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind); 5412 #if KMP_AFFINITY_SUPPORTED 5413 __kmp_partition_places(team); 5414 #endif 5415 } 5416 } // Check changes in number of threads 5417 5418 kmp_info_t *master = team->t.t_threads[0]; 5419 if (master->th.th_teams_microtask) { 5420 for (f = 1; f < new_nproc; ++f) { 5421 // propagate teams construct specific info to workers 5422 kmp_info_t *thr = team->t.t_threads[f]; 5423 thr->th.th_teams_microtask = master->th.th_teams_microtask; 5424 thr->th.th_teams_level = master->th.th_teams_level; 5425 thr->th.th_teams_size = master->th.th_teams_size; 5426 } 5427 } 5428 #if KMP_NESTED_HOT_TEAMS 5429 if (level) { 5430 // Sync barrier state for nested hot teams, not needed for outermost hot 5431 // team. 5432 for (f = 1; f < new_nproc; ++f) { 5433 kmp_info_t *thr = team->t.t_threads[f]; 5434 int b; 5435 kmp_balign_t *balign = thr->th.th_bar; 5436 for (b = 0; b < bs_last_barrier; ++b) { 5437 balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived; 5438 KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG); 5439 #if USE_DEBUGGER 5440 balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived; 5441 #endif 5442 } 5443 } 5444 } 5445 #endif // KMP_NESTED_HOT_TEAMS 5446 5447 /* reallocate space for arguments if necessary */ 5448 __kmp_alloc_argv_entries(argc, team, TRUE); 5449 KMP_CHECK_UPDATE(team->t.t_argc, argc); 5450 // The hot team re-uses the previous task team, 5451 // if untouched during the previous release->gather phase. 
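// Editorial aside -- KMP_CHECK_UPDATE is used above for t_argc, t_proc_bind
// and t_sched.sched. Assuming it is essentially an "only store when the value
// changes" macro (see kmp.h for the real definition), the point is to avoid
// dirtying a cache line that many threads read on every hot-team reuse. A
// compiled-out sketch with hypothetical EXAMPLE_* / example_* names:
#if 0
#define EXAMPLE_CHECK_UPDATE(dst, src)                                         \
  do {                                                                         \
    if ((dst) != (src))                                                        \
      (dst) = (src);                                                           \
  } while (0)

struct example_team_header {
  int argc;
  int proc_bind;
};

static void example_reuse(example_team_header *t, int argc, int proc_bind) {
  // On the common reuse path nothing changed, so these compile down to two
  // loads and two compares -- no stores, no cache-line ping-pong between the
  // primary thread and the workers reading the team header.
  EXAMPLE_CHECK_UPDATE(t->argc, argc);
  EXAMPLE_CHECK_UPDATE(t->proc_bind, proc_bind);
}
#endif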
5452 5453 KF_TRACE(10, (" hot_team = %p\n", team)); 5454 5455 #if KMP_DEBUG 5456 if (__kmp_tasking_mode != tskm_immediate_exec) { 5457 KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p " 5458 "task_team[1] = %p after reinit\n", 5459 team->t.t_task_team[0], team->t.t_task_team[1])); 5460 } 5461 #endif 5462 5463 #if OMPT_SUPPORT 5464 __ompt_team_assign_id(team, ompt_parallel_data); 5465 #endif 5466 5467 KMP_MB(); 5468 5469 return team; 5470 } 5471 5472 /* next, let's try to take one from the team pool */ 5473 KMP_MB(); 5474 for (team = CCAST(kmp_team_t *, __kmp_team_pool); (team);) { 5475 /* TODO: consider resizing undersized teams instead of reaping them, now 5476 that we have a resizing mechanism */ 5477 if (team->t.t_max_nproc >= max_nproc) { 5478 /* take this team from the team pool */ 5479 __kmp_team_pool = team->t.t_next_pool; 5480 5481 if (max_nproc > 1 && 5482 __kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) { 5483 if (!team->t.b) { // Allocate barrier structure 5484 team->t.b = distributedBarrier::allocate(__kmp_dflt_team_nth_ub); 5485 } 5486 } 5487 5488 /* setup the team for fresh use */ 5489 __kmp_initialize_team(team, new_nproc, new_icvs, NULL); 5490 5491 KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and " 5492 "task_team[1] %p to NULL\n", 5493 &team->t.t_task_team[0], &team->t.t_task_team[1])); 5494 team->t.t_task_team[0] = NULL; 5495 team->t.t_task_team[1] = NULL; 5496 5497 /* reallocate space for arguments if necessary */ 5498 __kmp_alloc_argv_entries(argc, team, TRUE); 5499 KMP_CHECK_UPDATE(team->t.t_argc, argc); 5500 5501 KA_TRACE( 5502 20, ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n", 5503 team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE)); 5504 { // Initialize barrier data. 5505 int b; 5506 for (b = 0; b < bs_last_barrier; ++b) { 5507 team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE; 5508 #if USE_DEBUGGER 5509 team->t.t_bar[b].b_master_arrived = 0; 5510 team->t.t_bar[b].b_team_arrived = 0; 5511 #endif 5512 } 5513 } 5514 5515 team->t.t_proc_bind = new_proc_bind; 5516 5517 KA_TRACE(20, ("__kmp_allocate_team: using team from pool %d.\n", 5518 team->t.t_id)); 5519 5520 #if OMPT_SUPPORT 5521 __ompt_team_assign_id(team, ompt_parallel_data); 5522 #endif 5523 5524 KMP_MB(); 5525 5526 return team; 5527 } 5528 5529 /* reap team if it is too small, then loop back and check the next one */ 5530 // not sure if this is wise, but, will be redone during the hot-teams 5531 // rewrite. 5532 /* TODO: Use technique to find the right size hot-team, don't reap them */ 5533 team = __kmp_reap_team(team); 5534 __kmp_team_pool = team; 5535 } 5536 5537 /* nothing available in the pool, no matter, make a new team! 
*/ 5538 KMP_MB(); 5539 team = (kmp_team_t *)__kmp_allocate(sizeof(kmp_team_t)); 5540 5541 /* and set it up */ 5542 team->t.t_max_nproc = max_nproc; 5543 if (max_nproc > 1 && 5544 __kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) { 5545 // Allocate barrier structure 5546 team->t.b = distributedBarrier::allocate(__kmp_dflt_team_nth_ub); 5547 } 5548 5549 /* NOTE well, for some reason allocating one big buffer and dividing it up 5550 seems to really hurt performance a lot on the P4, so, let's not use this */ 5551 __kmp_allocate_team_arrays(team, max_nproc); 5552 5553 KA_TRACE(20, ("__kmp_allocate_team: making a new team\n")); 5554 __kmp_initialize_team(team, new_nproc, new_icvs, NULL); 5555 5556 KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and task_team[1] " 5557 "%p to NULL\n", 5558 &team->t.t_task_team[0], &team->t.t_task_team[1])); 5559 team->t.t_task_team[0] = NULL; // to be removed, as __kmp_allocate zeroes 5560 // memory, no need to duplicate 5561 team->t.t_task_team[1] = NULL; // to be removed, as __kmp_allocate zeroes 5562 // memory, no need to duplicate 5563 5564 if (__kmp_storage_map) { 5565 __kmp_print_team_storage_map("team", team, team->t.t_id, new_nproc); 5566 } 5567 5568 /* allocate space for arguments */ 5569 __kmp_alloc_argv_entries(argc, team, FALSE); 5570 team->t.t_argc = argc; 5571 5572 KA_TRACE(20, 5573 ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n", 5574 team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE)); 5575 { // Initialize barrier data. 5576 int b; 5577 for (b = 0; b < bs_last_barrier; ++b) { 5578 team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE; 5579 #if USE_DEBUGGER 5580 team->t.t_bar[b].b_master_arrived = 0; 5581 team->t.t_bar[b].b_team_arrived = 0; 5582 #endif 5583 } 5584 } 5585 5586 team->t.t_proc_bind = new_proc_bind; 5587 5588 #if OMPT_SUPPORT 5589 __ompt_team_assign_id(team, ompt_parallel_data); 5590 team->t.ompt_serialized_team_info = NULL; 5591 #endif 5592 5593 KMP_MB(); 5594 5595 KA_TRACE(20, ("__kmp_allocate_team: done creating a new team %d.\n", 5596 team->t.t_id)); 5597 5598 return team; 5599 } 5600 5601 /* TODO implement hot-teams at all levels */ 5602 /* TODO implement lazy thread release on demand (disband request) */ 5603 5604 /* free the team. return it to the team pool. release all the threads 5605 * associated with it */ 5606 void __kmp_free_team(kmp_root_t *root, 5607 kmp_team_t *team USE_NESTED_HOT_ARG(kmp_info_t *master)) { 5608 int f; 5609 KA_TRACE(20, ("__kmp_free_team: T#%d freeing team %d\n", __kmp_get_gtid(), 5610 team->t.t_id)); 5611 5612 /* verify state */ 5613 KMP_DEBUG_ASSERT(root); 5614 KMP_DEBUG_ASSERT(team); 5615 KMP_DEBUG_ASSERT(team->t.t_nproc <= team->t.t_max_nproc); 5616 KMP_DEBUG_ASSERT(team->t.t_threads); 5617 5618 int use_hot_team = team == root->r.r_hot_team; 5619 #if KMP_NESTED_HOT_TEAMS 5620 int level; 5621 if (master) { 5622 level = team->t.t_active_level - 1; 5623 if (master->th.th_teams_microtask) { // in teams construct? 
5624 if (master->th.th_teams_size.nteams > 1) { 5625 ++level; // level was not increased in teams construct for 5626 // team_of_masters 5627 } 5628 if (team->t.t_pkfn != (microtask_t)__kmp_teams_master && 5629 master->th.th_teams_level == team->t.t_level) { 5630 ++level; // level was not increased in teams construct for 5631 // team_of_workers before the parallel 5632 } // team->t.t_level will be increased inside parallel 5633 } 5634 #if KMP_DEBUG 5635 kmp_hot_team_ptr_t *hot_teams = master->th.th_hot_teams; 5636 #endif 5637 if (level < __kmp_hot_teams_max_level) { 5638 KMP_DEBUG_ASSERT(team == hot_teams[level].hot_team); 5639 use_hot_team = 1; 5640 } 5641 } 5642 #endif // KMP_NESTED_HOT_TEAMS 5643 5644 /* team is done working */ 5645 TCW_SYNC_PTR(team->t.t_pkfn, 5646 NULL); // Important for Debugging Support Library. 5647 #if KMP_OS_WINDOWS 5648 team->t.t_copyin_counter = 0; // init counter for possible reuse 5649 #endif 5650 // Do not reset pointer to parent team to NULL for hot teams. 5651 5652 /* if we are non-hot team, release our threads */ 5653 if (!use_hot_team) { 5654 if (__kmp_tasking_mode != tskm_immediate_exec) { 5655 // Wait for threads to reach reapable state 5656 for (f = 1; f < team->t.t_nproc; ++f) { 5657 KMP_DEBUG_ASSERT(team->t.t_threads[f]); 5658 kmp_info_t *th = team->t.t_threads[f]; 5659 volatile kmp_uint32 *state = &th->th.th_reap_state; 5660 while (*state != KMP_SAFE_TO_REAP) { 5661 #if KMP_OS_WINDOWS 5662 // On Windows a thread can be killed at any time, check this 5663 DWORD ecode; 5664 if (!__kmp_is_thread_alive(th, &ecode)) { 5665 *state = KMP_SAFE_TO_REAP; // reset the flag for dead thread 5666 break; 5667 } 5668 #endif 5669 // first check if thread is sleeping 5670 kmp_flag_64<> fl(&th->th.th_bar[bs_forkjoin_barrier].bb.b_go, th); 5671 if (fl.is_sleeping()) 5672 fl.resume(__kmp_gtid_from_thread(th)); 5673 KMP_CPU_PAUSE(); 5674 } 5675 } 5676 5677 // Delete task teams 5678 int tt_idx; 5679 for (tt_idx = 0; tt_idx < 2; ++tt_idx) { 5680 kmp_task_team_t *task_team = team->t.t_task_team[tt_idx]; 5681 if (task_team != NULL) { 5682 for (f = 0; f < team->t.t_nproc; ++f) { // threads unref task teams 5683 KMP_DEBUG_ASSERT(team->t.t_threads[f]); 5684 team->t.t_threads[f]->th.th_task_team = NULL; 5685 } 5686 KA_TRACE( 5687 20, 5688 ("__kmp_free_team: T#%d deactivating task_team %p on team %d\n", 5689 __kmp_get_gtid(), task_team, team->t.t_id)); 5690 #if KMP_NESTED_HOT_TEAMS 5691 __kmp_free_task_team(master, task_team); 5692 #endif 5693 team->t.t_task_team[tt_idx] = NULL; 5694 } 5695 } 5696 } 5697 5698 // Reset pointer to parent team only for non-hot teams. 
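// (The assignments just below perform that reset for the non-hot team being
// freed.)
//
// Editorial aside -- compiled-out, generic model of the wait-for-reapable
// handshake a few lines up: a sleeping worker is first resumed so it can reach
// KMP_SAFE_TO_REAP, and only then is it detached from the team and recycled.
// example_* names are hypothetical; the runtime uses kmp_flag_64 resume and
// KMP_CPU_PAUSE instead of the standard-library primitives shown here.
#if 0
#include <atomic>
#include <condition_variable>
#include <mutex>
#include <thread>

struct example_worker_slot {
  std::atomic<bool> safe_to_reap{false};
  std::mutex m;
  std::condition_variable cv;
  bool sleeping = false; // worker side (not shown) sets this before waiting
                         // on cv, and finally stores safe_to_reap = true

  void wake_if_sleeping() {
    std::lock_guard<std::mutex> lk(m);
    if (sleeping)
      cv.notify_one();
  }
};

static void example_release(example_worker_slot &w) {
  while (!w.safe_to_reap.load(std::memory_order_acquire)) {
    w.wake_if_sleeping();      // analogous to fl.resume(...) above
    std::this_thread::yield(); // analogous to KMP_CPU_PAUSE()
  }
  // Now the worker can be detached from the team and returned to the pool.
}
#endif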
5699 team->t.t_parent = NULL; 5700 team->t.t_level = 0; 5701 team->t.t_active_level = 0; 5702 5703 /* free the worker threads */ 5704 for (f = 1; f < team->t.t_nproc; ++f) { 5705 KMP_DEBUG_ASSERT(team->t.t_threads[f]); 5706 if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) { 5707 KMP_COMPARE_AND_STORE_ACQ32(&(team->t.t_threads[f]->th.th_used_in_team), 5708 1, 2); 5709 } 5710 __kmp_free_thread(team->t.t_threads[f]); 5711 } 5712 5713 if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) { 5714 if (team->t.b) { 5715 // wake up thread at old location 5716 team->t.b->go_release(); 5717 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) { 5718 for (f = 1; f < team->t.t_nproc; ++f) { 5719 if (team->t.b->sleep[f].sleep) { 5720 __kmp_atomic_resume_64( 5721 team->t.t_threads[f]->th.th_info.ds.ds_gtid, 5722 (kmp_atomic_flag_64<> *)NULL); 5723 } 5724 } 5725 } 5726 // Wait for threads to be removed from team 5727 for (int f = 1; f < team->t.t_nproc; ++f) { 5728 while (team->t.t_threads[f]->th.th_used_in_team.load() != 0) 5729 KMP_CPU_PAUSE(); 5730 } 5731 } 5732 } 5733 5734 for (f = 1; f < team->t.t_nproc; ++f) { 5735 team->t.t_threads[f] = NULL; 5736 } 5737 5738 if (team->t.t_max_nproc > 1 && 5739 __kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) { 5740 distributedBarrier::deallocate(team->t.b); 5741 team->t.b = NULL; 5742 } 5743 /* put the team back in the team pool */ 5744 /* TODO limit size of team pool, call reap_team if pool too large */ 5745 team->t.t_next_pool = CCAST(kmp_team_t *, __kmp_team_pool); 5746 __kmp_team_pool = (volatile kmp_team_t *)team; 5747 } else { // Check if team was created for primary threads in teams construct 5748 // See if first worker is a CG root 5749 KMP_DEBUG_ASSERT(team->t.t_threads[1] && 5750 team->t.t_threads[1]->th.th_cg_roots); 5751 if (team->t.t_threads[1]->th.th_cg_roots->cg_root == team->t.t_threads[1]) { 5752 // Clean up the CG root nodes on workers so that this team can be re-used 5753 for (f = 1; f < team->t.t_nproc; ++f) { 5754 kmp_info_t *thr = team->t.t_threads[f]; 5755 KMP_DEBUG_ASSERT(thr && thr->th.th_cg_roots && 5756 thr->th.th_cg_roots->cg_root == thr); 5757 // Pop current CG root off list 5758 kmp_cg_root_t *tmp = thr->th.th_cg_roots; 5759 thr->th.th_cg_roots = tmp->up; 5760 KA_TRACE(100, ("__kmp_free_team: Thread %p popping node %p and moving" 5761 " up to node %p. cg_nthreads was %d\n", 5762 thr, tmp, thr->th.th_cg_roots, tmp->cg_nthreads)); 5763 int i = tmp->cg_nthreads--; 5764 if (i == 1) { 5765 __kmp_free(tmp); // free CG if we are the last thread in it 5766 } 5767 // Restore current task's thread_limit from CG root 5768 if (thr->th.th_cg_roots) 5769 thr->th.th_current_task->td_icvs.thread_limit = 5770 thr->th.th_cg_roots->cg_thread_limit; 5771 } 5772 } 5773 } 5774 5775 KMP_MB(); 5776 } 5777 5778 /* reap the team. destroy it, reclaim all its resources and free its memory */ 5779 kmp_team_t *__kmp_reap_team(kmp_team_t *team) { 5780 kmp_team_t *next_pool = team->t.t_next_pool; 5781 5782 KMP_DEBUG_ASSERT(team); 5783 KMP_DEBUG_ASSERT(team->t.t_dispatch); 5784 KMP_DEBUG_ASSERT(team->t.t_disp_buffer); 5785 KMP_DEBUG_ASSERT(team->t.t_threads); 5786 KMP_DEBUG_ASSERT(team->t.t_argv); 5787 5788 /* TODO clean the threads that are a part of this? */ 5789 5790 /* free stuff */ 5791 __kmp_free_team_arrays(team); 5792 if (team->t.t_argv != &team->t.t_inline_argv[0]) 5793 __kmp_free((void *)team->t.t_argv); 5794 __kmp_free(team); 5795 5796 KMP_MB(); 5797 return next_pool; 5798 } 5799 5800 // Free the thread. 
Don't reap it, just place it on the pool of available 5801 // threads. 5802 // 5803 // Changes for Quad issue 527845: We need a predictable OMP tid <-> gtid 5804 // binding for the affinity mechanism to be useful. 5805 // 5806 // Now, we always keep the free list (__kmp_thread_pool) sorted by gtid. 5807 // However, we want to avoid a potential performance problem by always 5808 // scanning through the list to find the correct point at which to insert 5809 // the thread (potential N**2 behavior). To do this we keep track of the 5810 // last place a thread struct was inserted (__kmp_thread_pool_insert_pt). 5811 // With single-level parallelism, threads will always be added to the tail 5812 // of the list, kept track of by __kmp_thread_pool_insert_pt. With nested 5813 // parallelism, all bets are off and we may need to scan through the entire 5814 // free list. 5815 // 5816 // This change also has a potentially large performance benefit, for some 5817 // applications. Previously, as threads were freed from the hot team, they 5818 // would be placed back on the free list in inverse order. If the hot team 5819 // grew back to it's original size, then the freed thread would be placed 5820 // back on the hot team in reverse order. This could cause bad cache 5821 // locality problems on programs where the size of the hot team regularly 5822 // grew and shrunk. 5823 // 5824 // Now, for single-level parallelism, the OMP tid is always == gtid. 5825 void __kmp_free_thread(kmp_info_t *this_th) { 5826 int gtid; 5827 kmp_info_t **scan; 5828 5829 KA_TRACE(20, ("__kmp_free_thread: T#%d putting T#%d back on free pool.\n", 5830 __kmp_get_gtid(), this_th->th.th_info.ds.ds_gtid)); 5831 5832 KMP_DEBUG_ASSERT(this_th); 5833 5834 // When moving thread to pool, switch thread to wait on own b_go flag, and 5835 // uninitialized (NULL team). 5836 int b; 5837 kmp_balign_t *balign = this_th->th.th_bar; 5838 for (b = 0; b < bs_last_barrier; ++b) { 5839 if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG) 5840 balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG; 5841 balign[b].bb.team = NULL; 5842 balign[b].bb.leaf_kids = 0; 5843 } 5844 this_th->th.th_task_state = 0; 5845 this_th->th.th_reap_state = KMP_SAFE_TO_REAP; 5846 5847 /* put thread back on the free pool */ 5848 TCW_PTR(this_th->th.th_team, NULL); 5849 TCW_PTR(this_th->th.th_root, NULL); 5850 TCW_PTR(this_th->th.th_dispatch, NULL); /* NOT NEEDED */ 5851 5852 while (this_th->th.th_cg_roots) { 5853 this_th->th.th_cg_roots->cg_nthreads--; 5854 KA_TRACE(100, ("__kmp_free_thread: Thread %p decrement cg_nthreads on node" 5855 " %p of thread %p to %d\n", 5856 this_th, this_th->th.th_cg_roots, 5857 this_th->th.th_cg_roots->cg_root, 5858 this_th->th.th_cg_roots->cg_nthreads)); 5859 kmp_cg_root_t *tmp = this_th->th.th_cg_roots; 5860 if (tmp->cg_root == this_th) { // Thread is a cg_root 5861 KMP_DEBUG_ASSERT(tmp->cg_nthreads == 0); 5862 KA_TRACE( 5863 5, ("__kmp_free_thread: Thread %p freeing node %p\n", this_th, tmp)); 5864 this_th->th.th_cg_roots = tmp->up; 5865 __kmp_free(tmp); 5866 } else { // Worker thread 5867 if (tmp->cg_nthreads == 0) { // last thread leaves contention group 5868 __kmp_free(tmp); 5869 } 5870 this_th->th.th_cg_roots = NULL; 5871 break; 5872 } 5873 } 5874 5875 /* If the implicit task assigned to this thread can be used by other threads 5876 * -> multiple threads can share the data and try to free the task at 5877 * __kmp_reap_thread at exit. 
This duplicate use of the task data can happen 5878 * with higher probability when hot team is disabled but can occurs even when 5879 * the hot team is enabled */ 5880 __kmp_free_implicit_task(this_th); 5881 this_th->th.th_current_task = NULL; 5882 5883 // If the __kmp_thread_pool_insert_pt is already past the new insert 5884 // point, then we need to re-scan the entire list. 5885 gtid = this_th->th.th_info.ds.ds_gtid; 5886 if (__kmp_thread_pool_insert_pt != NULL) { 5887 KMP_DEBUG_ASSERT(__kmp_thread_pool != NULL); 5888 if (__kmp_thread_pool_insert_pt->th.th_info.ds.ds_gtid > gtid) { 5889 __kmp_thread_pool_insert_pt = NULL; 5890 } 5891 } 5892 5893 // Scan down the list to find the place to insert the thread. 5894 // scan is the address of a link in the list, possibly the address of 5895 // __kmp_thread_pool itself. 5896 // 5897 // In the absence of nested parallelism, the for loop will have 0 iterations. 5898 if (__kmp_thread_pool_insert_pt != NULL) { 5899 scan = &(__kmp_thread_pool_insert_pt->th.th_next_pool); 5900 } else { 5901 scan = CCAST(kmp_info_t **, &__kmp_thread_pool); 5902 } 5903 for (; (*scan != NULL) && ((*scan)->th.th_info.ds.ds_gtid < gtid); 5904 scan = &((*scan)->th.th_next_pool)) 5905 ; 5906 5907 // Insert the new element on the list, and set __kmp_thread_pool_insert_pt 5908 // to its address. 5909 TCW_PTR(this_th->th.th_next_pool, *scan); 5910 __kmp_thread_pool_insert_pt = *scan = this_th; 5911 KMP_DEBUG_ASSERT((this_th->th.th_next_pool == NULL) || 5912 (this_th->th.th_info.ds.ds_gtid < 5913 this_th->th.th_next_pool->th.th_info.ds.ds_gtid)); 5914 TCW_4(this_th->th.th_in_pool, TRUE); 5915 __kmp_suspend_initialize_thread(this_th); 5916 __kmp_lock_suspend_mx(this_th); 5917 if (this_th->th.th_active == TRUE) { 5918 KMP_ATOMIC_INC(&__kmp_thread_pool_active_nth); 5919 this_th->th.th_active_in_pool = TRUE; 5920 } 5921 #if KMP_DEBUG 5922 else { 5923 KMP_DEBUG_ASSERT(this_th->th.th_active_in_pool == FALSE); 5924 } 5925 #endif 5926 __kmp_unlock_suspend_mx(this_th); 5927 5928 TCW_4(__kmp_nth, __kmp_nth - 1); 5929 5930 #ifdef KMP_ADJUST_BLOCKTIME 5931 /* Adjust blocktime back to user setting or default if necessary */ 5932 /* Middle initialization might never have occurred */ 5933 if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) { 5934 KMP_DEBUG_ASSERT(__kmp_avail_proc > 0); 5935 if (__kmp_nth <= __kmp_avail_proc) { 5936 __kmp_zero_bt = FALSE; 5937 } 5938 } 5939 #endif /* KMP_ADJUST_BLOCKTIME */ 5940 5941 KMP_MB(); 5942 } 5943 5944 /* ------------------------------------------------------------------------ */ 5945 5946 void *__kmp_launch_thread(kmp_info_t *this_thr) { 5947 #if OMP_PROFILING_SUPPORT 5948 ProfileTraceFile = getenv("LIBOMPTARGET_PROFILE"); 5949 // TODO: add a configuration option for time granularity 5950 if (ProfileTraceFile) 5951 llvm::timeTraceProfilerInitialize(500 /* us */, "libomptarget"); 5952 #endif 5953 5954 int gtid = this_thr->th.th_info.ds.ds_gtid; 5955 /* void *stack_data;*/ 5956 kmp_team_t **volatile pteam; 5957 5958 KMP_MB(); 5959 KA_TRACE(10, ("__kmp_launch_thread: T#%d start\n", gtid)); 5960 5961 if (__kmp_env_consistency_check) { 5962 this_thr->th.th_cons = __kmp_allocate_cons_stack(gtid); // ATT: Memory leak? 
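// (End of the consistency-check setup above.)
//
// Editorial aside -- compiled-out model of the sorted free-pool insertion
// implemented in __kmp_free_thread earlier: a singly linked list kept sorted
// by gtid, with a cached insertion point so the common append-at-the-tail case
// of single-level parallelism does not rescan the whole list. example_* names
// are hypothetical.
#if 0
#include <cassert>

struct example_thr {
  int gtid;
  example_thr *next = nullptr;
};

struct example_pool {
  example_thr *head = nullptr;
  example_thr *insert_pt = nullptr; // last insertion point (hint)

  void insert(example_thr *t) {
    // If the cached hint is already past the new gtid, fall back to a full
    // scan from the head; otherwise start scanning just after the hint.
    example_thr **scan = &head;
    if (insert_pt && insert_pt->gtid <= t->gtid)
      scan = &insert_pt->next;
    while (*scan && (*scan)->gtid < t->gtid)
      scan = &(*scan)->next;
    t->next = *scan;
    *scan = t;
    insert_pt = t;
    assert(!t->next || t->gtid < t->next->gtid); // list stays sorted by gtid
  }
};
#endif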
5963 } 5964 5965 #if OMPD_SUPPORT 5966 if (ompd_state & OMPD_ENABLE_BP) 5967 ompd_bp_thread_begin(); 5968 #endif 5969 5970 #if OMPT_SUPPORT 5971 ompt_data_t *thread_data = nullptr; 5972 if (ompt_enabled.enabled) { 5973 thread_data = &(this_thr->th.ompt_thread_info.thread_data); 5974 *thread_data = ompt_data_none; 5975 5976 this_thr->th.ompt_thread_info.state = ompt_state_overhead; 5977 this_thr->th.ompt_thread_info.wait_id = 0; 5978 this_thr->th.ompt_thread_info.idle_frame = OMPT_GET_FRAME_ADDRESS(0); 5979 this_thr->th.ompt_thread_info.parallel_flags = 0; 5980 if (ompt_enabled.ompt_callback_thread_begin) { 5981 ompt_callbacks.ompt_callback(ompt_callback_thread_begin)( 5982 ompt_thread_worker, thread_data); 5983 } 5984 this_thr->th.ompt_thread_info.state = ompt_state_idle; 5985 } 5986 #endif 5987 5988 /* This is the place where threads wait for work */ 5989 while (!TCR_4(__kmp_global.g.g_done)) { 5990 KMP_DEBUG_ASSERT(this_thr == __kmp_threads[gtid]); 5991 KMP_MB(); 5992 5993 /* wait for work to do */ 5994 KA_TRACE(20, ("__kmp_launch_thread: T#%d waiting for work\n", gtid)); 5995 5996 /* No tid yet since not part of a team */ 5997 __kmp_fork_barrier(gtid, KMP_GTID_DNE); 5998 5999 #if OMPT_SUPPORT 6000 if (ompt_enabled.enabled) { 6001 this_thr->th.ompt_thread_info.state = ompt_state_overhead; 6002 } 6003 #endif 6004 6005 pteam = &this_thr->th.th_team; 6006 6007 /* have we been allocated? */ 6008 if (TCR_SYNC_PTR(*pteam) && !TCR_4(__kmp_global.g.g_done)) { 6009 /* we were just woken up, so run our new task */ 6010 if (TCR_SYNC_PTR((*pteam)->t.t_pkfn) != NULL) { 6011 int rc; 6012 KA_TRACE(20, 6013 ("__kmp_launch_thread: T#%d(%d:%d) invoke microtask = %p\n", 6014 gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid), 6015 (*pteam)->t.t_pkfn)); 6016 6017 updateHWFPControl(*pteam); 6018 6019 #if OMPT_SUPPORT 6020 if (ompt_enabled.enabled) { 6021 this_thr->th.ompt_thread_info.state = ompt_state_work_parallel; 6022 } 6023 #endif 6024 6025 rc = (*pteam)->t.t_invoke(gtid); 6026 KMP_ASSERT(rc); 6027 6028 KMP_MB(); 6029 KA_TRACE(20, ("__kmp_launch_thread: T#%d(%d:%d) done microtask = %p\n", 6030 gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid), 6031 (*pteam)->t.t_pkfn)); 6032 } 6033 #if OMPT_SUPPORT 6034 if (ompt_enabled.enabled) { 6035 /* no frame set while outside task */ 6036 __ompt_get_task_info_object(0)->frame.exit_frame = ompt_data_none; 6037 6038 this_thr->th.ompt_thread_info.state = ompt_state_overhead; 6039 } 6040 #endif 6041 /* join barrier after parallel region */ 6042 __kmp_join_barrier(gtid); 6043 } 6044 } 6045 TCR_SYNC_PTR((intptr_t)__kmp_global.g.g_done); 6046 6047 #if OMPD_SUPPORT 6048 if (ompd_state & OMPD_ENABLE_BP) 6049 ompd_bp_thread_end(); 6050 #endif 6051 6052 #if OMPT_SUPPORT 6053 if (ompt_enabled.ompt_callback_thread_end) { 6054 ompt_callbacks.ompt_callback(ompt_callback_thread_end)(thread_data); 6055 } 6056 #endif 6057 6058 this_thr->th.th_task_team = NULL; 6059 /* run the destructors for the threadprivate data for this thread */ 6060 __kmp_common_destroy_gtid(gtid); 6061 6062 KA_TRACE(10, ("__kmp_launch_thread: T#%d done\n", gtid)); 6063 KMP_MB(); 6064 6065 #if OMP_PROFILING_SUPPORT 6066 llvm::timeTraceProfilerFinishThread(); 6067 #endif 6068 return this_thr; 6069 } 6070 6071 /* ------------------------------------------------------------------------ */ 6072 6073 void __kmp_internal_end_dest(void *specific_gtid) { 6074 // Make sure no significant bits are lost 6075 int gtid; 6076 __kmp_type_convert((kmp_intptr_t)specific_gtid - 1, >id); 6077 6078 KA_TRACE(30, 
("__kmp_internal_end_dest: T#%d\n", gtid)); 6079 /* NOTE: the gtid is stored as gitd+1 in the thread-local-storage 6080 * this is because 0 is reserved for the nothing-stored case */ 6081 6082 __kmp_internal_end_thread(gtid); 6083 } 6084 6085 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB 6086 6087 __attribute__((destructor)) void __kmp_internal_end_dtor(void) { 6088 __kmp_internal_end_atexit(); 6089 } 6090 6091 #endif 6092 6093 /* [Windows] josh: when the atexit handler is called, there may still be more 6094 than one thread alive */ 6095 void __kmp_internal_end_atexit(void) { 6096 KA_TRACE(30, ("__kmp_internal_end_atexit\n")); 6097 /* [Windows] 6098 josh: ideally, we want to completely shutdown the library in this atexit 6099 handler, but stat code that depends on thread specific data for gtid fails 6100 because that data becomes unavailable at some point during the shutdown, so 6101 we call __kmp_internal_end_thread instead. We should eventually remove the 6102 dependency on __kmp_get_specific_gtid in the stat code and use 6103 __kmp_internal_end_library to cleanly shutdown the library. 6104 6105 // TODO: Can some of this comment about GVS be removed? 6106 I suspect that the offending stat code is executed when the calling thread 6107 tries to clean up a dead root thread's data structures, resulting in GVS 6108 code trying to close the GVS structures for that thread, but since the stat 6109 code uses __kmp_get_specific_gtid to get the gtid with the assumption that 6110 the calling thread is cleaning up itself instead of another thread, it get 6111 confused. This happens because allowing a thread to unregister and cleanup 6112 another thread is a recent modification for addressing an issue. 6113 Based on the current design (20050722), a thread may end up 6114 trying to unregister another thread only if thread death does not trigger 6115 the calling of __kmp_internal_end_thread. For Linux* OS, there is the 6116 thread specific data destructor function to detect thread death. For 6117 Windows dynamic, there is DllMain(THREAD_DETACH). For Windows static, there 6118 is nothing. Thus, the workaround is applicable only for Windows static 6119 stat library. */ 6120 __kmp_internal_end_library(-1); 6121 #if KMP_OS_WINDOWS 6122 __kmp_close_console(); 6123 #endif 6124 } 6125 6126 static void __kmp_reap_thread(kmp_info_t *thread, int is_root) { 6127 // It is assumed __kmp_forkjoin_lock is acquired. 6128 6129 int gtid; 6130 6131 KMP_DEBUG_ASSERT(thread != NULL); 6132 6133 gtid = thread->th.th_info.ds.ds_gtid; 6134 6135 if (!is_root) { 6136 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) { 6137 /* Assume the threads are at the fork barrier here */ 6138 KA_TRACE( 6139 20, ("__kmp_reap_thread: releasing T#%d from fork barrier for reap\n", 6140 gtid)); 6141 if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) { 6142 while ( 6143 !KMP_COMPARE_AND_STORE_ACQ32(&(thread->th.th_used_in_team), 0, 3)) 6144 KMP_CPU_PAUSE(); 6145 __kmp_resume_32(gtid, (kmp_flag_32<false, false> *)NULL); 6146 } else { 6147 /* Need release fence here to prevent seg faults for tree forkjoin 6148 barrier (GEH) */ 6149 kmp_flag_64<> flag(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go, 6150 thread); 6151 __kmp_release_64(&flag); 6152 } 6153 } 6154 6155 // Terminate OS thread. 6156 __kmp_reap_worker(thread); 6157 6158 // The thread was killed asynchronously. If it was actively 6159 // spinning in the thread pool, decrement the global count. 
6160 // 6161 // There is a small timing hole here - if the worker thread was just waking 6162 // up after sleeping in the pool, had reset it's th_active_in_pool flag but 6163 // not decremented the global counter __kmp_thread_pool_active_nth yet, then 6164 // the global counter might not get updated. 6165 // 6166 // Currently, this can only happen as the library is unloaded, 6167 // so there are no harmful side effects. 6168 if (thread->th.th_active_in_pool) { 6169 thread->th.th_active_in_pool = FALSE; 6170 KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth); 6171 KMP_DEBUG_ASSERT(__kmp_thread_pool_active_nth >= 0); 6172 } 6173 } 6174 6175 __kmp_free_implicit_task(thread); 6176 6177 // Free the fast memory for tasking 6178 #if USE_FAST_MEMORY 6179 __kmp_free_fast_memory(thread); 6180 #endif /* USE_FAST_MEMORY */ 6181 6182 __kmp_suspend_uninitialize_thread(thread); 6183 6184 KMP_DEBUG_ASSERT(__kmp_threads[gtid] == thread); 6185 TCW_SYNC_PTR(__kmp_threads[gtid], NULL); 6186 6187 --__kmp_all_nth; 6188 // __kmp_nth was decremented when thread is added to the pool. 6189 6190 #ifdef KMP_ADJUST_BLOCKTIME 6191 /* Adjust blocktime back to user setting or default if necessary */ 6192 /* Middle initialization might never have occurred */ 6193 if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) { 6194 KMP_DEBUG_ASSERT(__kmp_avail_proc > 0); 6195 if (__kmp_nth <= __kmp_avail_proc) { 6196 __kmp_zero_bt = FALSE; 6197 } 6198 } 6199 #endif /* KMP_ADJUST_BLOCKTIME */ 6200 6201 /* free the memory being used */ 6202 if (__kmp_env_consistency_check) { 6203 if (thread->th.th_cons) { 6204 __kmp_free_cons_stack(thread->th.th_cons); 6205 thread->th.th_cons = NULL; 6206 } 6207 } 6208 6209 if (thread->th.th_pri_common != NULL) { 6210 __kmp_free(thread->th.th_pri_common); 6211 thread->th.th_pri_common = NULL; 6212 } 6213 6214 if (thread->th.th_task_state_memo_stack != NULL) { 6215 __kmp_free(thread->th.th_task_state_memo_stack); 6216 thread->th.th_task_state_memo_stack = NULL; 6217 } 6218 6219 #if KMP_USE_BGET 6220 if (thread->th.th_local.bget_data != NULL) { 6221 __kmp_finalize_bget(thread); 6222 } 6223 #endif 6224 6225 #if KMP_AFFINITY_SUPPORTED 6226 if (thread->th.th_affin_mask != NULL) { 6227 KMP_CPU_FREE(thread->th.th_affin_mask); 6228 thread->th.th_affin_mask = NULL; 6229 } 6230 #endif /* KMP_AFFINITY_SUPPORTED */ 6231 6232 #if KMP_USE_HIER_SCHED 6233 if (thread->th.th_hier_bar_data != NULL) { 6234 __kmp_free(thread->th.th_hier_bar_data); 6235 thread->th.th_hier_bar_data = NULL; 6236 } 6237 #endif 6238 6239 __kmp_reap_team(thread->th.th_serial_team); 6240 thread->th.th_serial_team = NULL; 6241 __kmp_free(thread); 6242 6243 KMP_MB(); 6244 6245 } // __kmp_reap_thread 6246 6247 static void __kmp_itthash_clean(kmp_info_t *th) { 6248 #if USE_ITT_NOTIFY 6249 if (__kmp_itt_region_domains.count > 0) { 6250 for (int i = 0; i < KMP_MAX_FRAME_DOMAINS; ++i) { 6251 kmp_itthash_entry_t *bucket = __kmp_itt_region_domains.buckets[i]; 6252 while (bucket) { 6253 kmp_itthash_entry_t *next = bucket->next_in_bucket; 6254 __kmp_thread_free(th, bucket); 6255 bucket = next; 6256 } 6257 } 6258 } 6259 if (__kmp_itt_barrier_domains.count > 0) { 6260 for (int i = 0; i < KMP_MAX_FRAME_DOMAINS; ++i) { 6261 kmp_itthash_entry_t *bucket = __kmp_itt_barrier_domains.buckets[i]; 6262 while (bucket) { 6263 kmp_itthash_entry_t *next = bucket->next_in_bucket; 6264 __kmp_thread_free(th, bucket); 6265 bucket = next; 6266 } 6267 } 6268 } 6269 #endif 6270 } 6271 6272 static void __kmp_internal_end(void) { 6273 int i; 6274 6275 /* First, unregister the library 
*/ 6276 __kmp_unregister_library(); 6277 6278 #if KMP_OS_WINDOWS 6279 /* In Win static library, we can't tell when a root actually dies, so we 6280 reclaim the data structures for any root threads that have died but not 6281 unregistered themselves, in order to shut down cleanly. 6282 In Win dynamic library we also can't tell when a thread dies. */ 6283 __kmp_reclaim_dead_roots(); // AC: moved here to always clean resources of 6284 // dead roots 6285 #endif 6286 6287 for (i = 0; i < __kmp_threads_capacity; i++) 6288 if (__kmp_root[i]) 6289 if (__kmp_root[i]->r.r_active) 6290 break; 6291 KMP_MB(); /* Flush all pending memory write invalidates. */ 6292 TCW_SYNC_4(__kmp_global.g.g_done, TRUE); 6293 6294 if (i < __kmp_threads_capacity) { 6295 #if KMP_USE_MONITOR 6296 // 2009-09-08 (lev): Other alive roots found. Why do we kill the monitor?? 6297 KMP_MB(); /* Flush all pending memory write invalidates. */ 6298 6299 // Need to check that monitor was initialized before reaping it. If we are 6300 // called form __kmp_atfork_child (which sets __kmp_init_parallel = 0), then 6301 // __kmp_monitor will appear to contain valid data, but it is only valid in 6302 // the parent process, not the child. 6303 // New behavior (201008): instead of keying off of the flag 6304 // __kmp_init_parallel, the monitor thread creation is keyed off 6305 // of the new flag __kmp_init_monitor. 6306 __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock); 6307 if (TCR_4(__kmp_init_monitor)) { 6308 __kmp_reap_monitor(&__kmp_monitor); 6309 TCW_4(__kmp_init_monitor, 0); 6310 } 6311 __kmp_release_bootstrap_lock(&__kmp_monitor_lock); 6312 KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n")); 6313 #endif // KMP_USE_MONITOR 6314 } else { 6315 /* TODO move this to cleanup code */ 6316 #ifdef KMP_DEBUG 6317 /* make sure that everything has properly ended */ 6318 for (i = 0; i < __kmp_threads_capacity; i++) { 6319 if (__kmp_root[i]) { 6320 // KMP_ASSERT( ! KMP_UBER_GTID( i ) ); // AC: 6321 // there can be uber threads alive here 6322 KMP_ASSERT(!__kmp_root[i]->r.r_active); // TODO: can they be active? 6323 } 6324 } 6325 #endif 6326 6327 KMP_MB(); 6328 6329 // Reap the worker threads. 6330 // This is valid for now, but be careful if threads are reaped sooner. 6331 while (__kmp_thread_pool != NULL) { // Loop thru all the thread in the pool. 6332 // Get the next thread from the pool. 6333 kmp_info_t *thread = CCAST(kmp_info_t *, __kmp_thread_pool); 6334 __kmp_thread_pool = thread->th.th_next_pool; 6335 // Reap it. 6336 KMP_DEBUG_ASSERT(thread->th.th_reap_state == KMP_SAFE_TO_REAP); 6337 thread->th.th_next_pool = NULL; 6338 thread->th.th_in_pool = FALSE; 6339 __kmp_reap_thread(thread, 0); 6340 } 6341 __kmp_thread_pool_insert_pt = NULL; 6342 6343 // Reap teams. 6344 while (__kmp_team_pool != NULL) { // Loop thru all the teams in the pool. 6345 // Get the next team from the pool. 6346 kmp_team_t *team = CCAST(kmp_team_t *, __kmp_team_pool); 6347 __kmp_team_pool = team->t.t_next_pool; 6348 // Reap it. 6349 team->t.t_next_pool = NULL; 6350 __kmp_reap_team(team); 6351 } 6352 6353 __kmp_reap_task_teams(); 6354 6355 #if KMP_OS_UNIX 6356 // Threads that are not reaped should not access any resources since they 6357 // are going to be deallocated soon, so the shutdown sequence should wait 6358 // until all threads either exit the final spin-waiting loop or begin 6359 // sleeping after the given blocktime. 
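// (The loop just below is the real implementation, polling th_blocking with an
// acquire load and pausing between probes.)
//
// Editorial aside -- compiled-out, generic model of that shutdown gate:
// workers publish "I am still spin-waiting" through an atomic flag, and the
// terminating thread must observe every flag cleared before freeing the
// structures the spinners are still reading. example_* names are hypothetical.
#if 0
#include <atomic>
#include <thread>
#include <vector>

struct example_thread_state {
  std::atomic<bool> blocking{false}; // set while inside the final spin-wait
};

static void example_wait_for_spinners(
    const std::vector<example_thread_state *> &threads) {
  for (example_thread_state *t : threads) {
    if (!t)
      continue;
    while (t->blocking.load(std::memory_order_acquire))
      std::this_thread::yield(); // analogous to KMP_CPU_PAUSE()
  }
  // Only now is it safe to tear down the data the spinning threads touch.
}
#endif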
6360 for (i = 0; i < __kmp_threads_capacity; i++) { 6361 kmp_info_t *thr = __kmp_threads[i]; 6362 while (thr && KMP_ATOMIC_LD_ACQ(&thr->th.th_blocking)) 6363 KMP_CPU_PAUSE(); 6364 } 6365 #endif 6366 6367 for (i = 0; i < __kmp_threads_capacity; ++i) { 6368 // TBD: Add some checking... 6369 // Something like KMP_DEBUG_ASSERT( __kmp_thread[ i ] == NULL ); 6370 } 6371 6372 /* Make sure all threadprivate destructors get run by joining with all 6373 worker threads before resetting this flag */ 6374 TCW_SYNC_4(__kmp_init_common, FALSE); 6375 6376 KA_TRACE(10, ("__kmp_internal_end: all workers reaped\n")); 6377 KMP_MB(); 6378 6379 #if KMP_USE_MONITOR 6380 // See note above: One of the possible fixes for CQ138434 / CQ140126 6381 // 6382 // FIXME: push both code fragments down and CSE them? 6383 // push them into __kmp_cleanup() ? 6384 __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock); 6385 if (TCR_4(__kmp_init_monitor)) { 6386 __kmp_reap_monitor(&__kmp_monitor); 6387 TCW_4(__kmp_init_monitor, 0); 6388 } 6389 __kmp_release_bootstrap_lock(&__kmp_monitor_lock); 6390 KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n")); 6391 #endif 6392 } /* else !__kmp_global.t_active */ 6393 TCW_4(__kmp_init_gtid, FALSE); 6394 KMP_MB(); /* Flush all pending memory write invalidates. */ 6395 6396 __kmp_cleanup(); 6397 #if OMPT_SUPPORT 6398 ompt_fini(); 6399 #endif 6400 } 6401 6402 void __kmp_internal_end_library(int gtid_req) { 6403 /* if we have already cleaned up, don't try again, it wouldn't be pretty */ 6404 /* this shouldn't be a race condition because __kmp_internal_end() is the 6405 only place to clear __kmp_serial_init */ 6406 /* we'll check this later too, after we get the lock */ 6407 // 2009-09-06: We do not set g_abort without setting g_done. This check looks 6408 // redundant, because the next check will work in any case. 6409 if (__kmp_global.g.g_abort) { 6410 KA_TRACE(11, ("__kmp_internal_end_library: abort, exiting\n")); 6411 /* TODO abort? */ 6412 return; 6413 } 6414 if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) { 6415 KA_TRACE(10, ("__kmp_internal_end_library: already finished\n")); 6416 return; 6417 } 6418 6419 // If hidden helper team has been initialized, we need to deinit it 6420 if (TCR_4(__kmp_init_hidden_helper) && 6421 !TCR_4(__kmp_hidden_helper_team_done)) { 6422 TCW_SYNC_4(__kmp_hidden_helper_team_done, TRUE); 6423 // First release the main thread to let it continue its work 6424 __kmp_hidden_helper_main_thread_release(); 6425 // Wait until the hidden helper team has been destroyed 6426 __kmp_hidden_helper_threads_deinitz_wait(); 6427 } 6428 6429 KMP_MB(); /* Flush all pending memory write invalidates. */ 6430 /* find out who we are and what we should do */ 6431 { 6432 int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific(); 6433 KA_TRACE( 6434 10, ("__kmp_internal_end_library: enter T#%d (%d)\n", gtid, gtid_req)); 6435 if (gtid == KMP_GTID_SHUTDOWN) { 6436 KA_TRACE(10, ("__kmp_internal_end_library: !__kmp_init_runtime, system " 6437 "already shutdown\n")); 6438 return; 6439 } else if (gtid == KMP_GTID_MONITOR) { 6440 KA_TRACE(10, ("__kmp_internal_end_library: monitor thread, gtid not " 6441 "registered, or system shutdown\n")); 6442 return; 6443 } else if (gtid == KMP_GTID_DNE) { 6444 KA_TRACE(10, ("__kmp_internal_end_library: gtid not registered or system " 6445 "shutdown\n")); 6446 /* we don't know who we are, but we may still shutdown the library */ 6447 } else if (KMP_UBER_GTID(gtid)) { 6448 /* unregister ourselves as an uber thread. 
gtid is no longer valid */ 6449 if (__kmp_root[gtid]->r.r_active) { 6450 __kmp_global.g.g_abort = -1; 6451 TCW_SYNC_4(__kmp_global.g.g_done, TRUE); 6452 __kmp_unregister_library(); 6453 KA_TRACE(10, 6454 ("__kmp_internal_end_library: root still active, abort T#%d\n", 6455 gtid)); 6456 return; 6457 } else { 6458 __kmp_itthash_clean(__kmp_threads[gtid]); 6459 KA_TRACE( 6460 10, 6461 ("__kmp_internal_end_library: unregistering sibling T#%d\n", gtid)); 6462 __kmp_unregister_root_current_thread(gtid); 6463 } 6464 } else { 6465 /* worker threads may call this function through the atexit handler, if they 6466 * call exit() */ 6467 /* For now, skip the usual subsequent processing and just dump the debug buffer. 6468 TODO: do a thorough shutdown instead */ 6469 #ifdef DUMP_DEBUG_ON_EXIT 6470 if (__kmp_debug_buf) 6471 __kmp_dump_debug_buffer(); 6472 #endif 6473 // added unregister library call here when we switch to shm linux 6474 // if we don't, it will leave lots of files in /dev/shm 6475 // cleanup shared memory file before exiting. 6476 __kmp_unregister_library(); 6477 return; 6478 } 6479 } 6480 /* synchronize the termination process */ 6481 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock); 6482 6483 /* have we already finished */ 6484 if (__kmp_global.g.g_abort) { 6485 KA_TRACE(10, ("__kmp_internal_end_library: abort, exiting\n")); 6486 /* TODO abort? */ 6487 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 6488 return; 6489 } 6490 if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) { 6491 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 6492 return; 6493 } 6494 6495 /* We need this lock to enforce mutex between this reading of 6496 __kmp_threads_capacity and the writing by __kmp_register_root. 6497 Alternatively, we can use a counter of roots that is atomically updated by 6498 __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and 6499 __kmp_internal_end_*. */ 6500 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock); 6501 6502 /* now we can safely conduct the actual termination */ 6503 __kmp_internal_end(); 6504 6505 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); 6506 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 6507 6508 KA_TRACE(10, ("__kmp_internal_end_library: exit\n")); 6509 6510 #ifdef DUMP_DEBUG_ON_EXIT 6511 if (__kmp_debug_buf) 6512 __kmp_dump_debug_buffer(); 6513 #endif 6514 6515 #if KMP_OS_WINDOWS 6516 __kmp_close_console(); 6517 #endif 6518 6519 __kmp_fini_allocator(); 6520 6521 } // __kmp_internal_end_library 6522 6523 void __kmp_internal_end_thread(int gtid_req) { 6524 int i; 6525 6526 /* if we have already cleaned up, don't try again, it wouldn't be pretty */ 6527 /* this shouldn't be a race condition because __kmp_internal_end() is the 6528 * only place to clear __kmp_serial_init */ 6529 /* we'll check this later too, after we get the lock */ 6530 // 2009-09-06: We do not set g_abort without setting g_done. This check looks 6531 // redundant, because the next check will work in any case. 6532 if (__kmp_global.g.g_abort) { 6533 KA_TRACE(11, ("__kmp_internal_end_thread: abort, exiting\n")); 6534 /* TODO abort? 
*/ 6535 return; 6536 } 6537 if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) { 6538 KA_TRACE(10, ("__kmp_internal_end_thread: already finished\n")); 6539 return; 6540 } 6541 6542 // If hidden helper team has been initialized, we need to deinit it 6543 if (TCR_4(__kmp_init_hidden_helper) && 6544 !TCR_4(__kmp_hidden_helper_team_done)) { 6545 TCW_SYNC_4(__kmp_hidden_helper_team_done, TRUE); 6546 // First release the main thread to let it continue its work 6547 __kmp_hidden_helper_main_thread_release(); 6548 // Wait until the hidden helper team has been destroyed 6549 __kmp_hidden_helper_threads_deinitz_wait(); 6550 } 6551 6552 KMP_MB(); /* Flush all pending memory write invalidates. */ 6553 6554 /* find out who we are and what we should do */ 6555 { 6556 int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific(); 6557 KA_TRACE(10, 6558 ("__kmp_internal_end_thread: enter T#%d (%d)\n", gtid, gtid_req)); 6559 if (gtid == KMP_GTID_SHUTDOWN) { 6560 KA_TRACE(10, ("__kmp_internal_end_thread: !__kmp_init_runtime, system " 6561 "already shutdown\n")); 6562 return; 6563 } else if (gtid == KMP_GTID_MONITOR) { 6564 KA_TRACE(10, ("__kmp_internal_end_thread: monitor thread, gtid not " 6565 "registered, or system shutdown\n")); 6566 return; 6567 } else if (gtid == KMP_GTID_DNE) { 6568 KA_TRACE(10, ("__kmp_internal_end_thread: gtid not registered or system " 6569 "shutdown\n")); 6570 return; 6571 /* we don't know who we are */ 6572 } else if (KMP_UBER_GTID(gtid)) { 6573 /* unregister ourselves as an uber thread. gtid is no longer valid */ 6574 if (__kmp_root[gtid]->r.r_active) { 6575 __kmp_global.g.g_abort = -1; 6576 TCW_SYNC_4(__kmp_global.g.g_done, TRUE); 6577 KA_TRACE(10, 6578 ("__kmp_internal_end_thread: root still active, abort T#%d\n", 6579 gtid)); 6580 return; 6581 } else { 6582 KA_TRACE(10, ("__kmp_internal_end_thread: unregistering sibling T#%d\n", 6583 gtid)); 6584 __kmp_unregister_root_current_thread(gtid); 6585 } 6586 } else { 6587 /* just a worker thread, let's leave */ 6588 KA_TRACE(10, ("__kmp_internal_end_thread: worker thread T#%d\n", gtid)); 6589 6590 if (gtid >= 0) { 6591 __kmp_threads[gtid]->th.th_task_team = NULL; 6592 } 6593 6594 KA_TRACE(10, 6595 ("__kmp_internal_end_thread: worker thread done, exiting T#%d\n", 6596 gtid)); 6597 return; 6598 } 6599 } 6600 #if KMP_DYNAMIC_LIB 6601 if (__kmp_pause_status != kmp_hard_paused) 6602 // AC: lets not shutdown the dynamic library at the exit of uber thread, 6603 // because we will better shutdown later in the library destructor. 6604 { 6605 KA_TRACE(10, ("__kmp_internal_end_thread: exiting T#%d\n", gtid_req)); 6606 return; 6607 } 6608 #endif 6609 /* synchronize the termination process */ 6610 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock); 6611 6612 /* have we already finished */ 6613 if (__kmp_global.g.g_abort) { 6614 KA_TRACE(10, ("__kmp_internal_end_thread: abort, exiting\n")); 6615 /* TODO abort? */ 6616 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 6617 return; 6618 } 6619 if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) { 6620 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 6621 return; 6622 } 6623 6624 /* We need this lock to enforce mutex between this reading of 6625 __kmp_threads_capacity and the writing by __kmp_register_root. 6626 Alternatively, we can use a counter of roots that is atomically updated by 6627 __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and 6628 __kmp_internal_end_*. */ 6629 6630 /* should we finish the run-time? are all siblings done? 
*/ 6631 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock); 6632 6633 for (i = 0; i < __kmp_threads_capacity; ++i) { 6634 if (KMP_UBER_GTID(i)) { 6635 KA_TRACE( 6636 10, 6637 ("__kmp_internal_end_thread: remaining sibling task: gtid==%d\n", i)); 6638 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); 6639 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 6640 return; 6641 } 6642 } 6643 6644 /* now we can safely conduct the actual termination */ 6645 6646 __kmp_internal_end(); 6647 6648 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); 6649 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 6650 6651 KA_TRACE(10, ("__kmp_internal_end_thread: exit T#%d\n", gtid_req)); 6652 6653 #ifdef DUMP_DEBUG_ON_EXIT 6654 if (__kmp_debug_buf) 6655 __kmp_dump_debug_buffer(); 6656 #endif 6657 } // __kmp_internal_end_thread 6658 6659 // ----------------------------------------------------------------------------- 6660 // Library registration stuff. 6661 6662 static long __kmp_registration_flag = 0; 6663 // Random value used to indicate library initialization. 6664 static char *__kmp_registration_str = NULL; 6665 // Value to be saved in env var __KMP_REGISTERED_LIB_<pid>. 6666 6667 static inline char *__kmp_reg_status_name() { 6668 /* On RHEL 3u5 if linked statically, getpid() returns different values in 6669 each thread. If registration and unregistration go in different threads 6670 (omp_misc_other_root_exit.cpp test case), the name of registered_lib_env 6671 env var can not be found, because the name will contain different pid. */ 6672 // macOS* complains about name being too long with additional getuid() 6673 #if KMP_OS_UNIX && !KMP_OS_DARWIN && KMP_DYNAMIC_LIB 6674 return __kmp_str_format("__KMP_REGISTERED_LIB_%d_%d", (int)getpid(), 6675 (int)getuid()); 6676 #else 6677 return __kmp_str_format("__KMP_REGISTERED_LIB_%d", (int)getpid()); 6678 #endif 6679 } // __kmp_reg_status_get 6680 6681 void __kmp_register_library_startup(void) { 6682 6683 char *name = __kmp_reg_status_name(); // Name of the environment variable. 6684 int done = 0; 6685 union { 6686 double dtime; 6687 long ltime; 6688 } time; 6689 #if KMP_ARCH_X86 || KMP_ARCH_X86_64 6690 __kmp_initialize_system_tick(); 6691 #endif 6692 __kmp_read_system_time(&time.dtime); 6693 __kmp_registration_flag = 0xCAFE0000L | (time.ltime & 0x0000FFFFL); 6694 __kmp_registration_str = 6695 __kmp_str_format("%p-%lx-%s", &__kmp_registration_flag, 6696 __kmp_registration_flag, KMP_LIBRARY_FILE); 6697 6698 KA_TRACE(50, ("__kmp_register_library_startup: %s=\"%s\"\n", name, 6699 __kmp_registration_str)); 6700 6701 while (!done) { 6702 6703 char *value = NULL; // Actual value of the environment variable. 6704 6705 #if defined(KMP_USE_SHM) 6706 char *shm_name = __kmp_str_format("/%s", name); 6707 int shm_preexist = 0; 6708 char *data1; 6709 int fd1 = shm_open(shm_name, O_CREAT | O_EXCL | O_RDWR, 0666); 6710 if ((fd1 == -1) && (errno == EEXIST)) { 6711 // file didn't open because it already exists. 6712 // try opening existing file 6713 fd1 = shm_open(shm_name, O_RDWR, 0666); 6714 if (fd1 == -1) { // file didn't open 6715 // error out here 6716 __kmp_fatal(KMP_MSG(FunctionError, "Can't open SHM"), KMP_ERR(0), 6717 __kmp_msg_null); 6718 } else { 6719 // able to open existing file 6720 shm_preexist = 1; 6721 } 6722 } else if (fd1 == -1) { // SHM didn't open; it was due to error other than 6723 // already exists. 6724 // error out here. 
6725 __kmp_fatal(KMP_MSG(FunctionError, "Can't open SHM2"), KMP_ERR(errno), 6726 __kmp_msg_null); 6727 } 6728 if (shm_preexist == 0) { 6729 // we created the SHM, now set its size 6730 if (ftruncate(fd1, SHM_SIZE) == -1) { 6731 // error occurred setting size 6732 __kmp_fatal(KMP_MSG(FunctionError, "Can't set size of SHM"), 6733 KMP_ERR(errno), __kmp_msg_null); 6734 } 6735 } 6736 data1 = 6737 (char *)mmap(0, SHM_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED, fd1, 0); 6738 if (data1 == MAP_FAILED) { 6739 // failed to map shared memory 6740 __kmp_fatal(KMP_MSG(FunctionError, "Can't map SHM"), KMP_ERR(errno), 6741 __kmp_msg_null); 6742 } 6743 if (shm_preexist == 0) { // set data to SHM, set value 6744 KMP_STRCPY_S(data1, SHM_SIZE, __kmp_registration_str); 6745 } 6746 // Read value from either what we just wrote or the existing file. 6747 value = __kmp_str_format("%s", data1); // read value from SHM 6748 munmap(data1, SHM_SIZE); 6749 close(fd1); 6750 #else // Windows and unix with static library 6751 // Set the environment variable, but do not overwrite it if it already exists. 6752 __kmp_env_set(name, __kmp_registration_str, 0); 6753 // read value to see if it got set 6754 value = __kmp_env_get(name); 6755 #endif 6756 6757 if (value != NULL && strcmp(value, __kmp_registration_str) == 0) { 6758 done = 1; // Ok, environment variable set successfully, exit the loop. 6759 } else { 6760 // Oops. Write failed. Another copy of OpenMP RTL is in memory. 6761 // Check whether it is alive or dead. 6762 int neighbor = 0; // 0 -- unknown status, 1 -- alive, 2 -- dead. 6763 char *tail = value; 6764 char *flag_addr_str = NULL; 6765 char *flag_val_str = NULL; 6766 char const *file_name = NULL; 6767 __kmp_str_split(tail, '-', &flag_addr_str, &tail); 6768 __kmp_str_split(tail, '-', &flag_val_str, &tail); 6769 file_name = tail; 6770 if (tail != NULL) { 6771 unsigned long *flag_addr = 0; 6772 unsigned long flag_val = 0; 6773 KMP_SSCANF(flag_addr_str, "%p", RCAST(void **, &flag_addr)); 6774 KMP_SSCANF(flag_val_str, "%lx", &flag_val); 6775 if (flag_addr != 0 && flag_val != 0 && strcmp(file_name, "") != 0) { 6776 // First, check whether the environment-encoded address is mapped into 6777 // the address space. 6778 // If so, dereference it to see if it still has the right value. 6779 if (__kmp_is_address_mapped(flag_addr) && *flag_addr == flag_val) { 6780 neighbor = 1; 6781 } else { 6782 // If not, then we know the other copy of the library is no longer 6783 // running. 6784 neighbor = 2; 6785 } 6786 } 6787 } 6788 switch (neighbor) { 6789 case 0: // Cannot parse environment variable -- neighbor status unknown. 6790 // Assume it is an incompatible format from a future version of the 6791 // library. Assume the other library is alive. 6792 // WARN( ... ); // TODO: Issue a warning. 6793 file_name = "unknown library"; 6794 KMP_FALLTHROUGH(); 6795 // Attention! Falling through to the next case is intentional. 6796 case 1: { // Neighbor is alive. 6797 // Check whether that is allowed. 6798 char *duplicate_ok = __kmp_env_get("KMP_DUPLICATE_LIB_OK"); 6799 if (!__kmp_str_match_true(duplicate_ok)) { 6800 // That's not allowed. Issue a fatal error. 6801 __kmp_fatal(KMP_MSG(DuplicateLibrary, KMP_LIBRARY_FILE, file_name), 6802 KMP_HNT(DuplicateLibrary), __kmp_msg_null); 6803 } 6804 KMP_INTERNAL_FREE(duplicate_ok); 6805 __kmp_duplicate_library_ok = 1; 6806 done = 1; // Exit the loop. 6807 } break; 6808 case 2: { // Neighbor is dead. 6809 6810 #if defined(KMP_USE_SHM) 6811 // close shared memory.
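// (POSIX note: shm_unlink() below only removes the name under /dev/shm; any
// mapping the dead neighbor may still hold stays valid until it is unmapped.
// On the next pass of the while (!done) loop this process re-runs
// shm_open(O_CREAT | O_EXCL) and registers itself in a fresh segment.)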
6812 shm_unlink(shm_name); // this removes file in /dev/shm 6813 #else 6814 // Clear the variable and try to register library again. 6815 __kmp_env_unset(name); 6816 #endif 6817 } break; 6818 default: { 6819 KMP_DEBUG_ASSERT(0); 6820 } break; 6821 } 6822 } 6823 KMP_INTERNAL_FREE((void *)value); 6824 #if defined(KMP_USE_SHM) 6825 KMP_INTERNAL_FREE((void *)shm_name); 6826 #endif 6827 } // while 6828 KMP_INTERNAL_FREE((void *)name); 6829 6830 } // func __kmp_register_library_startup 6831 6832 void __kmp_unregister_library(void) { 6833 6834 char *name = __kmp_reg_status_name(); 6835 char *value = NULL; 6836 6837 #if defined(KMP_USE_SHM) 6838 char *shm_name = __kmp_str_format("/%s", name); 6839 int fd1 = shm_open(shm_name, O_RDONLY, 0666); 6840 if (fd1 == -1) { 6841 // file did not open. return. 6842 return; 6843 } 6844 char *data1 = (char *)mmap(0, SHM_SIZE, PROT_READ, MAP_SHARED, fd1, 0); 6845 if (data1 != MAP_FAILED) { 6846 value = __kmp_str_format("%s", data1); // read value from SHM 6847 munmap(data1, SHM_SIZE); 6848 } 6849 close(fd1); 6850 #else 6851 value = __kmp_env_get(name); 6852 #endif 6853 6854 KMP_DEBUG_ASSERT(__kmp_registration_flag != 0); 6855 KMP_DEBUG_ASSERT(__kmp_registration_str != NULL); 6856 if (value != NULL && strcmp(value, __kmp_registration_str) == 0) { 6857 // Ok, this is our variable. Delete it. 6858 #if defined(KMP_USE_SHM) 6859 shm_unlink(shm_name); // this removes file in /dev/shm 6860 #else 6861 __kmp_env_unset(name); 6862 #endif 6863 } 6864 6865 #if defined(KMP_USE_SHM) 6866 KMP_INTERNAL_FREE(shm_name); 6867 #endif 6868 6869 KMP_INTERNAL_FREE(__kmp_registration_str); 6870 KMP_INTERNAL_FREE(value); 6871 KMP_INTERNAL_FREE(name); 6872 6873 __kmp_registration_flag = 0; 6874 __kmp_registration_str = NULL; 6875 6876 } // __kmp_unregister_library 6877 6878 // End of Library registration stuff. 6879 // ----------------------------------------------------------------------------- 6880 6881 #if KMP_MIC_SUPPORTED 6882 6883 static void __kmp_check_mic_type() { 6884 kmp_cpuid_t cpuid_state = {0}; 6885 kmp_cpuid_t *cs_p = &cpuid_state; 6886 __kmp_x86_cpuid(1, 0, cs_p); 6887 // We don't support mic1 at the moment 6888 if ((cs_p->eax & 0xff0) == 0xB10) { 6889 __kmp_mic_type = mic2; 6890 } else if ((cs_p->eax & 0xf0ff0) == 0x50670) { 6891 __kmp_mic_type = mic3; 6892 } else { 6893 __kmp_mic_type = non_mic; 6894 } 6895 } 6896 6897 #endif /* KMP_MIC_SUPPORTED */ 6898 6899 #if KMP_HAVE_UMWAIT 6900 static void __kmp_user_level_mwait_init() { 6901 struct kmp_cpuid buf; 6902 __kmp_x86_cpuid(7, 0, &buf); 6903 __kmp_waitpkg_enabled = ((buf.ecx >> 5) & 1); 6904 __kmp_umwait_enabled = __kmp_waitpkg_enabled && __kmp_user_level_mwait; 6905 __kmp_tpause_enabled = __kmp_waitpkg_enabled && (__kmp_tpause_state > 0); 6906 KF_TRACE(30, ("__kmp_user_level_mwait_init: __kmp_umwait_enabled = %d\n", 6907 __kmp_umwait_enabled)); 6908 } 6909 #elif KMP_HAVE_MWAIT 6910 #ifndef AT_INTELPHIUSERMWAIT 6911 // Spurious, non-existent value that should always fail to return anything. 6912 // Will be replaced with the correct value when we know that. 6913 #define AT_INTELPHIUSERMWAIT 10000 6914 #endif 6915 // getauxval() function is available in RHEL7 and SLES12. If a system with an 6916 // earlier OS is used to build the RTL, we'll use the following internal 6917 // function when the entry is not found. 
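// Sketch of the weak-symbol fallback declared below (assuming
// KMP_WEAK_ATTRIBUTE_EXTERNAL expands to a weak-linkage attribute, e.g.
// __attribute__((weak)) on ELF targets):
//
//   __attribute__((weak)) unsigned long getauxval(unsigned long) { return 0; }
//
// If libc provides a strong getauxval(), the linker prefers it and the real
// auxiliary vector is consulted; otherwise the stub is chosen and the
// AT_INTELPHIUSERMWAIT query simply returns 0.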
6918 unsigned long getauxval(unsigned long) KMP_WEAK_ATTRIBUTE_EXTERNAL; 6919 unsigned long getauxval(unsigned long) { return 0; } 6920 6921 static void __kmp_user_level_mwait_init() { 6922 // When getauxval() and correct value of AT_INTELPHIUSERMWAIT are available 6923 // use them to find if the user-level mwait is enabled. Otherwise, forcibly 6924 // set __kmp_mwait_enabled=TRUE on Intel MIC if the environment variable 6925 // KMP_USER_LEVEL_MWAIT was set to TRUE. 6926 if (__kmp_mic_type == mic3) { 6927 unsigned long res = getauxval(AT_INTELPHIUSERMWAIT); 6928 if ((res & 0x1) || __kmp_user_level_mwait) { 6929 __kmp_mwait_enabled = TRUE; 6930 if (__kmp_user_level_mwait) { 6931 KMP_INFORM(EnvMwaitWarn); 6932 } 6933 } else { 6934 __kmp_mwait_enabled = FALSE; 6935 } 6936 } 6937 KF_TRACE(30, ("__kmp_user_level_mwait_init: __kmp_mic_type = %d, " 6938 "__kmp_mwait_enabled = %d\n", 6939 __kmp_mic_type, __kmp_mwait_enabled)); 6940 } 6941 #endif /* KMP_HAVE_UMWAIT */ 6942 6943 static void __kmp_do_serial_initialize(void) { 6944 int i, gtid; 6945 size_t size; 6946 6947 KA_TRACE(10, ("__kmp_do_serial_initialize: enter\n")); 6948 6949 KMP_DEBUG_ASSERT(sizeof(kmp_int32) == 4); 6950 KMP_DEBUG_ASSERT(sizeof(kmp_uint32) == 4); 6951 KMP_DEBUG_ASSERT(sizeof(kmp_int64) == 8); 6952 KMP_DEBUG_ASSERT(sizeof(kmp_uint64) == 8); 6953 KMP_DEBUG_ASSERT(sizeof(kmp_intptr_t) == sizeof(void *)); 6954 6955 #if OMPT_SUPPORT 6956 ompt_pre_init(); 6957 #endif 6958 #if OMPD_SUPPORT 6959 __kmp_env_dump(); 6960 ompd_init(); 6961 #endif 6962 6963 __kmp_validate_locks(); 6964 6965 /* Initialize internal memory allocator */ 6966 __kmp_init_allocator(); 6967 6968 /* Register the library startup via an environment variable and check to see 6969 whether another copy of the library is already registered. 
*/ 6970 6971 __kmp_register_library_startup(); 6972 6973 /* TODO reinitialization of library */ 6974 if (TCR_4(__kmp_global.g.g_done)) { 6975 KA_TRACE(10, ("__kmp_do_serial_initialize: reinitialization of library\n")); 6976 } 6977 6978 __kmp_global.g.g_abort = 0; 6979 TCW_SYNC_4(__kmp_global.g.g_done, FALSE); 6980 6981 /* initialize the locks */ 6982 #if KMP_USE_ADAPTIVE_LOCKS 6983 #if KMP_DEBUG_ADAPTIVE_LOCKS 6984 __kmp_init_speculative_stats(); 6985 #endif 6986 #endif 6987 #if KMP_STATS_ENABLED 6988 __kmp_stats_init(); 6989 #endif 6990 __kmp_init_lock(&__kmp_global_lock); 6991 __kmp_init_queuing_lock(&__kmp_dispatch_lock); 6992 __kmp_init_lock(&__kmp_debug_lock); 6993 __kmp_init_atomic_lock(&__kmp_atomic_lock); 6994 __kmp_init_atomic_lock(&__kmp_atomic_lock_1i); 6995 __kmp_init_atomic_lock(&__kmp_atomic_lock_2i); 6996 __kmp_init_atomic_lock(&__kmp_atomic_lock_4i); 6997 __kmp_init_atomic_lock(&__kmp_atomic_lock_4r); 6998 __kmp_init_atomic_lock(&__kmp_atomic_lock_8i); 6999 __kmp_init_atomic_lock(&__kmp_atomic_lock_8r); 7000 __kmp_init_atomic_lock(&__kmp_atomic_lock_8c); 7001 __kmp_init_atomic_lock(&__kmp_atomic_lock_10r); 7002 __kmp_init_atomic_lock(&__kmp_atomic_lock_16r); 7003 __kmp_init_atomic_lock(&__kmp_atomic_lock_16c); 7004 __kmp_init_atomic_lock(&__kmp_atomic_lock_20c); 7005 __kmp_init_atomic_lock(&__kmp_atomic_lock_32c); 7006 __kmp_init_bootstrap_lock(&__kmp_forkjoin_lock); 7007 __kmp_init_bootstrap_lock(&__kmp_exit_lock); 7008 #if KMP_USE_MONITOR 7009 __kmp_init_bootstrap_lock(&__kmp_monitor_lock); 7010 #endif 7011 __kmp_init_bootstrap_lock(&__kmp_tp_cached_lock); 7012 7013 /* conduct initialization and initial setup of configuration */ 7014 7015 __kmp_runtime_initialize(); 7016 7017 #if KMP_MIC_SUPPORTED 7018 __kmp_check_mic_type(); 7019 #endif 7020 7021 // Some global variable initialization moved here from kmp_env_initialize() 7022 #ifdef KMP_DEBUG 7023 kmp_diag = 0; 7024 #endif 7025 __kmp_abort_delay = 0; 7026 7027 // From __kmp_init_dflt_team_nth() 7028 /* assume the entire machine will be used */ 7029 __kmp_dflt_team_nth_ub = __kmp_xproc; 7030 if (__kmp_dflt_team_nth_ub < KMP_MIN_NTH) { 7031 __kmp_dflt_team_nth_ub = KMP_MIN_NTH; 7032 } 7033 if (__kmp_dflt_team_nth_ub > __kmp_sys_max_nth) { 7034 __kmp_dflt_team_nth_ub = __kmp_sys_max_nth; 7035 } 7036 __kmp_max_nth = __kmp_sys_max_nth; 7037 __kmp_cg_max_nth = __kmp_sys_max_nth; 7038 __kmp_teams_max_nth = __kmp_xproc; // set a "reasonable" default 7039 if (__kmp_teams_max_nth > __kmp_sys_max_nth) { 7040 __kmp_teams_max_nth = __kmp_sys_max_nth; 7041 } 7042 7043 // Three vars below moved here from __kmp_env_initialize() "KMP_BLOCKTIME" 7044 // part 7045 __kmp_dflt_blocktime = KMP_DEFAULT_BLOCKTIME; 7046 #if KMP_USE_MONITOR 7047 __kmp_monitor_wakeups = 7048 KMP_WAKEUPS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups); 7049 __kmp_bt_intervals = 7050 KMP_INTERVALS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups); 7051 #endif 7052 // From "KMP_LIBRARY" part of __kmp_env_initialize() 7053 __kmp_library = library_throughput; 7054 // From KMP_SCHEDULE initialization 7055 __kmp_static = kmp_sch_static_balanced; 7056 // AC: do not use analytical here, because it is non-monotonous 7057 //__kmp_guided = kmp_sch_guided_iterative_chunked; 7058 //__kmp_auto = kmp_sch_guided_analytical_chunked; // AC: it is the default, no 7059 // need to repeat assignment 7060 // Barrier initialization. 
Moved here from __kmp_env_initialize() Barrier branch 7061 // bit control and barrier method control parts 7062 #if KMP_FAST_REDUCTION_BARRIER 7063 #define kmp_reduction_barrier_gather_bb ((int)1) 7064 #define kmp_reduction_barrier_release_bb ((int)1) 7065 #define kmp_reduction_barrier_gather_pat __kmp_barrier_gather_pat_dflt 7066 #define kmp_reduction_barrier_release_pat __kmp_barrier_release_pat_dflt 7067 #endif // KMP_FAST_REDUCTION_BARRIER 7068 for (i = bs_plain_barrier; i < bs_last_barrier; i++) { 7069 __kmp_barrier_gather_branch_bits[i] = __kmp_barrier_gather_bb_dflt; 7070 __kmp_barrier_release_branch_bits[i] = __kmp_barrier_release_bb_dflt; 7071 __kmp_barrier_gather_pattern[i] = __kmp_barrier_gather_pat_dflt; 7072 __kmp_barrier_release_pattern[i] = __kmp_barrier_release_pat_dflt; 7073 #if KMP_FAST_REDUCTION_BARRIER 7074 if (i == bs_reduction_barrier) { // tested and confirmed on ALTIX only ( 7075 // lin_64 ): hyper,1 7076 __kmp_barrier_gather_branch_bits[i] = kmp_reduction_barrier_gather_bb; 7077 __kmp_barrier_release_branch_bits[i] = kmp_reduction_barrier_release_bb; 7078 __kmp_barrier_gather_pattern[i] = kmp_reduction_barrier_gather_pat; 7079 __kmp_barrier_release_pattern[i] = kmp_reduction_barrier_release_pat; 7080 } 7081 #endif // KMP_FAST_REDUCTION_BARRIER 7082 } 7083 #if KMP_FAST_REDUCTION_BARRIER 7084 #undef kmp_reduction_barrier_release_pat 7085 #undef kmp_reduction_barrier_gather_pat 7086 #undef kmp_reduction_barrier_release_bb 7087 #undef kmp_reduction_barrier_gather_bb 7088 #endif // KMP_FAST_REDUCTION_BARRIER 7089 #if KMP_MIC_SUPPORTED 7090 if (__kmp_mic_type == mic2) { // KNC 7091 // AC: plane=3,2, forkjoin=2,1 are optimal for 240 threads on KNC 7092 __kmp_barrier_gather_branch_bits[bs_plain_barrier] = 3; // plain gather 7093 __kmp_barrier_release_branch_bits[bs_forkjoin_barrier] = 7094 1; // forkjoin release 7095 __kmp_barrier_gather_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar; 7096 __kmp_barrier_release_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar; 7097 } 7098 #if KMP_FAST_REDUCTION_BARRIER 7099 if (__kmp_mic_type == mic2) { // KNC 7100 __kmp_barrier_gather_pattern[bs_reduction_barrier] = bp_hierarchical_bar; 7101 __kmp_barrier_release_pattern[bs_reduction_barrier] = bp_hierarchical_bar; 7102 } 7103 #endif // KMP_FAST_REDUCTION_BARRIER 7104 #endif // KMP_MIC_SUPPORTED 7105 7106 // From KMP_CHECKS initialization 7107 #ifdef KMP_DEBUG 7108 __kmp_env_checks = TRUE; /* development versions have the extra checks */ 7109 #else 7110 __kmp_env_checks = FALSE; /* port versions do not have the extra checks */ 7111 #endif 7112 7113 // From "KMP_FOREIGN_THREADS_THREADPRIVATE" initialization 7114 __kmp_foreign_tp = TRUE; 7115 7116 __kmp_global.g.g_dynamic = FALSE; 7117 __kmp_global.g.g_dynamic_mode = dynamic_default; 7118 7119 __kmp_init_nesting_mode(); 7120 7121 __kmp_env_initialize(NULL); 7122 7123 #if KMP_HAVE_MWAIT || KMP_HAVE_UMWAIT 7124 __kmp_user_level_mwait_init(); 7125 #endif 7126 // Print all messages in message catalog for testing purposes. 
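// Usage sketch (debug builds only; program name hypothetical; the accepted
// spellings are whatever __kmp_str_match_true() recognizes, "1"/"on"/"true"/
// "yes" being the usual ones):
//
//   $ KMP_DUMP_CATALOG=true ./omp_app   # dumps the whole i18n message catalog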
7127 #ifdef KMP_DEBUG 7128 char const *val = __kmp_env_get("KMP_DUMP_CATALOG"); 7129 if (__kmp_str_match_true(val)) { 7130 kmp_str_buf_t buffer; 7131 __kmp_str_buf_init(&buffer); 7132 __kmp_i18n_dump_catalog(&buffer); 7133 __kmp_printf("%s", buffer.str); 7134 __kmp_str_buf_free(&buffer); 7135 } 7136 __kmp_env_free(&val); 7137 #endif 7138 7139 __kmp_threads_capacity = 7140 __kmp_initial_threads_capacity(__kmp_dflt_team_nth_ub); 7141 // Moved here from __kmp_env_initialize() "KMP_ALL_THREADPRIVATE" part 7142 __kmp_tp_capacity = __kmp_default_tp_capacity( 7143 __kmp_dflt_team_nth_ub, __kmp_max_nth, __kmp_allThreadsSpecified); 7144 7145 // If the library is shut down properly, both pools must be NULL. Just in 7146 // case, set them to NULL -- some memory may leak, but subsequent code will 7147 // work even if pools are not freed. 7148 KMP_DEBUG_ASSERT(__kmp_thread_pool == NULL); 7149 KMP_DEBUG_ASSERT(__kmp_thread_pool_insert_pt == NULL); 7150 KMP_DEBUG_ASSERT(__kmp_team_pool == NULL); 7151 __kmp_thread_pool = NULL; 7152 __kmp_thread_pool_insert_pt = NULL; 7153 __kmp_team_pool = NULL; 7154 7155 /* Allocate all of the variable sized records */ 7156 /* NOTE: __kmp_threads_capacity entries are allocated, but the arrays are 7157 * expandable */ 7158 /* Since allocation is cache-aligned, just add extra padding at the end */ 7159 size = 7160 (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * __kmp_threads_capacity + 7161 CACHE_LINE; 7162 __kmp_threads = (kmp_info_t **)__kmp_allocate(size); 7163 __kmp_root = (kmp_root_t **)((char *)__kmp_threads + 7164 sizeof(kmp_info_t *) * __kmp_threads_capacity); 7165 7166 /* init thread counts */ 7167 KMP_DEBUG_ASSERT(__kmp_all_nth == 7168 0); // Asserts fail if the library is reinitializing and 7169 KMP_DEBUG_ASSERT(__kmp_nth == 0); // something was wrong in termination. 7170 __kmp_all_nth = 0; 7171 __kmp_nth = 0; 7172 7173 /* setup the uber master thread and hierarchy */ 7174 gtid = __kmp_register_root(TRUE); 7175 KA_TRACE(10, ("__kmp_do_serial_initialize T#%d\n", gtid)); 7176 KMP_ASSERT(KMP_UBER_GTID(gtid)); 7177 KMP_ASSERT(KMP_INITIAL_GTID(gtid)); 7178 7179 KMP_MB(); /* Flush all pending memory write invalidates. */ 7180 7181 __kmp_common_initialize(); 7182 7183 #if KMP_OS_UNIX 7184 /* invoke the child fork handler */ 7185 __kmp_register_atfork(); 7186 #endif 7187 7188 #if !KMP_DYNAMIC_LIB 7189 { 7190 /* Invoke the exit handler when the program finishes, only for static 7191 library. For dynamic library, we already have _fini and DllMain. */ 7192 int rc = atexit(__kmp_internal_end_atexit); 7193 if (rc != 0) { 7194 __kmp_fatal(KMP_MSG(FunctionError, "atexit()"), KMP_ERR(rc), 7195 __kmp_msg_null); 7196 } 7197 } 7198 #endif 7199 7200 #if KMP_HANDLE_SIGNALS 7201 #if KMP_OS_UNIX 7202 /* NOTE: make sure that this is called before the user installs their own 7203 signal handlers so that the user handlers are called first. this way they 7204 can return false, not call our handler, avoid terminating the library, and 7205 continue execution where they left off. 
*/ 7206 __kmp_install_signals(FALSE); 7207 #endif /* KMP_OS_UNIX */ 7208 #if KMP_OS_WINDOWS 7209 __kmp_install_signals(TRUE); 7210 #endif /* KMP_OS_WINDOWS */ 7211 #endif 7212 7213 /* we have finished the serial initialization */ 7214 __kmp_init_counter++; 7215 7216 __kmp_init_serial = TRUE; 7217 7218 if (__kmp_settings) { 7219 __kmp_env_print(); 7220 } 7221 7222 if (__kmp_display_env || __kmp_display_env_verbose) { 7223 __kmp_env_print_2(); 7224 } 7225 7226 #if OMPT_SUPPORT 7227 ompt_post_init(); 7228 #endif 7229 7230 KMP_MB(); 7231 7232 KA_TRACE(10, ("__kmp_do_serial_initialize: exit\n")); 7233 } 7234 7235 void __kmp_serial_initialize(void) { 7236 if (__kmp_init_serial) { 7237 return; 7238 } 7239 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock); 7240 if (__kmp_init_serial) { 7241 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 7242 return; 7243 } 7244 __kmp_do_serial_initialize(); 7245 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 7246 } 7247 7248 static void __kmp_do_middle_initialize(void) { 7249 int i, j; 7250 int prev_dflt_team_nth; 7251 7252 if (!__kmp_init_serial) { 7253 __kmp_do_serial_initialize(); 7254 } 7255 7256 KA_TRACE(10, ("__kmp_middle_initialize: enter\n")); 7257 7258 // Save the previous value for the __kmp_dflt_team_nth so that 7259 // we can avoid some reinitialization if it hasn't changed. 7260 prev_dflt_team_nth = __kmp_dflt_team_nth; 7261 7262 #if KMP_AFFINITY_SUPPORTED 7263 // __kmp_affinity_initialize() will try to set __kmp_ncores to the 7264 // number of cores on the machine. 7265 __kmp_affinity_initialize(); 7266 7267 #endif /* KMP_AFFINITY_SUPPORTED */ 7268 7269 KMP_ASSERT(__kmp_xproc > 0); 7270 if (__kmp_avail_proc == 0) { 7271 __kmp_avail_proc = __kmp_xproc; 7272 } 7273 7274 // If there were empty places in num_threads list (OMP_NUM_THREADS=,,2,3), 7275 // correct them now 7276 j = 0; 7277 while ((j < __kmp_nested_nth.used) && !__kmp_nested_nth.nth[j]) { 7278 __kmp_nested_nth.nth[j] = __kmp_dflt_team_nth = __kmp_dflt_team_nth_ub = 7279 __kmp_avail_proc; 7280 j++; 7281 } 7282 7283 if (__kmp_dflt_team_nth == 0) { 7284 #ifdef KMP_DFLT_NTH_CORES 7285 // Default #threads = #cores 7286 __kmp_dflt_team_nth = __kmp_ncores; 7287 KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = " 7288 "__kmp_ncores (%d)\n", 7289 __kmp_dflt_team_nth)); 7290 #else 7291 // Default #threads = #available OS procs 7292 __kmp_dflt_team_nth = __kmp_avail_proc; 7293 KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = " 7294 "__kmp_avail_proc(%d)\n", 7295 __kmp_dflt_team_nth)); 7296 #endif /* KMP_DFLT_NTH_CORES */ 7297 } 7298 7299 if (__kmp_dflt_team_nth < KMP_MIN_NTH) { 7300 __kmp_dflt_team_nth = KMP_MIN_NTH; 7301 } 7302 if (__kmp_dflt_team_nth > __kmp_sys_max_nth) { 7303 __kmp_dflt_team_nth = __kmp_sys_max_nth; 7304 } 7305 7306 if (__kmp_nesting_mode > 0) 7307 __kmp_set_nesting_mode_threads(); 7308 7309 // There's no harm in continuing if the following check fails, 7310 // but it indicates an error in the previous logic. 7311 KMP_DEBUG_ASSERT(__kmp_dflt_team_nth <= __kmp_dflt_team_nth_ub); 7312 7313 if (__kmp_dflt_team_nth != prev_dflt_team_nth) { 7314 // Run through the __kmp_threads array and set the num threads icv for each 7315 // root thread that is currently registered with the RTL (which has not 7316 // already explicitly set its nthreads-var with a call to 7317 // omp_set_num_threads()). 
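// For example (sketch): a root that has already executed
//
//   omp_set_num_threads(4);
//
// carries a non-zero nproc ICV and is skipped by the loop below; only threads
// whose nproc ICV still holds 0, which the check below treats as "not
// explicitly set", pick up the new __kmp_dflt_team_nth.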
7318 for (i = 0; i < __kmp_threads_capacity; i++) { 7319 kmp_info_t *thread = __kmp_threads[i]; 7320 if (thread == NULL) 7321 continue; 7322 if (thread->th.th_current_task->td_icvs.nproc != 0) 7323 continue; 7324 7325 set__nproc(__kmp_threads[i], __kmp_dflt_team_nth); 7326 } 7327 } 7328 KA_TRACE( 7329 20, 7330 ("__kmp_middle_initialize: final value for __kmp_dflt_team_nth = %d\n", 7331 __kmp_dflt_team_nth)); 7332 7333 #ifdef KMP_ADJUST_BLOCKTIME 7334 /* Adjust blocktime to zero if necessary now that __kmp_avail_proc is set */ 7335 if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) { 7336 KMP_DEBUG_ASSERT(__kmp_avail_proc > 0); 7337 if (__kmp_nth > __kmp_avail_proc) { 7338 __kmp_zero_bt = TRUE; 7339 } 7340 } 7341 #endif /* KMP_ADJUST_BLOCKTIME */ 7342 7343 /* we have finished middle initialization */ 7344 TCW_SYNC_4(__kmp_init_middle, TRUE); 7345 7346 KA_TRACE(10, ("__kmp_do_middle_initialize: exit\n")); 7347 } 7348 7349 void __kmp_middle_initialize(void) { 7350 if (__kmp_init_middle) { 7351 return; 7352 } 7353 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock); 7354 if (__kmp_init_middle) { 7355 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 7356 return; 7357 } 7358 __kmp_do_middle_initialize(); 7359 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 7360 } 7361 7362 void __kmp_parallel_initialize(void) { 7363 int gtid = __kmp_entry_gtid(); // this might be a new root 7364 7365 /* synchronize parallel initialization (for sibling) */ 7366 if (TCR_4(__kmp_init_parallel)) 7367 return; 7368 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock); 7369 if (TCR_4(__kmp_init_parallel)) { 7370 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 7371 return; 7372 } 7373 7374 /* TODO reinitialization after we have already shut down */ 7375 if (TCR_4(__kmp_global.g.g_done)) { 7376 KA_TRACE( 7377 10, 7378 ("__kmp_parallel_initialize: attempt to init while shutting down\n")); 7379 __kmp_infinite_loop(); 7380 } 7381 7382 /* jc: The lock __kmp_initz_lock is already held, so calling 7383 __kmp_serial_initialize would cause a deadlock. So we call 7384 __kmp_do_serial_initialize directly. */ 7385 if (!__kmp_init_middle) { 7386 __kmp_do_middle_initialize(); 7387 } 7388 __kmp_assign_root_init_mask(); 7389 __kmp_resume_if_hard_paused(); 7390 7391 /* begin initialization */ 7392 KA_TRACE(10, ("__kmp_parallel_initialize: enter\n")); 7393 KMP_ASSERT(KMP_UBER_GTID(gtid)); 7394 7395 #if KMP_ARCH_X86 || KMP_ARCH_X86_64 7396 // Save the FP control regs. 7397 // Worker threads will set theirs to these values at thread startup. 
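// Worked example of the KMP_X86_MXCSR_MASK step below (assuming the mask
// keeps only the MXCSR control bits, i.e. 0xffffffc0): if the primary
// thread's MXCSR currently reads 0x1fa3 (some sticky exception-status bits
// set), then
//
//   0x1fa3 & 0xffffffc0 == 0x1f80
//
// i.e. round-to-nearest with all FP exceptions masked is what workers copy
// at startup, without inheriting stale status flags.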
7398 __kmp_store_x87_fpu_control_word(&__kmp_init_x87_fpu_control_word); 7399 __kmp_store_mxcsr(&__kmp_init_mxcsr); 7400 __kmp_init_mxcsr &= KMP_X86_MXCSR_MASK; 7401 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */ 7402 7403 #if KMP_OS_UNIX 7404 #if KMP_HANDLE_SIGNALS 7405 /* must be after __kmp_serial_initialize */ 7406 __kmp_install_signals(TRUE); 7407 #endif 7408 #endif 7409 7410 __kmp_suspend_initialize(); 7411 7412 #if defined(USE_LOAD_BALANCE) 7413 if (__kmp_global.g.g_dynamic_mode == dynamic_default) { 7414 __kmp_global.g.g_dynamic_mode = dynamic_load_balance; 7415 } 7416 #else 7417 if (__kmp_global.g.g_dynamic_mode == dynamic_default) { 7418 __kmp_global.g.g_dynamic_mode = dynamic_thread_limit; 7419 } 7420 #endif 7421 7422 if (__kmp_version) { 7423 __kmp_print_version_2(); 7424 } 7425 7426 /* we have finished parallel initialization */ 7427 TCW_SYNC_4(__kmp_init_parallel, TRUE); 7428 7429 KMP_MB(); 7430 KA_TRACE(10, ("__kmp_parallel_initialize: exit\n")); 7431 7432 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 7433 } 7434 7435 void __kmp_hidden_helper_initialize() { 7436 if (TCR_4(__kmp_init_hidden_helper)) 7437 return; 7438 7439 // __kmp_parallel_initialize is required before we initialize hidden helper 7440 if (!TCR_4(__kmp_init_parallel)) 7441 __kmp_parallel_initialize(); 7442 7443 // Double check. Note that this double check should not be placed before 7444 // __kmp_parallel_initialize as it will cause dead lock. 7445 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock); 7446 if (TCR_4(__kmp_init_hidden_helper)) { 7447 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 7448 return; 7449 } 7450 7451 // Set the count of hidden helper tasks to be executed to zero 7452 KMP_ATOMIC_ST_REL(&__kmp_unexecuted_hidden_helper_tasks, 0); 7453 7454 // Set the global variable indicating that we're initializing hidden helper 7455 // team/threads 7456 TCW_SYNC_4(__kmp_init_hidden_helper_threads, TRUE); 7457 7458 // Platform independent initialization 7459 __kmp_do_initialize_hidden_helper_threads(); 7460 7461 // Wait here for the finish of initialization of hidden helper teams 7462 __kmp_hidden_helper_threads_initz_wait(); 7463 7464 // We have finished hidden helper initialization 7465 TCW_SYNC_4(__kmp_init_hidden_helper, TRUE); 7466 7467 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 7468 } 7469 7470 /* ------------------------------------------------------------------------ */ 7471 7472 void __kmp_run_before_invoked_task(int gtid, int tid, kmp_info_t *this_thr, 7473 kmp_team_t *team) { 7474 kmp_disp_t *dispatch; 7475 7476 KMP_MB(); 7477 7478 /* none of the threads have encountered any constructs, yet. */ 7479 this_thr->th.th_local.this_construct = 0; 7480 #if KMP_CACHE_MANAGE 7481 KMP_CACHE_PREFETCH(&this_thr->th.th_bar[bs_forkjoin_barrier].bb.b_arrived); 7482 #endif /* KMP_CACHE_MANAGE */ 7483 dispatch = (kmp_disp_t *)TCR_PTR(this_thr->th.th_dispatch); 7484 KMP_DEBUG_ASSERT(dispatch); 7485 KMP_DEBUG_ASSERT(team->t.t_dispatch); 7486 // KMP_DEBUG_ASSERT( this_thr->th.th_dispatch == &team->t.t_dispatch[ 7487 // this_thr->th.th_info.ds.ds_tid ] ); 7488 7489 dispatch->th_disp_index = 0; /* reset the dispatch buffer counter */ 7490 dispatch->th_doacross_buf_idx = 0; // reset doacross dispatch buffer counter 7491 if (__kmp_env_consistency_check) 7492 __kmp_push_parallel(gtid, team->t.t_ident); 7493 7494 KMP_MB(); /* Flush all pending memory write invalidates. 
*/ 7495 } 7496 7497 void __kmp_run_after_invoked_task(int gtid, int tid, kmp_info_t *this_thr, 7498 kmp_team_t *team) { 7499 if (__kmp_env_consistency_check) 7500 __kmp_pop_parallel(gtid, team->t.t_ident); 7501 7502 __kmp_finish_implicit_task(this_thr); 7503 } 7504 7505 int __kmp_invoke_task_func(int gtid) { 7506 int rc; 7507 int tid = __kmp_tid_from_gtid(gtid); 7508 kmp_info_t *this_thr = __kmp_threads[gtid]; 7509 kmp_team_t *team = this_thr->th.th_team; 7510 7511 __kmp_run_before_invoked_task(gtid, tid, this_thr, team); 7512 #if USE_ITT_BUILD 7513 if (__itt_stack_caller_create_ptr) { 7514 // inform ittnotify about entering user's code 7515 if (team->t.t_stack_id != NULL) { 7516 __kmp_itt_stack_callee_enter((__itt_caller)team->t.t_stack_id); 7517 } else { 7518 KMP_DEBUG_ASSERT(team->t.t_parent->t.t_stack_id != NULL); 7519 __kmp_itt_stack_callee_enter( 7520 (__itt_caller)team->t.t_parent->t.t_stack_id); 7521 } 7522 } 7523 #endif /* USE_ITT_BUILD */ 7524 #if INCLUDE_SSC_MARKS 7525 SSC_MARK_INVOKING(); 7526 #endif 7527 7528 #if OMPT_SUPPORT 7529 void *dummy; 7530 void **exit_frame_p; 7531 ompt_data_t *my_task_data; 7532 ompt_data_t *my_parallel_data; 7533 int ompt_team_size; 7534 7535 if (ompt_enabled.enabled) { 7536 exit_frame_p = &(team->t.t_implicit_task_taskdata[tid] 7537 .ompt_task_info.frame.exit_frame.ptr); 7538 } else { 7539 exit_frame_p = &dummy; 7540 } 7541 7542 my_task_data = 7543 &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data); 7544 my_parallel_data = &(team->t.ompt_team_info.parallel_data); 7545 if (ompt_enabled.ompt_callback_implicit_task) { 7546 ompt_team_size = team->t.t_nproc; 7547 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 7548 ompt_scope_begin, my_parallel_data, my_task_data, ompt_team_size, 7549 __kmp_tid_from_gtid(gtid), ompt_task_implicit); 7550 OMPT_CUR_TASK_INFO(this_thr)->thread_num = __kmp_tid_from_gtid(gtid); 7551 } 7552 #endif 7553 7554 #if KMP_STATS_ENABLED 7555 stats_state_e previous_state = KMP_GET_THREAD_STATE(); 7556 if (previous_state == stats_state_e::TEAMS_REGION) { 7557 KMP_PUSH_PARTITIONED_TIMER(OMP_teams); 7558 } else { 7559 KMP_PUSH_PARTITIONED_TIMER(OMP_parallel); 7560 } 7561 KMP_SET_THREAD_STATE(IMPLICIT_TASK); 7562 #endif 7563 7564 rc = __kmp_invoke_microtask((microtask_t)TCR_SYNC_PTR(team->t.t_pkfn), gtid, 7565 tid, (int)team->t.t_argc, (void **)team->t.t_argv 7566 #if OMPT_SUPPORT 7567 , 7568 exit_frame_p 7569 #endif 7570 ); 7571 #if OMPT_SUPPORT 7572 *exit_frame_p = NULL; 7573 this_thr->th.ompt_thread_info.parallel_flags |= ompt_parallel_team; 7574 #endif 7575 7576 #if KMP_STATS_ENABLED 7577 if (previous_state == stats_state_e::TEAMS_REGION) { 7578 KMP_SET_THREAD_STATE(previous_state); 7579 } 7580 KMP_POP_PARTITIONED_TIMER(); 7581 #endif 7582 7583 #if USE_ITT_BUILD 7584 if (__itt_stack_caller_create_ptr) { 7585 // inform ittnotify about leaving user's code 7586 if (team->t.t_stack_id != NULL) { 7587 __kmp_itt_stack_callee_leave((__itt_caller)team->t.t_stack_id); 7588 } else { 7589 KMP_DEBUG_ASSERT(team->t.t_parent->t.t_stack_id != NULL); 7590 __kmp_itt_stack_callee_leave( 7591 (__itt_caller)team->t.t_parent->t.t_stack_id); 7592 } 7593 } 7594 #endif /* USE_ITT_BUILD */ 7595 __kmp_run_after_invoked_task(gtid, tid, this_thr, team); 7596 7597 return rc; 7598 } 7599 7600 void __kmp_teams_master(int gtid) { 7601 // This routine is called by all primary threads in teams construct 7602 kmp_info_t *thr = __kmp_threads[gtid]; 7603 kmp_team_t *team = thr->th.th_team; 7604 ident_t *loc = team->t.t_ident; 7605 
thr->th.th_set_nproc = thr->th.th_teams_size.nth; 7606 KMP_DEBUG_ASSERT(thr->th.th_teams_microtask); 7607 KMP_DEBUG_ASSERT(thr->th.th_set_nproc); 7608 KA_TRACE(20, ("__kmp_teams_master: T#%d, Tid %d, microtask %p\n", gtid, 7609 __kmp_tid_from_gtid(gtid), thr->th.th_teams_microtask)); 7610 7611 // This thread is a new CG root. Set up the proper variables. 7612 kmp_cg_root_t *tmp = (kmp_cg_root_t *)__kmp_allocate(sizeof(kmp_cg_root_t)); 7613 tmp->cg_root = thr; // Make thr the CG root 7614 // Init to thread limit stored when league primary threads were forked 7615 tmp->cg_thread_limit = thr->th.th_current_task->td_icvs.thread_limit; 7616 tmp->cg_nthreads = 1; // Init counter to one active thread, this one 7617 KA_TRACE(100, ("__kmp_teams_master: Thread %p created node %p and init" 7618 " cg_nthreads to 1\n", 7619 thr, tmp)); 7620 tmp->up = thr->th.th_cg_roots; 7621 thr->th.th_cg_roots = tmp; 7622 7623 // Launch league of teams now, but not let workers execute 7624 // (they hang on fork barrier until next parallel) 7625 #if INCLUDE_SSC_MARKS 7626 SSC_MARK_FORKING(); 7627 #endif 7628 __kmp_fork_call(loc, gtid, fork_context_intel, team->t.t_argc, 7629 (microtask_t)thr->th.th_teams_microtask, // "wrapped" task 7630 VOLATILE_CAST(launch_t) __kmp_invoke_task_func, NULL); 7631 #if INCLUDE_SSC_MARKS 7632 SSC_MARK_JOINING(); 7633 #endif 7634 // If the team size was reduced from the limit, set it to the new size 7635 if (thr->th.th_team_nproc < thr->th.th_teams_size.nth) 7636 thr->th.th_teams_size.nth = thr->th.th_team_nproc; 7637 // AC: last parameter "1" eliminates join barrier which won't work because 7638 // worker threads are in a fork barrier waiting for more parallel regions 7639 __kmp_join_call(loc, gtid 7640 #if OMPT_SUPPORT 7641 , 7642 fork_context_intel 7643 #endif 7644 , 7645 1); 7646 } 7647 7648 int __kmp_invoke_teams_master(int gtid) { 7649 kmp_info_t *this_thr = __kmp_threads[gtid]; 7650 kmp_team_t *team = this_thr->th.th_team; 7651 #if KMP_DEBUG 7652 if (!__kmp_threads[gtid]->th.th_team->t.t_serialized) 7653 KMP_DEBUG_ASSERT((void *)__kmp_threads[gtid]->th.th_team->t.t_pkfn == 7654 (void *)__kmp_teams_master); 7655 #endif 7656 __kmp_run_before_invoked_task(gtid, 0, this_thr, team); 7657 #if OMPT_SUPPORT 7658 int tid = __kmp_tid_from_gtid(gtid); 7659 ompt_data_t *task_data = 7660 &team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data; 7661 ompt_data_t *parallel_data = &team->t.ompt_team_info.parallel_data; 7662 if (ompt_enabled.ompt_callback_implicit_task) { 7663 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 7664 ompt_scope_begin, parallel_data, task_data, team->t.t_nproc, tid, 7665 ompt_task_initial); 7666 OMPT_CUR_TASK_INFO(this_thr)->thread_num = tid; 7667 } 7668 #endif 7669 __kmp_teams_master(gtid); 7670 #if OMPT_SUPPORT 7671 this_thr->th.ompt_thread_info.parallel_flags |= ompt_parallel_league; 7672 #endif 7673 __kmp_run_after_invoked_task(gtid, 0, this_thr, team); 7674 return 1; 7675 } 7676 7677 /* this sets the requested number of threads for the next parallel region 7678 encountered by this team. 
since this should be enclosed in the forkjoin 7679 critical section it should avoid race conditions with asymmetrical nested 7680 parallelism */ 7681 7682 void __kmp_push_num_threads(ident_t *id, int gtid, int num_threads) { 7683 kmp_info_t *thr = __kmp_threads[gtid]; 7684 7685 if (num_threads > 0) 7686 thr->th.th_set_nproc = num_threads; 7687 } 7688 7689 static void __kmp_push_thread_limit(kmp_info_t *thr, int num_teams, 7690 int num_threads) { 7691 KMP_DEBUG_ASSERT(thr); 7692 // Remember the number of threads for inner parallel regions 7693 if (!TCR_4(__kmp_init_middle)) 7694 __kmp_middle_initialize(); // get internal globals calculated 7695 __kmp_assign_root_init_mask(); 7696 KMP_DEBUG_ASSERT(__kmp_avail_proc); 7697 KMP_DEBUG_ASSERT(__kmp_dflt_team_nth); 7698 7699 if (num_threads == 0) { 7700 if (__kmp_teams_thread_limit > 0) { 7701 num_threads = __kmp_teams_thread_limit; 7702 } else { 7703 num_threads = __kmp_avail_proc / num_teams; 7704 } 7705 // adjust num_threads w/o warning as it is not user setting 7706 // num_threads = min(num_threads, nthreads-var, thread-limit-var) 7707 // no thread_limit clause specified - do not change thread-limit-var ICV 7708 if (num_threads > __kmp_dflt_team_nth) { 7709 num_threads = __kmp_dflt_team_nth; // honor nthreads-var ICV 7710 } 7711 if (num_threads > thr->th.th_current_task->td_icvs.thread_limit) { 7712 num_threads = thr->th.th_current_task->td_icvs.thread_limit; 7713 } // prevent team size to exceed thread-limit-var 7714 if (num_teams * num_threads > __kmp_teams_max_nth) { 7715 num_threads = __kmp_teams_max_nth / num_teams; 7716 } 7717 if (num_threads == 0) { 7718 num_threads = 1; 7719 } 7720 } else { 7721 if (num_threads < 0) { 7722 __kmp_msg(kmp_ms_warning, KMP_MSG(CantFormThrTeam, num_threads, 1), 7723 __kmp_msg_null); 7724 num_threads = 1; 7725 } 7726 // This thread will be the primary thread of the league primary threads 7727 // Store new thread limit; old limit is saved in th_cg_roots list 7728 thr->th.th_current_task->td_icvs.thread_limit = num_threads; 7729 // num_threads = min(num_threads, nthreads-var) 7730 if (num_threads > __kmp_dflt_team_nth) { 7731 num_threads = __kmp_dflt_team_nth; // honor nthreads-var ICV 7732 } 7733 if (num_teams * num_threads > __kmp_teams_max_nth) { 7734 int new_threads = __kmp_teams_max_nth / num_teams; 7735 if (new_threads == 0) { 7736 new_threads = 1; 7737 } 7738 if (new_threads != num_threads) { 7739 if (!__kmp_reserve_warn) { // user asked for too many threads 7740 __kmp_reserve_warn = 1; // conflicts with KMP_TEAMS_THREAD_LIMIT 7741 __kmp_msg(kmp_ms_warning, 7742 KMP_MSG(CantFormThrTeam, num_threads, new_threads), 7743 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null); 7744 } 7745 } 7746 num_threads = new_threads; 7747 } 7748 } 7749 thr->th.th_teams_size.nth = num_threads; 7750 } 7751 7752 /* this sets the requested number of teams for the teams region and/or 7753 the number of threads for the next parallel region encountered */ 7754 void __kmp_push_num_teams(ident_t *id, int gtid, int num_teams, 7755 int num_threads) { 7756 kmp_info_t *thr = __kmp_threads[gtid]; 7757 if (num_teams < 0) { 7758 // OpenMP specification requires requested values to be positive, 7759 // but people can send us any value, so we'd better check 7760 __kmp_msg(kmp_ms_warning, KMP_MSG(NumTeamsNotPositive, num_teams, 1), 7761 __kmp_msg_null); 7762 num_teams = 1; 7763 } 7764 if (num_teams == 0) { 7765 if (__kmp_nteams > 0) { 7766 num_teams = __kmp_nteams; 7767 } else { 7768 num_teams = 1; // default number of teams is 1. 
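// (Selection order, for reference: an explicit num_teams clause wins, then
// the value stored in __kmp_nteams -- normally parsed from OMP_NUM_TEAMS --
// and finally this default of a single team; the result is still clamped
// against __kmp_teams_max_nth just below.)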
7769 } 7770 } 7771 if (num_teams > __kmp_teams_max_nth) { // if too many teams requested? 7772 if (!__kmp_reserve_warn) { 7773 __kmp_reserve_warn = 1; 7774 __kmp_msg(kmp_ms_warning, 7775 KMP_MSG(CantFormThrTeam, num_teams, __kmp_teams_max_nth), 7776 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null); 7777 } 7778 num_teams = __kmp_teams_max_nth; 7779 } 7780 // Set number of teams (number of threads in the outer "parallel" of the 7781 // teams) 7782 thr->th.th_set_nproc = thr->th.th_teams_size.nteams = num_teams; 7783 7784 __kmp_push_thread_limit(thr, num_teams, num_threads); 7785 } 7786 7787 /* This sets the requested number of teams for the teams region and/or 7788 the number of threads for the next parallel region encountered */ 7789 void __kmp_push_num_teams_51(ident_t *id, int gtid, int num_teams_lb, 7790 int num_teams_ub, int num_threads) { 7791 kmp_info_t *thr = __kmp_threads[gtid]; 7792 KMP_DEBUG_ASSERT(num_teams_lb >= 0 && num_teams_ub >= 0); 7793 KMP_DEBUG_ASSERT(num_teams_ub >= num_teams_lb); 7794 KMP_DEBUG_ASSERT(num_threads >= 0); 7795 7796 if (num_teams_lb > num_teams_ub) { 7797 __kmp_fatal(KMP_MSG(FailedToCreateTeam, num_teams_lb, num_teams_ub), 7798 KMP_HNT(SetNewBound, __kmp_teams_max_nth), __kmp_msg_null); 7799 } 7800 7801 int num_teams = 1; // defalt number of teams is 1. 7802 7803 if (num_teams_lb == 0 && num_teams_ub > 0) 7804 num_teams_lb = num_teams_ub; 7805 7806 if (num_teams_lb == 0 && num_teams_ub == 0) { // no num_teams clause 7807 num_teams = (__kmp_nteams > 0) ? __kmp_nteams : num_teams; 7808 if (num_teams > __kmp_teams_max_nth) { 7809 if (!__kmp_reserve_warn) { 7810 __kmp_reserve_warn = 1; 7811 __kmp_msg(kmp_ms_warning, 7812 KMP_MSG(CantFormThrTeam, num_teams, __kmp_teams_max_nth), 7813 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null); 7814 } 7815 num_teams = __kmp_teams_max_nth; 7816 } 7817 } else if (num_teams_lb == num_teams_ub) { // requires exact number of teams 7818 num_teams = num_teams_ub; 7819 } else { // num_teams_lb <= num_teams <= num_teams_ub 7820 if (num_threads <= 0) { 7821 if (num_teams_ub > __kmp_teams_max_nth) { 7822 num_teams = num_teams_lb; 7823 } else { 7824 num_teams = num_teams_ub; 7825 } 7826 } else { 7827 num_teams = (num_threads > __kmp_teams_max_nth) 7828 ? num_teams 7829 : __kmp_teams_max_nth / num_threads; 7830 if (num_teams < num_teams_lb) { 7831 num_teams = num_teams_lb; 7832 } else if (num_teams > num_teams_ub) { 7833 num_teams = num_teams_ub; 7834 } 7835 } 7836 } 7837 // Set number of teams (number of threads in the outer "parallel" of the 7838 // teams) 7839 thr->th.th_set_nproc = thr->th.th_teams_size.nteams = num_teams; 7840 7841 __kmp_push_thread_limit(thr, num_teams, num_threads); 7842 } 7843 7844 // Set the proc_bind var to use in the following parallel region. 7845 void __kmp_push_proc_bind(ident_t *id, int gtid, kmp_proc_bind_t proc_bind) { 7846 kmp_info_t *thr = __kmp_threads[gtid]; 7847 thr->th.th_set_proc_bind = proc_bind; 7848 } 7849 7850 /* Launch the worker threads into the microtask. */ 7851 7852 void __kmp_internal_fork(ident_t *id, int gtid, kmp_team_t *team) { 7853 kmp_info_t *this_thr = __kmp_threads[gtid]; 7854 7855 #ifdef KMP_DEBUG 7856 int f; 7857 #endif /* KMP_DEBUG */ 7858 7859 KMP_DEBUG_ASSERT(team); 7860 KMP_DEBUG_ASSERT(this_thr->th.th_team == team); 7861 KMP_ASSERT(KMP_MASTER_GTID(gtid)); 7862 KMP_MB(); /* Flush all pending memory write invalidates. 
*/ 7863 7864 team->t.t_construct = 0; /* no single directives seen yet */ 7865 team->t.t_ordered.dt.t_value = 7866 0; /* thread 0 enters the ordered section first */ 7867 7868 /* Reset the identifiers on the dispatch buffer */ 7869 KMP_DEBUG_ASSERT(team->t.t_disp_buffer); 7870 if (team->t.t_max_nproc > 1) { 7871 int i; 7872 for (i = 0; i < __kmp_dispatch_num_buffers; ++i) { 7873 team->t.t_disp_buffer[i].buffer_index = i; 7874 team->t.t_disp_buffer[i].doacross_buf_idx = i; 7875 } 7876 } else { 7877 team->t.t_disp_buffer[0].buffer_index = 0; 7878 team->t.t_disp_buffer[0].doacross_buf_idx = 0; 7879 } 7880 7881 KMP_MB(); /* Flush all pending memory write invalidates. */ 7882 KMP_ASSERT(this_thr->th.th_team == team); 7883 7884 #ifdef KMP_DEBUG 7885 for (f = 0; f < team->t.t_nproc; f++) { 7886 KMP_DEBUG_ASSERT(team->t.t_threads[f] && 7887 team->t.t_threads[f]->th.th_team_nproc == team->t.t_nproc); 7888 } 7889 #endif /* KMP_DEBUG */ 7890 7891 /* release the worker threads so they may begin working */ 7892 __kmp_fork_barrier(gtid, 0); 7893 } 7894 7895 void __kmp_internal_join(ident_t *id, int gtid, kmp_team_t *team) { 7896 kmp_info_t *this_thr = __kmp_threads[gtid]; 7897 7898 KMP_DEBUG_ASSERT(team); 7899 KMP_DEBUG_ASSERT(this_thr->th.th_team == team); 7900 KMP_ASSERT(KMP_MASTER_GTID(gtid)); 7901 KMP_MB(); /* Flush all pending memory write invalidates. */ 7902 7903 /* Join barrier after fork */ 7904 7905 #ifdef KMP_DEBUG 7906 if (__kmp_threads[gtid] && 7907 __kmp_threads[gtid]->th.th_team_nproc != team->t.t_nproc) { 7908 __kmp_printf("GTID: %d, __kmp_threads[%d]=%p\n", gtid, gtid, 7909 __kmp_threads[gtid]); 7910 __kmp_printf("__kmp_threads[%d]->th.th_team_nproc=%d, TEAM: %p, " 7911 "team->t.t_nproc=%d\n", 7912 gtid, __kmp_threads[gtid]->th.th_team_nproc, team, 7913 team->t.t_nproc); 7914 __kmp_print_structure(); 7915 } 7916 KMP_DEBUG_ASSERT(__kmp_threads[gtid] && 7917 __kmp_threads[gtid]->th.th_team_nproc == team->t.t_nproc); 7918 #endif /* KMP_DEBUG */ 7919 7920 __kmp_join_barrier(gtid); /* wait for everyone */ 7921 #if OMPT_SUPPORT 7922 if (ompt_enabled.enabled && 7923 this_thr->th.ompt_thread_info.state == ompt_state_wait_barrier_implicit) { 7924 int ds_tid = this_thr->th.th_info.ds.ds_tid; 7925 ompt_data_t *task_data = OMPT_CUR_TASK_DATA(this_thr); 7926 this_thr->th.ompt_thread_info.state = ompt_state_overhead; 7927 #if OMPT_OPTIONAL 7928 void *codeptr = NULL; 7929 if (KMP_MASTER_TID(ds_tid) && 7930 (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) || 7931 ompt_callbacks.ompt_callback(ompt_callback_sync_region))) 7932 codeptr = OMPT_CUR_TEAM_INFO(this_thr)->master_return_address; 7933 7934 if (ompt_enabled.ompt_callback_sync_region_wait) { 7935 ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)( 7936 ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data, 7937 codeptr); 7938 } 7939 if (ompt_enabled.ompt_callback_sync_region) { 7940 ompt_callbacks.ompt_callback(ompt_callback_sync_region)( 7941 ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data, 7942 codeptr); 7943 } 7944 #endif 7945 if (!KMP_MASTER_TID(ds_tid) && ompt_enabled.ompt_callback_implicit_task) { 7946 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 7947 ompt_scope_end, NULL, task_data, 0, ds_tid, 7948 ompt_task_implicit); // TODO: Can this be ompt_task_initial? 7949 } 7950 } 7951 #endif 7952 7953 KMP_MB(); /* Flush all pending memory write invalidates. 
*/ 7954 KMP_ASSERT(this_thr->th.th_team == team); 7955 } 7956 7957 /* ------------------------------------------------------------------------ */ 7958 7959 #ifdef USE_LOAD_BALANCE 7960 7961 // Return the number of worker threads actively spinning in the hot team, if we 7962 // are at the outermost level of parallelism. Otherwise, return 0. 7963 static int __kmp_active_hot_team_nproc(kmp_root_t *root) { 7964 int i; 7965 int retval; 7966 kmp_team_t *hot_team; 7967 7968 if (root->r.r_active) { 7969 return 0; 7970 } 7971 hot_team = root->r.r_hot_team; 7972 if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME) { 7973 return hot_team->t.t_nproc - 1; // Don't count primary thread 7974 } 7975 7976 // Skip the primary thread - it is accounted for elsewhere. 7977 retval = 0; 7978 for (i = 1; i < hot_team->t.t_nproc; i++) { 7979 if (hot_team->t.t_threads[i]->th.th_active) { 7980 retval++; 7981 } 7982 } 7983 return retval; 7984 } 7985 7986 // Perform an automatic adjustment to the number of 7987 // threads used by the next parallel region. 7988 static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc) { 7989 int retval; 7990 int pool_active; 7991 int hot_team_active; 7992 int team_curr_active; 7993 int system_active; 7994 7995 KB_TRACE(20, ("__kmp_load_balance_nproc: called root:%p set_nproc:%d\n", root, 7996 set_nproc)); 7997 KMP_DEBUG_ASSERT(root); 7998 KMP_DEBUG_ASSERT(root->r.r_root_team->t.t_threads[0] 7999 ->th.th_current_task->td_icvs.dynamic == TRUE); 8000 KMP_DEBUG_ASSERT(set_nproc > 1); 8001 8002 if (set_nproc == 1) { 8003 KB_TRACE(20, ("__kmp_load_balance_nproc: serial execution.\n")); 8004 return 1; 8005 } 8006 8007 // Threads that are active in the thread pool, active in the hot team for this 8008 // particular root (if we are at the outer par level), and the currently 8009 // executing thread (to become the primary thread) are available to add to the 8010 // new team, but are currently contributing to the system load, and must be 8011 // accounted for. 8012 pool_active = __kmp_thread_pool_active_nth; 8013 hot_team_active = __kmp_active_hot_team_nproc(root); 8014 team_curr_active = pool_active + hot_team_active + 1; 8015 8016 // Check the system load. 8017 system_active = __kmp_get_load_balance(__kmp_avail_proc + team_curr_active); 8018 KB_TRACE(30, ("__kmp_load_balance_nproc: system active = %d pool active = %d " 8019 "hot team active = %d\n", 8020 system_active, pool_active, hot_team_active)); 8021 8022 if (system_active < 0) { 8023 // There was an error reading the necessary info from /proc, so use the 8024 // thread limit algorithm instead. Once we set __kmp_global.g.g_dynamic_mode 8025 // = dynamic_thread_limit, we shouldn't wind up getting back here. 8026 __kmp_global.g.g_dynamic_mode = dynamic_thread_limit; 8027 KMP_WARNING(CantLoadBalUsing, "KMP_DYNAMIC_MODE=thread limit"); 8028 8029 // Make this call behave like the thread limit algorithm. 8030 retval = __kmp_avail_proc - __kmp_nth + 8031 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc); 8032 if (retval > set_nproc) { 8033 retval = set_nproc; 8034 } 8035 if (retval < KMP_MIN_NTH) { 8036 retval = KMP_MIN_NTH; 8037 } 8038 8039 KB_TRACE(20, ("__kmp_load_balance_nproc: thread limit exit. retval:%d\n", 8040 retval)); 8041 return retval; 8042 } 8043 8044 // There is a slight delay in the load balance algorithm in detecting new 8045 // running procs. The real system load at this instant should be at least as 8046 // large as the number of active OMP threads that are available to add to the team.
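// For illustration only (hypothetical numbers, not measured on any system):
// with __kmp_avail_proc = 8, pool_active = 1 and hot_team_active = 2,
// team_curr_active = 1 + 2 + 1 = 4. If __kmp_get_load_balance() reports
// system_active = 6, then 6 - 4 = 2 procs are busy with unrelated work, and
// the code below computes retval = 8 - 6 + 4 = 6 threads, which is then
// clamped to the [KMP_MIN_NTH, set_nproc] range.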
8047 if (system_active < team_curr_active) { 8048 system_active = team_curr_active; 8049 } 8050 retval = __kmp_avail_proc - system_active + team_curr_active; 8051 if (retval > set_nproc) { 8052 retval = set_nproc; 8053 } 8054 if (retval < KMP_MIN_NTH) { 8055 retval = KMP_MIN_NTH; 8056 } 8057 8058 KB_TRACE(20, ("__kmp_load_balance_nproc: exit. retval:%d\n", retval)); 8059 return retval; 8060 } // __kmp_load_balance_nproc() 8061 8062 #endif /* USE_LOAD_BALANCE */ 8063 8064 /* ------------------------------------------------------------------------ */ 8065 8066 /* NOTE: this is called with the __kmp_init_lock held */ 8067 void __kmp_cleanup(void) { 8068 int f; 8069 8070 KA_TRACE(10, ("__kmp_cleanup: enter\n")); 8071 8072 if (TCR_4(__kmp_init_parallel)) { 8073 #if KMP_HANDLE_SIGNALS 8074 __kmp_remove_signals(); 8075 #endif 8076 TCW_4(__kmp_init_parallel, FALSE); 8077 } 8078 8079 if (TCR_4(__kmp_init_middle)) { 8080 #if KMP_AFFINITY_SUPPORTED 8081 __kmp_affinity_uninitialize(); 8082 #endif /* KMP_AFFINITY_SUPPORTED */ 8083 __kmp_cleanup_hierarchy(); 8084 TCW_4(__kmp_init_middle, FALSE); 8085 } 8086 8087 KA_TRACE(10, ("__kmp_cleanup: go serial cleanup\n")); 8088 8089 if (__kmp_init_serial) { 8090 __kmp_runtime_destroy(); 8091 __kmp_init_serial = FALSE; 8092 } 8093 8094 __kmp_cleanup_threadprivate_caches(); 8095 8096 for (f = 0; f < __kmp_threads_capacity; f++) { 8097 if (__kmp_root[f] != NULL) { 8098 __kmp_free(__kmp_root[f]); 8099 __kmp_root[f] = NULL; 8100 } 8101 } 8102 __kmp_free(__kmp_threads); 8103 // __kmp_threads and __kmp_root were allocated at once, as single block, so 8104 // there is no need in freeing __kmp_root. 8105 __kmp_threads = NULL; 8106 __kmp_root = NULL; 8107 __kmp_threads_capacity = 0; 8108 8109 // Free old __kmp_threads arrays if they exist. 
8110 kmp_old_threads_list_t *ptr = __kmp_old_threads_list; 8111 while (ptr) { 8112 kmp_old_threads_list_t *next = ptr->next; 8113 __kmp_free(ptr->threads); 8114 __kmp_free(ptr); 8115 ptr = next; 8116 } 8117 8118 #if KMP_USE_DYNAMIC_LOCK 8119 __kmp_cleanup_indirect_user_locks(); 8120 #else 8121 __kmp_cleanup_user_locks(); 8122 #endif 8123 #if OMPD_SUPPORT 8124 if (ompd_state) { 8125 __kmp_free(ompd_env_block); 8126 ompd_env_block = NULL; 8127 ompd_env_block_size = 0; 8128 } 8129 #endif 8130 8131 #if KMP_AFFINITY_SUPPORTED 8132 KMP_INTERNAL_FREE(CCAST(char *, __kmp_cpuinfo_file)); 8133 __kmp_cpuinfo_file = NULL; 8134 #endif /* KMP_AFFINITY_SUPPORTED */ 8135 8136 #if KMP_USE_ADAPTIVE_LOCKS 8137 #if KMP_DEBUG_ADAPTIVE_LOCKS 8138 __kmp_print_speculative_stats(); 8139 #endif 8140 #endif 8141 KMP_INTERNAL_FREE(__kmp_nested_nth.nth); 8142 __kmp_nested_nth.nth = NULL; 8143 __kmp_nested_nth.size = 0; 8144 __kmp_nested_nth.used = 0; 8145 KMP_INTERNAL_FREE(__kmp_nested_proc_bind.bind_types); 8146 __kmp_nested_proc_bind.bind_types = NULL; 8147 __kmp_nested_proc_bind.size = 0; 8148 __kmp_nested_proc_bind.used = 0; 8149 if (__kmp_affinity_format) { 8150 KMP_INTERNAL_FREE(__kmp_affinity_format); 8151 __kmp_affinity_format = NULL; 8152 } 8153 8154 __kmp_i18n_catclose(); 8155 8156 #if KMP_USE_HIER_SCHED 8157 __kmp_hier_scheds.deallocate(); 8158 #endif 8159 8160 #if KMP_STATS_ENABLED 8161 __kmp_stats_fini(); 8162 #endif 8163 8164 KA_TRACE(10, ("__kmp_cleanup: exit\n")); 8165 } 8166 8167 /* ------------------------------------------------------------------------ */ 8168 8169 int __kmp_ignore_mppbeg(void) { 8170 char *env; 8171 8172 if ((env = getenv("KMP_IGNORE_MPPBEG")) != NULL) { 8173 if (__kmp_str_match_false(env)) 8174 return FALSE; 8175 } 8176 // By default __kmpc_begin() is no-op. 8177 return TRUE; 8178 } 8179 8180 int __kmp_ignore_mppend(void) { 8181 char *env; 8182 8183 if ((env = getenv("KMP_IGNORE_MPPEND")) != NULL) { 8184 if (__kmp_str_match_false(env)) 8185 return FALSE; 8186 } 8187 // By default __kmpc_end() is no-op. 
8188 return TRUE; 8189 } 8190 8191 void __kmp_internal_begin(void) { 8192 int gtid; 8193 kmp_root_t *root; 8194 8195 /* this is a very important step as it will register new sibling threads 8196 and assign these new uber threads a new gtid */ 8197 gtid = __kmp_entry_gtid(); 8198 root = __kmp_threads[gtid]->th.th_root; 8199 KMP_ASSERT(KMP_UBER_GTID(gtid)); 8200 8201 if (root->r.r_begin) 8202 return; 8203 __kmp_acquire_lock(&root->r.r_begin_lock, gtid); 8204 if (root->r.r_begin) { 8205 __kmp_release_lock(&root->r.r_begin_lock, gtid); 8206 return; 8207 } 8208 8209 root->r.r_begin = TRUE; 8210 8211 __kmp_release_lock(&root->r.r_begin_lock, gtid); 8212 } 8213 8214 /* ------------------------------------------------------------------------ */ 8215 8216 void __kmp_user_set_library(enum library_type arg) { 8217 int gtid; 8218 kmp_root_t *root; 8219 kmp_info_t *thread; 8220 8221 /* first, make sure we are initialized so we can get our gtid */ 8222 8223 gtid = __kmp_entry_gtid(); 8224 thread = __kmp_threads[gtid]; 8225 8226 root = thread->th.th_root; 8227 8228 KA_TRACE(20, ("__kmp_user_set_library: enter T#%d, arg: %d, %d\n", gtid, arg, 8229 library_serial)); 8230 if (root->r.r_in_parallel) { /* Must be called in serial section of top-level 8231 thread */ 8232 KMP_WARNING(SetLibraryIncorrectCall); 8233 return; 8234 } 8235 8236 switch (arg) { 8237 case library_serial: 8238 thread->th.th_set_nproc = 0; 8239 set__nproc(thread, 1); 8240 break; 8241 case library_turnaround: 8242 thread->th.th_set_nproc = 0; 8243 set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth 8244 : __kmp_dflt_team_nth_ub); 8245 break; 8246 case library_throughput: 8247 thread->th.th_set_nproc = 0; 8248 set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth 8249 : __kmp_dflt_team_nth_ub); 8250 break; 8251 default: 8252 KMP_FATAL(UnknownLibraryType, arg); 8253 } 8254 8255 __kmp_aux_set_library(arg); 8256 } 8257 8258 void __kmp_aux_set_stacksize(size_t arg) { 8259 if (!__kmp_init_serial) 8260 __kmp_serial_initialize(); 8261 8262 #if KMP_OS_DARWIN 8263 if (arg & (0x1000 - 1)) { 8264 arg &= ~(0x1000 - 1); 8265 if (arg + 0x1000) /* check for overflow if we round up */ 8266 arg += 0x1000; 8267 } 8268 #endif 8269 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock); 8270 8271 /* only change the default stacksize before the first parallel region */ 8272 if (!TCR_4(__kmp_init_parallel)) { 8273 size_t value = arg; /* argument is in bytes */ 8274 8275 if (value < __kmp_sys_min_stksize) 8276 value = __kmp_sys_min_stksize; 8277 else if (value > KMP_MAX_STKSIZE) 8278 value = KMP_MAX_STKSIZE; 8279 8280 __kmp_stksize = value; 8281 8282 __kmp_env_stksize = TRUE; /* was KMP_STACKSIZE specified? */ 8283 } 8284 8285 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 8286 } 8287 8288 /* set the behaviour of the runtime library */ 8289 /* TODO this can cause some odd behaviour with sibling parallelism... 
*/ 8290 void __kmp_aux_set_library(enum library_type arg) { 8291 __kmp_library = arg; 8292 8293 switch (__kmp_library) { 8294 case library_serial: { 8295 KMP_INFORM(LibraryIsSerial); 8296 } break; 8297 case library_turnaround: 8298 if (__kmp_use_yield == 1 && !__kmp_use_yield_exp_set) 8299 __kmp_use_yield = 2; // only yield when oversubscribed 8300 break; 8301 case library_throughput: 8302 if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME) 8303 __kmp_dflt_blocktime = KMP_DEFAULT_BLOCKTIME; 8304 break; 8305 default: 8306 KMP_FATAL(UnknownLibraryType, arg); 8307 } 8308 } 8309 8310 /* Getting team information common for all team API */ 8311 // Returns NULL if not in teams construct 8312 static kmp_team_t *__kmp_aux_get_team_info(int &teams_serialized) { 8313 kmp_info_t *thr = __kmp_entry_thread(); 8314 teams_serialized = 0; 8315 if (thr->th.th_teams_microtask) { 8316 kmp_team_t *team = thr->th.th_team; 8317 int tlevel = thr->th.th_teams_level; // the level of the teams construct 8318 int ii = team->t.t_level; 8319 teams_serialized = team->t.t_serialized; 8320 int level = tlevel + 1; 8321 KMP_DEBUG_ASSERT(ii >= tlevel); 8322 while (ii > level) { 8323 for (teams_serialized = team->t.t_serialized; 8324 (teams_serialized > 0) && (ii > level); teams_serialized--, ii--) { 8325 } 8326 if (team->t.t_serialized && (!teams_serialized)) { 8327 team = team->t.t_parent; 8328 continue; 8329 } 8330 if (ii > level) { 8331 team = team->t.t_parent; 8332 ii--; 8333 } 8334 } 8335 return team; 8336 } 8337 return NULL; 8338 } 8339 8340 int __kmp_aux_get_team_num() { 8341 int serialized; 8342 kmp_team_t *team = __kmp_aux_get_team_info(serialized); 8343 if (team) { 8344 if (serialized > 1) { 8345 return 0; // teams region is serialized ( 1 team of 1 thread ). 8346 } else { 8347 return team->t.t_master_tid; 8348 } 8349 } 8350 return 0; 8351 } 8352 8353 int __kmp_aux_get_num_teams() { 8354 int serialized; 8355 kmp_team_t *team = __kmp_aux_get_team_info(serialized); 8356 if (team) { 8357 if (serialized > 1) { 8358 return 1; 8359 } else { 8360 return team->t.t_parent->t.t_nproc; 8361 } 8362 } 8363 return 1; 8364 } 8365 8366 /* ------------------------------------------------------------------------ */ 8367 8368 /* 8369 * Affinity Format Parser 8370 * 8371 * Field is in form of: %[[[0].]size]type 8372 * % and type are required (%% means print a literal '%') 8373 * type is either single char or long name surrounded by {}, 8374 * e.g., N or {num_threads} 8375 * 0 => leading zeros 8376 * . => right justified when size is specified 8377 * by default output is left justified 8378 * size is the *minimum* field length 8379 * All other characters are printed as is 8380 * 8381 * Available field types: 8382 * L {thread_level} - omp_get_level() 8383 * n {thread_num} - omp_get_thread_num() 8384 * h {host} - name of host machine 8385 * P {process_id} - process id (integer) 8386 * T {thread_identifier} - native thread identifier (integer) 8387 * N {num_threads} - omp_get_num_threads() 8388 * A {ancestor_tnum} - omp_get_ancestor_thread_num(omp_get_level()-1) 8389 * a {thread_affinity} - comma separated list of integers or integer ranges 8390 * (values of affinity mask) 8391 * 8392 * Implementation-specific field types can be added 8393 * If a type is unknown, print "undefined" 8394 */ 8395 8396 // Structure holding the short name, long name, and corresponding data type 8397 // for snprintf. A table of these will represent the entire valid keyword 8398 // field types. 
8399 typedef struct kmp_affinity_format_field_t { 8400 char short_name; // from spec e.g., L -> thread level 8401 const char *long_name; // from spec thread_level -> thread level 8402 char field_format; // data type for snprintf (typically 'd' or 's' 8403 // for integer or string) 8404 } kmp_affinity_format_field_t; 8405 8406 static const kmp_affinity_format_field_t __kmp_affinity_format_table[] = { 8407 #if KMP_AFFINITY_SUPPORTED 8408 {'A', "thread_affinity", 's'}, 8409 #endif 8410 {'t', "team_num", 'd'}, 8411 {'T', "num_teams", 'd'}, 8412 {'L', "nesting_level", 'd'}, 8413 {'n', "thread_num", 'd'}, 8414 {'N', "num_threads", 'd'}, 8415 {'a', "ancestor_tnum", 'd'}, 8416 {'H', "host", 's'}, 8417 {'P', "process_id", 'd'}, 8418 {'i', "native_thread_id", 'd'}}; 8419 8420 // Return the number of characters it takes to hold field 8421 static int __kmp_aux_capture_affinity_field(int gtid, const kmp_info_t *th, 8422 const char **ptr, 8423 kmp_str_buf_t *field_buffer) { 8424 int rc, format_index, field_value; 8425 const char *width_left, *width_right; 8426 bool pad_zeros, right_justify, parse_long_name, found_valid_name; 8427 static const int FORMAT_SIZE = 20; 8428 char format[FORMAT_SIZE] = {0}; 8429 char absolute_short_name = 0; 8430 8431 KMP_DEBUG_ASSERT(gtid >= 0); 8432 KMP_DEBUG_ASSERT(th); 8433 KMP_DEBUG_ASSERT(**ptr == '%'); 8434 KMP_DEBUG_ASSERT(field_buffer); 8435 8436 __kmp_str_buf_clear(field_buffer); 8437 8438 // Skip the initial % 8439 (*ptr)++; 8440 8441 // Check for %% first 8442 if (**ptr == '%') { 8443 __kmp_str_buf_cat(field_buffer, "%", 1); 8444 (*ptr)++; // skip over the second % 8445 return 1; 8446 } 8447 8448 // Parse field modifiers if they are present 8449 pad_zeros = false; 8450 if (**ptr == '0') { 8451 pad_zeros = true; 8452 (*ptr)++; // skip over 0 8453 } 8454 right_justify = false; 8455 if (**ptr == '.') { 8456 right_justify = true; 8457 (*ptr)++; // skip over . 8458 } 8459 // Parse width of field: [width_left, width_right) 8460 width_left = width_right = NULL; 8461 if (**ptr >= '0' && **ptr <= '9') { 8462 width_left = *ptr; 8463 SKIP_DIGITS(*ptr); 8464 width_right = *ptr; 8465 } 8466 8467 // Create the format for KMP_SNPRINTF based on flags parsed above 8468 format_index = 0; 8469 format[format_index++] = '%'; 8470 if (!right_justify) 8471 format[format_index++] = '-'; 8472 if (pad_zeros) 8473 format[format_index++] = '0'; 8474 if (width_left && width_right) { 8475 int i = 0; 8476 // Only allow 8 digit number widths. 
8477 // This also prevents overflowing format variable 8478 while (i < 8 && width_left < width_right) { 8479 format[format_index++] = *width_left; 8480 width_left++; 8481 i++; 8482 } 8483 } 8484 8485 // Parse a name (long or short) 8486 // Canonicalize the name into absolute_short_name 8487 found_valid_name = false; 8488 parse_long_name = (**ptr == '{'); 8489 if (parse_long_name) 8490 (*ptr)++; // skip initial left brace 8491 for (size_t i = 0; i < sizeof(__kmp_affinity_format_table) / 8492 sizeof(__kmp_affinity_format_table[0]); 8493 ++i) { 8494 char short_name = __kmp_affinity_format_table[i].short_name; 8495 const char *long_name = __kmp_affinity_format_table[i].long_name; 8496 char field_format = __kmp_affinity_format_table[i].field_format; 8497 if (parse_long_name) { 8498 size_t length = KMP_STRLEN(long_name); 8499 if (strncmp(*ptr, long_name, length) == 0) { 8500 found_valid_name = true; 8501 (*ptr) += length; // skip the long name 8502 } 8503 } else if (**ptr == short_name) { 8504 found_valid_name = true; 8505 (*ptr)++; // skip the short name 8506 } 8507 if (found_valid_name) { 8508 format[format_index++] = field_format; 8509 format[format_index++] = '\0'; 8510 absolute_short_name = short_name; 8511 break; 8512 } 8513 } 8514 if (parse_long_name) { 8515 if (**ptr != '}') { 8516 absolute_short_name = 0; 8517 } else { 8518 (*ptr)++; // skip over the right brace 8519 } 8520 } 8521 8522 // Attempt to fill the buffer with the requested 8523 // value using snprintf within __kmp_str_buf_print() 8524 switch (absolute_short_name) { 8525 case 't': 8526 rc = __kmp_str_buf_print(field_buffer, format, __kmp_aux_get_team_num()); 8527 break; 8528 case 'T': 8529 rc = __kmp_str_buf_print(field_buffer, format, __kmp_aux_get_num_teams()); 8530 break; 8531 case 'L': 8532 rc = __kmp_str_buf_print(field_buffer, format, th->th.th_team->t.t_level); 8533 break; 8534 case 'n': 8535 rc = __kmp_str_buf_print(field_buffer, format, __kmp_tid_from_gtid(gtid)); 8536 break; 8537 case 'H': { 8538 static const int BUFFER_SIZE = 256; 8539 char buf[BUFFER_SIZE]; 8540 __kmp_expand_host_name(buf, BUFFER_SIZE); 8541 rc = __kmp_str_buf_print(field_buffer, format, buf); 8542 } break; 8543 case 'P': 8544 rc = __kmp_str_buf_print(field_buffer, format, getpid()); 8545 break; 8546 case 'i': 8547 rc = __kmp_str_buf_print(field_buffer, format, __kmp_gettid()); 8548 break; 8549 case 'N': 8550 rc = __kmp_str_buf_print(field_buffer, format, th->th.th_team->t.t_nproc); 8551 break; 8552 case 'a': 8553 field_value = 8554 __kmp_get_ancestor_thread_num(gtid, th->th.th_team->t.t_level - 1); 8555 rc = __kmp_str_buf_print(field_buffer, format, field_value); 8556 break; 8557 #if KMP_AFFINITY_SUPPORTED 8558 case 'A': { 8559 kmp_str_buf_t buf; 8560 __kmp_str_buf_init(&buf); 8561 __kmp_affinity_str_buf_mask(&buf, th->th.th_affin_mask); 8562 rc = __kmp_str_buf_print(field_buffer, format, buf.str); 8563 __kmp_str_buf_free(&buf); 8564 } break; 8565 #endif 8566 default: 8567 // According to spec, If an implementation does not have info for field 8568 // type, then "undefined" is printed 8569 rc = __kmp_str_buf_print(field_buffer, "%s", "undefined"); 8570 // Skip the field 8571 if (parse_long_name) { 8572 SKIP_TOKEN(*ptr); 8573 if (**ptr == '}') 8574 (*ptr)++; 8575 } else { 8576 (*ptr)++; 8577 } 8578 } 8579 8580 KMP_ASSERT(format_index <= FORMAT_SIZE); 8581 return rc; 8582 } 8583 8584 /* 8585 * Return number of characters needed to hold the affinity string 8586 * (not including null byte character) 8587 * The resultant string is printed to buffer, 
which the caller can then 8588 * handle afterwards 8589 */ 8590 size_t __kmp_aux_capture_affinity(int gtid, const char *format, 8591 kmp_str_buf_t *buffer) { 8592 const char *parse_ptr; 8593 size_t retval; 8594 const kmp_info_t *th; 8595 kmp_str_buf_t field; 8596 8597 KMP_DEBUG_ASSERT(buffer); 8598 KMP_DEBUG_ASSERT(gtid >= 0); 8599 8600 __kmp_str_buf_init(&field); 8601 __kmp_str_buf_clear(buffer); 8602 8603 th = __kmp_threads[gtid]; 8604 retval = 0; 8605 8606 // If format is NULL or zero-length string, then we use 8607 // affinity-format-var ICV 8608 parse_ptr = format; 8609 if (parse_ptr == NULL || *parse_ptr == '\0') { 8610 parse_ptr = __kmp_affinity_format; 8611 } 8612 KMP_DEBUG_ASSERT(parse_ptr); 8613 8614 while (*parse_ptr != '\0') { 8615 // Parse a field 8616 if (*parse_ptr == '%') { 8617 // Put field in the buffer 8618 int rc = __kmp_aux_capture_affinity_field(gtid, th, &parse_ptr, &field); 8619 __kmp_str_buf_catbuf(buffer, &field); 8620 retval += rc; 8621 } else { 8622 // Put literal character in buffer 8623 __kmp_str_buf_cat(buffer, parse_ptr, 1); 8624 retval++; 8625 parse_ptr++; 8626 } 8627 } 8628 __kmp_str_buf_free(&field); 8629 return retval; 8630 } 8631 8632 // Displays the affinity string to stdout 8633 void __kmp_aux_display_affinity(int gtid, const char *format) { 8634 kmp_str_buf_t buf; 8635 __kmp_str_buf_init(&buf); 8636 __kmp_aux_capture_affinity(gtid, format, &buf); 8637 __kmp_fprintf(kmp_out, "%s" KMP_END_OF_LINE, buf.str); 8638 __kmp_str_buf_free(&buf); 8639 } 8640 8641 /* ------------------------------------------------------------------------ */ 8642 8643 void __kmp_aux_set_blocktime(int arg, kmp_info_t *thread, int tid) { 8644 int blocktime = arg; /* argument is in milliseconds */ 8645 #if KMP_USE_MONITOR 8646 int bt_intervals; 8647 #endif 8648 kmp_int8 bt_set; 8649 8650 __kmp_save_internal_controls(thread); 8651 8652 /* Normalize and set blocktime for the teams */ 8653 if (blocktime < KMP_MIN_BLOCKTIME) 8654 blocktime = KMP_MIN_BLOCKTIME; 8655 else if (blocktime > KMP_MAX_BLOCKTIME) 8656 blocktime = KMP_MAX_BLOCKTIME; 8657 8658 set__blocktime_team(thread->th.th_team, tid, blocktime); 8659 set__blocktime_team(thread->th.th_serial_team, 0, blocktime); 8660 8661 #if KMP_USE_MONITOR 8662 /* Calculate and set blocktime intervals for the teams */ 8663 bt_intervals = KMP_INTERVALS_FROM_BLOCKTIME(blocktime, __kmp_monitor_wakeups); 8664 8665 set__bt_intervals_team(thread->th.th_team, tid, bt_intervals); 8666 set__bt_intervals_team(thread->th.th_serial_team, 0, bt_intervals); 8667 #endif 8668 8669 /* Set whether blocktime has been set to "TRUE" */ 8670 bt_set = TRUE; 8671 8672 set__bt_set_team(thread->th.th_team, tid, bt_set); 8673 set__bt_set_team(thread->th.th_serial_team, 0, bt_set); 8674 #if KMP_USE_MONITOR 8675 KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d, " 8676 "bt_intervals=%d, monitor_updates=%d\n", 8677 __kmp_gtid_from_tid(tid, thread->th.th_team), 8678 thread->th.th_team->t.t_id, tid, blocktime, bt_intervals, 8679 __kmp_monitor_wakeups)); 8680 #else 8681 KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d\n", 8682 __kmp_gtid_from_tid(tid, thread->th.th_team), 8683 thread->th.th_team->t.t_id, tid, blocktime)); 8684 #endif 8685 } 8686 8687 void __kmp_aux_set_defaults(char const *str, size_t len) { 8688 if (!__kmp_init_serial) { 8689 __kmp_serial_initialize(); 8690 } 8691 __kmp_env_initialize(str); 8692 8693 if (__kmp_settings || __kmp_display_env || __kmp_display_env_verbose) { 8694 __kmp_env_print(); 8695 } 8696 } // __kmp_aux_set_defaults 
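// ----------------------------------------------------------------------------
// Illustrative sketch (kept out of the build on purpose): how the affinity
// format machinery above is typically exercised through the standard OpenMP
// 5.0 API (omp_set_affinity_format, omp_capture_affinity,
// omp_display_affinity), which this machinery ultimately serves. The format
// string below uses only field types listed in __kmp_affinity_format_table.
// Note how the modifiers map to the snprintf format built by
// __kmp_aux_capture_affinity_field: "%0.4{thread_num}" becomes "%04d"
// (zero-padded, right-justified, minimum width 4), while "%4n" without the
// '.' modifier becomes the left-justified "%-4d".
#if 0
#include <omp.h>
#include <stdio.h>

int main(void) {
  // Literal text is copied through unchanged; %H, %P, %{thread_num} and %N
  // are parsed as fields.
  omp_set_affinity_format("host=%H pid=%P tid=%0.4{thread_num} of %N");
#pragma omp parallel
  {
    char buf[256];
    // Returns the number of characters needed (excluding the null byte),
    // mirroring __kmp_aux_capture_affinity's return value.
    size_t needed = omp_capture_affinity(buf, sizeof(buf), NULL);
    if (needed < sizeof(buf))
      printf("%s\n", buf);
    omp_display_affinity(NULL); // NULL format => use affinity-format-var
  }
  return 0;
}
#endif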
8697 8698 /* ------------------------------------------------------------------------ */ 8699 /* internal fast reduction routines */ 8700 8701 PACKED_REDUCTION_METHOD_T 8702 __kmp_determine_reduction_method( 8703 ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars, size_t reduce_size, 8704 void *reduce_data, void (*reduce_func)(void *lhs_data, void *rhs_data), 8705 kmp_critical_name *lck) { 8706 8707 // Default reduction method: critical construct ( lck != NULL, like in current 8708 // PAROPT ) 8709 // If ( reduce_data!=NULL && reduce_func!=NULL ): the tree-reduction method 8710 // can be selected by RTL 8711 // If loc->flags contains KMP_IDENT_ATOMIC_REDUCE, the atomic reduce method 8712 // can be selected by RTL 8713 // Finally, it's up to OpenMP RTL to make a decision on which method to select 8714 // among those generated by PAROPT. 8715 8716 PACKED_REDUCTION_METHOD_T retval; 8717 8718 int team_size; 8719 8720 KMP_DEBUG_ASSERT(loc); // it would be nice to test ( loc != 0 ) 8721 KMP_DEBUG_ASSERT(lck); // it would be nice to test ( lck != 0 ) 8722 8723 #define FAST_REDUCTION_ATOMIC_METHOD_GENERATED \ 8724 (loc && \ 8725 ((loc->flags & (KMP_IDENT_ATOMIC_REDUCE)) == (KMP_IDENT_ATOMIC_REDUCE))) 8726 #define FAST_REDUCTION_TREE_METHOD_GENERATED ((reduce_data) && (reduce_func)) 8727 8728 retval = critical_reduce_block; 8729 8730 // another choice of getting a team size (with 1 dynamic dereference) is slower 8731 team_size = __kmp_get_team_num_threads(global_tid); 8732 if (team_size == 1) { 8733 8734 retval = empty_reduce_block; 8735 8736 } else { 8737 8738 int atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED; 8739 8740 #if KMP_ARCH_X86_64 || KMP_ARCH_PPC64 || KMP_ARCH_AARCH64 || \ 8741 KMP_ARCH_MIPS64 || KMP_ARCH_RISCV64 8742 8743 #if KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD || \ 8744 KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HURD 8745 8746 int teamsize_cutoff = 4; 8747 8748 #if KMP_MIC_SUPPORTED 8749 if (__kmp_mic_type != non_mic) { 8750 teamsize_cutoff = 8; 8751 } 8752 #endif 8753 int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED; 8754 if (tree_available) { 8755 if (team_size <= teamsize_cutoff) { 8756 if (atomic_available) { 8757 retval = atomic_reduce_block; 8758 } 8759 } else { 8760 retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER; 8761 } 8762 } else if (atomic_available) { 8763 retval = atomic_reduce_block; 8764 } 8765 #else 8766 #error "Unknown or unsupported OS" 8767 #endif // KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD || 8768 // KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HURD 8769 8770 #elif KMP_ARCH_X86 || KMP_ARCH_ARM || KMP_ARCH_AARCH || KMP_ARCH_MIPS 8771 8772 #if KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_WINDOWS || KMP_OS_HURD 8773 8774 // basic tuning 8775 8776 if (atomic_available) { 8777 if (num_vars <= 2) { // && ( team_size <= 8 ) due to false-sharing ???
8778 retval = atomic_reduce_block; 8779 } 8780 } // otherwise: use critical section 8781 8782 #elif KMP_OS_DARWIN 8783 8784 int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED; 8785 if (atomic_available && (num_vars <= 3)) { 8786 retval = atomic_reduce_block; 8787 } else if (tree_available) { 8788 if ((reduce_size > (9 * sizeof(kmp_real64))) && 8789 (reduce_size < (2000 * sizeof(kmp_real64)))) { 8790 retval = TREE_REDUCE_BLOCK_WITH_PLAIN_BARRIER; 8791 } 8792 } // otherwise: use critical section 8793 8794 #else 8795 #error "Unknown or unsupported OS" 8796 #endif 8797 8798 #else 8799 #error "Unknown or unsupported architecture" 8800 #endif 8801 } 8802 8803 // KMP_FORCE_REDUCTION 8804 8805 // If the team is serialized (team_size == 1), ignore the forced reduction 8806 // method and stay with the unsynchronized method (empty_reduce_block) 8807 if (__kmp_force_reduction_method != reduction_method_not_defined && 8808 team_size != 1) { 8809 8810 PACKED_REDUCTION_METHOD_T forced_retval = critical_reduce_block; 8811 8812 int atomic_available, tree_available; 8813 8814 switch ((forced_retval = __kmp_force_reduction_method)) { 8815 case critical_reduce_block: 8816 KMP_ASSERT(lck); // lck should be != 0 8817 break; 8818 8819 case atomic_reduce_block: 8820 atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED; 8821 if (!atomic_available) { 8822 KMP_WARNING(RedMethodNotSupported, "atomic"); 8823 forced_retval = critical_reduce_block; 8824 } 8825 break; 8826 8827 case tree_reduce_block: 8828 tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED; 8829 if (!tree_available) { 8830 KMP_WARNING(RedMethodNotSupported, "tree"); 8831 forced_retval = critical_reduce_block; 8832 } else { 8833 #if KMP_FAST_REDUCTION_BARRIER 8834 forced_retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER; 8835 #endif 8836 } 8837 break; 8838 8839 default: 8840 KMP_ASSERT(0); // "unsupported method specified" 8841 } 8842 8843 retval = forced_retval; 8844 } 8845 8846 KA_TRACE(10, ("reduction method selected=%08x\n", retval)); 8847 8848 #undef FAST_REDUCTION_TREE_METHOD_GENERATED 8849 #undef FAST_REDUCTION_ATOMIC_METHOD_GENERATED 8850 8851 return (retval); 8852 } 8853 // this function is for testing set/get/determine reduce method 8854 kmp_int32 __kmp_get_reduce_method(void) { 8855 return ((__kmp_entry_thread()->th.th_local.packed_reduction_method) >> 8); 8856 } 8857 8858 // Soft pause sets up threads to ignore blocktime and just go to sleep. 8859 // Spin-wait code checks __kmp_pause_status and reacts accordingly. 8860 void __kmp_soft_pause() { __kmp_pause_status = kmp_soft_paused; } 8861 8862 // Hard pause shuts down the runtime completely. Resume happens naturally when 8863 // OpenMP is used subsequently. 8864 void __kmp_hard_pause() { 8865 __kmp_pause_status = kmp_hard_paused; 8866 __kmp_internal_end_thread(-1); 8867 } 8868 8869 // Soft resume sets __kmp_pause_status, and wakes up all threads. 
8870 void __kmp_resume_if_soft_paused() { 8871 if (__kmp_pause_status == kmp_soft_paused) { 8872 __kmp_pause_status = kmp_not_paused; 8873 8874 for (int gtid = 1; gtid < __kmp_threads_capacity; ++gtid) { 8875 kmp_info_t *thread = __kmp_threads[gtid]; 8876 if (thread) { // Wake it if sleeping 8877 kmp_flag_64<> fl(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go, 8878 thread); 8879 if (fl.is_sleeping()) 8880 fl.resume(gtid); 8881 else if (__kmp_try_suspend_mx(thread)) { // got suspend lock 8882 __kmp_unlock_suspend_mx(thread); // unlock it; it won't sleep 8883 } else { // thread holds the lock and may sleep soon 8884 do { // until either the thread sleeps, or we can get the lock 8885 if (fl.is_sleeping()) { 8886 fl.resume(gtid); 8887 break; 8888 } else if (__kmp_try_suspend_mx(thread)) { 8889 __kmp_unlock_suspend_mx(thread); 8890 break; 8891 } 8892 } while (1); 8893 } 8894 } 8895 } 8896 } 8897 } 8898 8899 // This function is called via __kmpc_pause_resource. Returns 0 if successful. 8900 // TODO: add warning messages 8901 int __kmp_pause_resource(kmp_pause_status_t level) { 8902 if (level == kmp_not_paused) { // requesting resume 8903 if (__kmp_pause_status == kmp_not_paused) { 8904 // error message about runtime not being paused, so can't resume 8905 return 1; 8906 } else { 8907 KMP_DEBUG_ASSERT(__kmp_pause_status == kmp_soft_paused || 8908 __kmp_pause_status == kmp_hard_paused); 8909 __kmp_pause_status = kmp_not_paused; 8910 return 0; 8911 } 8912 } else if (level == kmp_soft_paused) { // requesting soft pause 8913 if (__kmp_pause_status != kmp_not_paused) { 8914 // error message about already being paused 8915 return 1; 8916 } else { 8917 __kmp_soft_pause(); 8918 return 0; 8919 } 8920 } else if (level == kmp_hard_paused) { // requesting hard pause 8921 if (__kmp_pause_status != kmp_not_paused) { 8922 // error message about already being paused 8923 return 1; 8924 } else { 8925 __kmp_hard_pause(); 8926 return 0; 8927 } 8928 } else { 8929 // error message about invalid level 8930 return 1; 8931 } 8932 } 8933 8934 void __kmp_omp_display_env(int verbose) { 8935 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock); 8936 if (__kmp_init_serial == 0) 8937 __kmp_do_serial_initialize(); 8938 __kmp_display_env_impl(!verbose, verbose); 8939 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 8940 } 8941 8942 // The team size is changing, so distributed barrier must be modified 8943 void __kmp_resize_dist_barrier(kmp_team_t *team, int old_nthreads, 8944 int new_nthreads) { 8945 KMP_DEBUG_ASSERT(__kmp_barrier_release_pattern[bs_forkjoin_barrier] == 8946 bp_dist_bar); 8947 kmp_info_t **other_threads = team->t.t_threads; 8948 8949 // We want all the workers to stop waiting on the barrier while we adjust the 8950 // size of the team. 
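// For reference, the th_used_in_team values used by this resize protocol and
// by __kmp_add_threads_to_team() below: 0 = not part of the team, 1 = in the
// team, 2 = asked to leave during a resize (the worker moves itself 2 -> 0),
// 3 = transitioning into the team (the worker moves itself 3 -> 1).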
8951 for (int f = 1; f < old_nthreads; ++f) { 8952 KMP_DEBUG_ASSERT(other_threads[f] != NULL); 8953 // Ignore threads that are already inactive or not present in the team 8954 if (team->t.t_threads[f]->th.th_used_in_team.load() == 0) { 8955 // teams construct causes thread_limit to get passed in, and some of 8956 // those could be inactive; just ignore them 8957 continue; 8958 } 8959 // If thread is transitioning still to in_use state, wait for it 8960 if (team->t.t_threads[f]->th.th_used_in_team.load() == 3) { 8961 while (team->t.t_threads[f]->th.th_used_in_team.load() == 3) 8962 KMP_CPU_PAUSE(); 8963 } 8964 // The thread should be in_use now 8965 KMP_DEBUG_ASSERT(team->t.t_threads[f]->th.th_used_in_team.load() == 1); 8966 // Transition to unused state 8967 team->t.t_threads[f]->th.th_used_in_team.store(2); 8968 KMP_DEBUG_ASSERT(team->t.t_threads[f]->th.th_used_in_team.load() == 2); 8969 } 8970 // Release all the workers 8971 team->t.b->go_release(); 8972 8973 KMP_MFENCE(); 8974 8975 // Workers should see transition status 2 and move to 0; but may need to be 8976 // woken up first 8977 int count = old_nthreads - 1; 8978 while (count > 0) { 8979 count = old_nthreads - 1; 8980 for (int f = 1; f < old_nthreads; ++f) { 8981 if (other_threads[f]->th.th_used_in_team.load() != 0) { 8982 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) { // Wake up the workers 8983 kmp_atomic_flag_64<> *flag = (kmp_atomic_flag_64<> *)CCAST( 8984 void *, other_threads[f]->th.th_sleep_loc); 8985 __kmp_atomic_resume_64(other_threads[f]->th.th_info.ds.ds_gtid, flag); 8986 } 8987 } else { 8988 KMP_DEBUG_ASSERT(team->t.t_threads[f]->th.th_used_in_team.load() == 0); 8989 count--; 8990 } 8991 } 8992 } 8993 // Now update the barrier size 8994 team->t.b->update_num_threads(new_nthreads); 8995 team->t.b->go_reset(); 8996 } 8997 8998 void __kmp_add_threads_to_team(kmp_team_t *team, int new_nthreads) { 8999 // Add the threads back to the team 9000 KMP_DEBUG_ASSERT(team); 9001 // Threads were paused and pointed at th_used_in_team temporarily during a 9002 // resize of the team. We're going to set th_used_in_team to 3 to indicate to 9003 // the thread that it should transition itself back into the team. Then, if 9004 // blocktime isn't infinite, the thread could be sleeping, so we send a resume 9005 // to wake it up. 9006 for (int f = 1; f < new_nthreads; ++f) { 9007 KMP_DEBUG_ASSERT(team->t.t_threads[f]); 9008 KMP_COMPARE_AND_STORE_ACQ32(&(team->t.t_threads[f]->th.th_used_in_team), 0, 9009 3); 9010 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) { // Wake up sleeping threads 9011 __kmp_resume_32(team->t.t_threads[f]->th.th_info.ds.ds_gtid, 9012 (kmp_flag_32<false, false> *)NULL); 9013 } 9014 } 9015 // The threads should be transitioning to the team; when they are done, they 9016 // should have set th_used_in_team to 1. This loop forces master to wait until 9017 // all threads have moved into the team and are waiting in the barrier. 
9018 int count = new_nthreads - 1; 9019 while (count > 0) { 9020 count = new_nthreads - 1; 9021 for (int f = 1; f < new_nthreads; ++f) { 9022 if (team->t.t_threads[f]->th.th_used_in_team.load() == 1) { 9023 count--; 9024 } 9025 } 9026 } 9027 } 9028 9029 // Globals and functions for hidden helper task 9030 kmp_info_t **__kmp_hidden_helper_threads; 9031 kmp_info_t *__kmp_hidden_helper_main_thread; 9032 std::atomic<kmp_int32> __kmp_unexecuted_hidden_helper_tasks; 9033 #if KMP_OS_LINUX 9034 kmp_int32 __kmp_hidden_helper_threads_num = 8; 9035 kmp_int32 __kmp_enable_hidden_helper = TRUE; 9036 #else 9037 kmp_int32 __kmp_hidden_helper_threads_num = 0; 9038 kmp_int32 __kmp_enable_hidden_helper = FALSE; 9039 #endif 9040 9041 namespace { 9042 std::atomic<kmp_int32> __kmp_hit_hidden_helper_threads_num; 9043 9044 void __kmp_hidden_helper_wrapper_fn(int *gtid, int *, ...) { 9045 // This is an explicit synchronization of all hidden helper threads, in case 9046 // a regular thread pushes a hidden helper task to a hidden helper thread 9047 // that has not been awakened since the helpers were released by the main 9048 // thread after creating the team. 9049 KMP_ATOMIC_INC(&__kmp_hit_hidden_helper_threads_num); 9050 while (KMP_ATOMIC_LD_ACQ(&__kmp_hit_hidden_helper_threads_num) != 9051 __kmp_hidden_helper_threads_num) 9052 ; 9053 9054 // If main thread, then wait for signal 9055 if (__kmpc_master(nullptr, *gtid)) { 9056 // First, unset the initial state and release the initial thread 9057 TCW_4(__kmp_init_hidden_helper_threads, FALSE); 9058 __kmp_hidden_helper_initz_release(); 9059 __kmp_hidden_helper_main_thread_wait(); 9060 // Now wake up all worker threads 9061 for (int i = 1; i < __kmp_hit_hidden_helper_threads_num; ++i) { 9062 __kmp_hidden_helper_worker_thread_signal(); 9063 } 9064 } 9065 } 9066 } // namespace 9067 9068 void __kmp_hidden_helper_threads_initz_routine() { 9069 // Create a new root for hidden helper team/threads 9070 const int gtid = __kmp_register_root(TRUE); 9071 __kmp_hidden_helper_main_thread = __kmp_threads[gtid]; 9072 __kmp_hidden_helper_threads = &__kmp_threads[gtid]; 9073 __kmp_hidden_helper_main_thread->th.th_set_nproc = 9074 __kmp_hidden_helper_threads_num; 9075 9076 KMP_ATOMIC_ST_REL(&__kmp_hit_hidden_helper_threads_num, 0); 9077 9078 __kmpc_fork_call(nullptr, 0, __kmp_hidden_helper_wrapper_fn); 9079 9080 // Set the initialization flag to FALSE 9081 TCW_SYNC_4(__kmp_init_hidden_helper, FALSE); 9082 9083 __kmp_hidden_helper_threads_deinitz_release(); 9084 } 9085 9086 /* Nesting Mode: 9087 Set via KMP_NESTING_MODE, which takes an integer. 9088 Note: we skip duplicate topology levels, and skip levels with only 9089 one entity. 9090 KMP_NESTING_MODE=0 is the default, and doesn't use nesting mode. 9091 KMP_NESTING_MODE=1 sets as many nesting levels as there are distinct levels 9092 in the topology, and initializes the number of threads at each of those 9093 levels to the number of entities at each level, respectively, below the 9094 entity at the parent level. 9095 KMP_NESTING_MODE=N, where N>1, attempts to create up to N nesting levels, 9096 but starts with nesting OFF -- max-active-levels-var is 1 -- and requires 9097 the user to turn nesting on explicitly. This is an even more experimental 9098 option to this experimental feature, and may change or go away in the 9099 future.
9100 */ 9101 9102 // Allocate space to store nesting levels 9103 void __kmp_init_nesting_mode() { 9104 int levels = KMP_HW_LAST; 9105 __kmp_nesting_mode_nlevels = levels; 9106 __kmp_nesting_nth_level = (int *)KMP_INTERNAL_MALLOC(levels * sizeof(int)); 9107 for (int i = 0; i < levels; ++i) 9108 __kmp_nesting_nth_level[i] = 0; 9109 if (__kmp_nested_nth.size < levels) { 9110 __kmp_nested_nth.nth = 9111 (int *)KMP_INTERNAL_REALLOC(__kmp_nested_nth.nth, levels * sizeof(int)); 9112 __kmp_nested_nth.size = levels; 9113 } 9114 } 9115 9116 // Set # threads for top levels of nesting; must be called after topology set 9117 void __kmp_set_nesting_mode_threads() { 9118 kmp_info_t *thread = __kmp_threads[__kmp_entry_gtid()]; 9119 9120 if (__kmp_nesting_mode == 1) 9121 __kmp_nesting_mode_nlevels = KMP_MAX_ACTIVE_LEVELS_LIMIT; 9122 else if (__kmp_nesting_mode > 1) 9123 __kmp_nesting_mode_nlevels = __kmp_nesting_mode; 9124 9125 if (__kmp_topology) { // use topology info 9126 int loc, hw_level; 9127 for (loc = 0, hw_level = 0; hw_level < __kmp_topology->get_depth() && 9128 loc < __kmp_nesting_mode_nlevels; 9129 loc++, hw_level++) { 9130 __kmp_nesting_nth_level[loc] = __kmp_topology->get_ratio(hw_level); 9131 if (__kmp_nesting_nth_level[loc] == 1) 9132 loc--; 9133 } 9134 // Make sure all cores are used 9135 if (__kmp_nesting_mode > 1 && loc > 1) { 9136 int core_level = __kmp_topology->get_level(KMP_HW_CORE); 9137 int num_cores = __kmp_topology->get_count(core_level); 9138 int upper_levels = 1; 9139 for (int level = 0; level < loc - 1; ++level) 9140 upper_levels *= __kmp_nesting_nth_level[level]; 9141 if (upper_levels * __kmp_nesting_nth_level[loc - 1] < num_cores) 9142 __kmp_nesting_nth_level[loc - 1] = 9143 num_cores / __kmp_nesting_nth_level[loc - 2]; 9144 } 9145 __kmp_nesting_mode_nlevels = loc; 9146 __kmp_nested_nth.used = __kmp_nesting_mode_nlevels; 9147 } else { // no topology info available; provide a reasonable guesstimation 9148 if (__kmp_avail_proc >= 4) { 9149 __kmp_nesting_nth_level[0] = __kmp_avail_proc / 2; 9150 __kmp_nesting_nth_level[1] = 2; 9151 __kmp_nesting_mode_nlevels = 2; 9152 } else { 9153 __kmp_nesting_nth_level[0] = __kmp_avail_proc; 9154 __kmp_nesting_mode_nlevels = 1; 9155 } 9156 __kmp_nested_nth.used = __kmp_nesting_mode_nlevels; 9157 } 9158 for (int i = 0; i < __kmp_nesting_mode_nlevels; ++i) { 9159 __kmp_nested_nth.nth[i] = __kmp_nesting_nth_level[i]; 9160 } 9161 set__nproc(thread, __kmp_nesting_nth_level[0]); 9162 if (__kmp_nesting_mode > 1 && __kmp_nesting_mode_nlevels > __kmp_nesting_mode) 9163 __kmp_nesting_mode_nlevels = __kmp_nesting_mode; 9164 if (get__max_active_levels(thread) > 1) { 9165 // if max levels was set, set nesting mode levels to same 9166 __kmp_nesting_mode_nlevels = get__max_active_levels(thread); 9167 } 9168 if (__kmp_nesting_mode == 1) // turn on nesting for this case only 9169 set__max_active_levels(thread, __kmp_nesting_mode_nlevels); 9170 } 9171
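// ----------------------------------------------------------------------------
// Illustrative sketch (kept out of the build on purpose) of what the nesting
// mode above amounts to for a user program. The topology is hypothetical: on
// a machine detected as, e.g., 2 sockets x 8 cores x 2 hw-threads,
// KMP_NESTING_MODE=1 enables nesting and seeds __kmp_nested_nth from the
// topology ratios, so absent other settings (e.g. OMP_NUM_THREADS) the nested
// parallel regions below would default to teams of 2, 8 and 2 threads at
// levels 1, 2 and 3 respectively.
//
//   $ KMP_NESTING_MODE=1 ./nesting_demo
#if 0
#include <omp.h>
#include <stdio.h>

int main(void) {
#pragma omp parallel // level 1: e.g., one thread per socket
  {
#pragma omp parallel // level 2: e.g., one thread per core of that socket
    {
#pragma omp single
      printf("level %d: %d threads in this team\n", omp_get_level(),
             omp_get_num_threads());
    }
#pragma omp single
    printf("level %d: %d threads in this team\n", omp_get_level(),
           omp_get_num_threads());
  }
  return 0;
}
#endif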