xref: /netbsd-src/external/gpl3/gcc.old/dist/libgomp/team.c (revision d16b7486a53dcb8072b60ec6fcb4373a2d0c27b7)
1 /* Copyright (C) 2005-2020 Free Software Foundation, Inc.
2    Contributed by Richard Henderson <rth@redhat.com>.
3 
4    This file is part of the GNU Offloading and Multi Processing Library
5    (libgomp).
6 
7    Libgomp is free software; you can redistribute it and/or modify it
8    under the terms of the GNU General Public License as published by
9    the Free Software Foundation; either version 3, or (at your option)
10    any later version.
11 
12    Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
13    WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
14    FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
15    more details.
16 
17    Under Section 7 of GPL version 3, you are granted additional
18    permissions described in the GCC Runtime Library Exception, version
19    3.1, as published by the Free Software Foundation.
20 
21    You should have received a copy of the GNU General Public License and
22    a copy of the GCC Runtime Library Exception along with this program;
23    see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
24    <http://www.gnu.org/licenses/>.  */
25 
26 /* This file handles the maintenance of threads in response to team
27    creation and termination.  */
28 
29 #include "libgomp.h"
30 #include "pool.h"
31 #include <stdlib.h>
32 #include <string.h>
33 
34 #ifdef LIBGOMP_USE_PTHREADS
35 pthread_attr_t gomp_thread_attr;
36 
37 /* This key is for the thread destructor.  */
38 pthread_key_t gomp_thread_destructor;
39 
40 
41 /* This is the libgomp per-thread data structure.  */
42 #if defined HAVE_TLS || defined USE_EMUTLS
43 __thread struct gomp_thread gomp_tls_data;
44 #else
45 pthread_key_t gomp_tls_key;
46 #endif
47 
48 
49 /* This structure is used to communicate across pthread_create.  */
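/* It is filled in on the creating thread's stack by gomp_team_start and read
   by the new thread in gomp_thread_start before the first barrier.  */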
50 
51 struct gomp_thread_start_data
52 {
53   void (*fn) (void *);
54   void *fn_data;
55   struct gomp_team_state ts;
56   struct gomp_task *task;
57   struct gomp_thread_pool *thread_pool;
58   unsigned int place;
59   bool nested;
60   pthread_t handle;
61 };
62 
63 
64 /* This function is a pthread_create entry point.  It contains the idle
65    loop in which a thread waits to be called up to become part of a team.  */
66 
67 static void *
68 gomp_thread_start (void *xdata)
69 {
70   struct gomp_thread_start_data *data = xdata;
71   struct gomp_thread *thr;
72   struct gomp_thread_pool *pool;
73   void (*local_fn) (void *);
74   void *local_data;
75 
76 #if defined HAVE_TLS || defined USE_EMUTLS
77   thr = &gomp_tls_data;
78 #else
79   struct gomp_thread local_thr;
80   thr = &local_thr;
81   pthread_setspecific (gomp_tls_key, thr);
82 #endif
83   gomp_sem_init (&thr->release, 0);
84 
85   /* Extract what we need from data.  */
86   local_fn = data->fn;
87   local_data = data->fn_data;
88   thr->thread_pool = data->thread_pool;
89   thr->ts = data->ts;
90   thr->task = data->task;
91   thr->place = data->place;
92 #ifdef GOMP_NEEDS_THREAD_HANDLE
93   thr->handle = data->handle;
94 #endif
95 
96   thr->ts.team->ordered_release[thr->ts.team_id] = &thr->release;
97 
98   /* Keep a local pointer to the thread pool.  */
99   pool = thr->thread_pool;
100 
101   if (data->nested)
102     {
103       struct gomp_team *team = thr->ts.team;
104       struct gomp_task *task = thr->task;
105 
106       gomp_barrier_wait (&team->barrier);
107 
108       local_fn (local_data);
109       gomp_team_barrier_wait_final (&team->barrier);
110       gomp_finish_task (task);
111       gomp_barrier_wait_last (&team->barrier);
112     }
113   else
114     {
115       pool->threads[thr->ts.team_id] = thr;
116 
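      /* Dock in the pool: wait here until gomp_team_start has finished
	 setting up the team and releases the pool threads.  */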
117       gomp_simple_barrier_wait (&pool->threads_dock);
118       do
119 	{
120 	  struct gomp_team *team = thr->ts.team;
121 	  struct gomp_task *task = thr->task;
122 
123 	  local_fn (local_data);
124 	  gomp_team_barrier_wait_final (&team->barrier);
125 	  gomp_finish_task (task);
126 
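	  /* Park at the dock between teams.  The master wakes us either with
	     work for the next team (or a pool-teardown helper) passed via
	     thr->fn/thr->data, or with thr->fn left NULL when this thread is
	     no longer needed, in which case the loop exits and the thread
	     terminates below.  */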
127 	  gomp_simple_barrier_wait (&pool->threads_dock);
128 
129 	  local_fn = thr->fn;
130 	  local_data = thr->data;
131 	  thr->fn = NULL;
132 	}
133       while (local_fn);
134     }
135 
136   gomp_sem_destroy (&thr->release);
137   pthread_detach (pthread_self ());
138   thr->thread_pool = NULL;
139   thr->task = NULL;
140   return NULL;
141 }
142 #endif
143 
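/* Return the team cached in the current thread's pool if it was created for
   exactly NTHREADS threads, taking ownership of it; otherwise return NULL so
   that the caller allocates a fresh team.  Teams are only cached for threads
   that are not currently inside a team.  */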
144 static inline struct gomp_team *
145 get_last_team (unsigned nthreads)
146 {
147   struct gomp_thread *thr = gomp_thread ();
148   if (thr->ts.team == NULL)
149     {
150       struct gomp_thread_pool *pool = gomp_get_thread_pool (thr, nthreads);
151       struct gomp_team *last_team = pool->last_team;
152       if (last_team != NULL && last_team->nthreads == nthreads)
153         {
154           pool->last_team = NULL;
155           return last_team;
156         }
157     }
158   return NULL;
159 }
160 
161 /* Create a new team data structure.  */
162 
163 struct gomp_team *
164 gomp_new_team (unsigned nthreads)
165 {
166   struct gomp_team *team;
167   int i;
168 
169   team = get_last_team (nthreads);
170   if (team == NULL)
171     {
172       size_t extra = sizeof (team->ordered_release[0])
173 		     + sizeof (team->implicit_task[0]);
174       team = team_malloc (sizeof (*team) + nthreads * extra);
175 
176 #ifndef HAVE_SYNC_BUILTINS
177       gomp_mutex_init (&team->work_share_list_free_lock);
178 #endif
179       gomp_barrier_init (&team->barrier, nthreads);
180       gomp_mutex_init (&team->task_lock);
181 
182       team->nthreads = nthreads;
183     }
184 
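  /* Set up the team's inline cache of work shares: work_shares[0] is
     initialized for the first work-sharing construct, while work_shares[1]
     through work_shares[7] are chained onto the allocation free list for
     later constructs.  */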
185   team->work_share_chunk = 8;
186 #ifdef HAVE_SYNC_BUILTINS
187   team->single_count = 0;
188 #endif
189   team->work_shares_to_free = &team->work_shares[0];
190   gomp_init_work_share (&team->work_shares[0], 0, nthreads);
191   team->work_shares[0].next_alloc = NULL;
192   team->work_share_list_free = NULL;
193   team->work_share_list_alloc = &team->work_shares[1];
194   for (i = 1; i < 7; i++)
195     team->work_shares[i].next_free = &team->work_shares[i + 1];
196   team->work_shares[i].next_free = NULL;
197 
198   gomp_sem_init (&team->master_release, 0);
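  /* The ordered_release array lives in the extra space allocated just past
     implicit_task[NTHREADS] (see the size computation above); slot 0 is the
     master thread's release semaphore.  */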
199   team->ordered_release = (void *) &team->implicit_task[nthreads];
200   team->ordered_release[0] = &team->master_release;
201 
202   priority_queue_init (&team->task_queue);
203   team->task_count = 0;
204   team->task_queued_count = 0;
205   team->task_running_count = 0;
206   team->work_share_cancelled = 0;
207   team->team_cancelled = 0;
208 
209   return team;
210 }
211 
212 
213 /* Free a team data structure.  */
214 
215 static void
216 free_team (struct gomp_team *team)
217 {
218 #ifndef HAVE_SYNC_BUILTINS
219   gomp_mutex_destroy (&team->work_share_list_free_lock);
220 #endif
221   gomp_barrier_destroy (&team->barrier);
222   gomp_mutex_destroy (&team->task_lock);
223   priority_queue_free (&team->task_queue);
224   team_free (team);
225 }
226 
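/* Run by each pool thread that is being shut down: signal the master through
   the dock barrier, release the per-thread resources and terminate the thread
   in a target-specific way.  */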
227 static void
228 gomp_free_pool_helper (void *thread_pool)
229 {
230   struct gomp_thread *thr = gomp_thread ();
231   struct gomp_thread_pool *pool
232     = (struct gomp_thread_pool *) thread_pool;
233   gomp_simple_barrier_wait_last (&pool->threads_dock);
234   gomp_sem_destroy (&thr->release);
235   thr->thread_pool = NULL;
236   thr->task = NULL;
237 #ifdef LIBGOMP_USE_PTHREADS
238   pthread_detach (pthread_self ());
239   pthread_exit (NULL);
240 #elif defined(__nvptx__)
241   asm ("exit;");
242 #elif defined(__AMDGCN__)
243   asm ("s_dcache_wb\n\t"
244        "s_endpgm");
245 #else
246 #error gomp_free_pool_helper must terminate the thread
247 #endif
248 }
249 
250 /* Free a thread pool and release its threads. */
251 
252 void
253 gomp_free_thread (void *arg __attribute__((unused)))
254 {
255   struct gomp_thread *thr = gomp_thread ();
256   struct gomp_thread_pool *pool = thr->thread_pool;
257   if (pool)
258     {
259       if (pool->threads_used > 0)
260 	{
261 	  int i;
262 	  for (i = 1; i < pool->threads_used; i++)
263 	    {
264 	      struct gomp_thread *nthr = pool->threads[i];
265 	      nthr->fn = gomp_free_pool_helper;
266 	      nthr->data = pool;
267 	    }
268 	  /* This barrier undocks threads docked on pool->threads_dock.  */
269 	  gomp_simple_barrier_wait (&pool->threads_dock);
270 	  /* And this waits until all threads have called
271 	     gomp_simple_barrier_wait_last in gomp_free_pool_helper.  */
272 	  gomp_simple_barrier_wait (&pool->threads_dock);
273 	  /* Now it is safe to destroy the barrier and free the pool.  */
274 	  gomp_simple_barrier_destroy (&pool->threads_dock);
275 
276 #ifdef HAVE_SYNC_BUILTINS
277 	  __sync_fetch_and_add (&gomp_managed_threads,
278 				1L - pool->threads_used);
279 #else
280 	  gomp_mutex_lock (&gomp_managed_threads_lock);
281 	  gomp_managed_threads -= pool->threads_used - 1L;
282 	  gomp_mutex_unlock (&gomp_managed_threads_lock);
283 #endif
284 	}
285       if (pool->last_team)
286 	free_team (pool->last_team);
287 #ifndef __nvptx__
288       team_free (pool->threads);
289       team_free (pool);
290 #endif
291       thr->thread_pool = NULL;
292     }
293   if (thr->ts.level == 0 && __builtin_expect (thr->ts.team != NULL, 0))
294     gomp_team_end ();
295   if (thr->task != NULL)
296     {
297       struct gomp_task *task = thr->task;
298       gomp_end_task ();
299       free (task);
300     }
301 }
302 
303 /* Launch a team.  */
304 
305 #ifdef LIBGOMP_USE_PTHREADS
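/* Start TEAM with NTHREADS threads executing FN (DATA); the calling thread
   becomes thread 0 of the team.  The low bits of FLAGS select the requested
   proc_bind policy, and TASKGROUP, if non-NULL, is the taskgroup the implicit
   tasks belong to.  Threads are reused from the pool where possible and
   created with pthread_create otherwise.  */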
306 void
307 gomp_team_start (void (*fn) (void *), void *data, unsigned nthreads,
308 		 unsigned flags, struct gomp_team *team,
309 		 struct gomp_taskgroup *taskgroup)
310 {
311   struct gomp_thread_start_data *start_data;
312   struct gomp_thread *thr, *nthr;
313   struct gomp_task *task;
314   struct gomp_task_icv *icv;
315   bool nested;
316   struct gomp_thread_pool *pool;
317   unsigned i, n, old_threads_used = 0;
318   pthread_attr_t thread_attr, *attr;
319   unsigned long nthreads_var;
320   char bind, bind_var;
321   unsigned int s = 0, rest = 0, p = 0, k = 0;
322   unsigned int affinity_count = 0;
323   struct gomp_thread **affinity_thr = NULL;
324   bool force_display = false;
325 
326   thr = gomp_thread ();
327   nested = thr->ts.level;
328   pool = thr->thread_pool;
329   task = thr->task;
330   icv = task ? &task->icv : &gomp_global_icv;
331   if (__builtin_expect (gomp_places_list != NULL, 0) && thr->place == 0)
332     {
333       gomp_init_affinity ();
334       if (__builtin_expect (gomp_display_affinity_var, 0) && nthreads == 1)
335 	gomp_display_affinity_thread (gomp_thread_self (), &thr->ts,
336 				      thr->place);
337     }
338 
339   /* Always save the previous state, even if this isn't a nested team.
340      In particular, we should save any work share state from an outer
341      orphaned work share construct.  */
342   team->prev_ts = thr->ts;
343 
344   thr->ts.team = team;
345   thr->ts.team_id = 0;
346   ++thr->ts.level;
347   if (nthreads > 1)
348     ++thr->ts.active_level;
349   thr->ts.work_share = &team->work_shares[0];
350   thr->ts.last_work_share = NULL;
351 #ifdef HAVE_SYNC_BUILTINS
352   thr->ts.single_count = 0;
353 #endif
354   thr->ts.static_trip = 0;
355   thr->task = &team->implicit_task[0];
356 #ifdef GOMP_NEEDS_THREAD_HANDLE
357   thr->handle = pthread_self ();
358 #endif
359   nthreads_var = icv->nthreads_var;
360   if (__builtin_expect (gomp_nthreads_var_list != NULL, 0)
361       && thr->ts.level < gomp_nthreads_var_list_len)
362     nthreads_var = gomp_nthreads_var_list[thr->ts.level];
363   bind_var = icv->bind_var;
364   if (bind_var != omp_proc_bind_false && (flags & 7) != omp_proc_bind_false)
365     bind_var = flags & 7;
366   bind = bind_var;
367   if (__builtin_expect (gomp_bind_var_list != NULL, 0)
368       && thr->ts.level < gomp_bind_var_list_len)
369     bind_var = gomp_bind_var_list[thr->ts.level];
370   gomp_init_task (thr->task, task, icv);
371   thr->task->taskgroup = taskgroup;
372   team->implicit_task[0].icv.nthreads_var = nthreads_var;
373   team->implicit_task[0].icv.bind_var = bind_var;
374 
375   if (nthreads == 1)
376     return;
377 
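  /* Thread 0 (this thread) is fully set up; the remaining nthreads - 1
     threads are either woken from the pool below (non-nested case) or
     created with pthread_create.  */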
378   i = 1;
379 
380   if (__builtin_expect (gomp_places_list != NULL, 0))
381     {
382       /* Depending on the chosen proc_bind model, set the subpartition
383 	 for the master thread and initialize the helper variables
384 	 P and optionally S, K and/or REST used by the later place
385 	 computation for each additional thread.  */
386       p = thr->place - 1;
387       switch (bind)
388 	{
389 	case omp_proc_bind_true:
390 	case omp_proc_bind_close:
391 	  if (nthreads > thr->ts.place_partition_len)
392 	    {
393 	      /* T > P.  S threads will be placed in each place,
394 		 and the final REST threads will be placed one by one
395 		 into the already occupied places.  */
396 	      s = nthreads / thr->ts.place_partition_len;
397 	      rest = nthreads % thr->ts.place_partition_len;
398 	    }
399 	  else
400 	    s = 1;
401 	  k = 1;
402 	  break;
403 	case omp_proc_bind_master:
404 	  /* Each thread will be bound to the master's place.  */
405 	  break;
406 	case omp_proc_bind_spread:
407 	  if (nthreads <= thr->ts.place_partition_len)
408 	    {
409 	      /* T <= P.  Each subpartition will have between s
410 		 and s+1 places (subpartitions starting at or
411 		 after rest will have s places, earlier s+1 places),
412 		 each thread will be bound to the first place in
413 		 its subpartition (except for the master thread
414 		 that can be bound to another place in its
415 		 subpartition).  */
416 	      s = thr->ts.place_partition_len / nthreads;
417 	      rest = thr->ts.place_partition_len % nthreads;
418 	      rest = (s + 1) * rest + thr->ts.place_partition_off;
419 	      if (p < rest)
420 		{
421 		  p -= (p - thr->ts.place_partition_off) % (s + 1);
422 		  thr->ts.place_partition_len = s + 1;
423 		}
424 	      else
425 		{
426 		  p -= (p - rest) % s;
427 		  thr->ts.place_partition_len = s;
428 		}
429 	      thr->ts.place_partition_off = p;
430 	    }
431 	  else
432 	    {
433 	      /* T > P.  Each subpartition will have just a single
434 		 place and we'll place between s and s+1
435 		 threads into each subpartition.  */
436 	      s = nthreads / thr->ts.place_partition_len;
437 	      rest = nthreads % thr->ts.place_partition_len;
438 	      thr->ts.place_partition_off = p;
439 	      thr->ts.place_partition_len = 1;
440 	      k = 1;
441 	    }
442 	  break;
443 	}
444     }
445   else
446     bind = omp_proc_bind_false;
447 
448   /* We only allow the reuse of idle threads for non-nested PARALLEL
449      regions.  This appears to be implied by the semantics of
450      threadprivate variables, but perhaps that's reading too much into
451      things.  Certainly it does prevent any locking problems, since
452      only the initial program thread will modify gomp_threads.  */
453   if (!nested)
454     {
455       old_threads_used = pool->threads_used;
456 
457       if (nthreads <= old_threads_used)
458 	n = nthreads;
459       else if (old_threads_used == 0)
460 	{
461 	  n = 0;
462 	  gomp_simple_barrier_init (&pool->threads_dock, nthreads);
463 	}
464       else
465 	{
466 	  n = old_threads_used;
467 
468 	  /* Increase the barrier threshold to make sure all new
469 	     threads arrive before the team is released.  */
470 	  gomp_simple_barrier_reinit (&pool->threads_dock, nthreads);
471 	}
472 
473       /* Not true yet, but soon will be.  We're going to release all
474 	 threads from the dock, and those that aren't part of the
475 	 team will exit.  */
476       pool->threads_used = nthreads;
477 
478       /* If necessary, expand the size of the gomp_threads array.  It is
479 	 expected that changes in the number of threads are rare, thus we
480 	 make no effort to expand gomp_threads_size geometrically.  */
481       if (nthreads >= pool->threads_size)
482 	{
483 	  pool->threads_size = nthreads + 1;
484 	  pool->threads
485 	    = gomp_realloc (pool->threads,
486 			    pool->threads_size
487 			    * sizeof (struct gomp_thread *));
488 	  /* Add current (master) thread to threads[].  */
489 	  pool->threads[0] = thr;
490 	}
491 
492       /* Release existing idle threads.  */
493       for (; i < n; ++i)
494 	{
495 	  unsigned int place_partition_off = thr->ts.place_partition_off;
496 	  unsigned int place_partition_len = thr->ts.place_partition_len;
497 	  unsigned int place = 0;
498 	  if (__builtin_expect (gomp_places_list != NULL, 0))
499 	    {
500 	      switch (bind)
501 		{
502 		case omp_proc_bind_true:
503 		case omp_proc_bind_close:
504 		  if (k == s)
505 		    {
506 		      ++p;
507 		      if (p == (team->prev_ts.place_partition_off
508 				+ team->prev_ts.place_partition_len))
509 			p = team->prev_ts.place_partition_off;
510 		      k = 1;
511 		      if (i == nthreads - rest)
512 			s = 1;
513 		    }
514 		  else
515 		    ++k;
516 		  break;
517 		case omp_proc_bind_master:
518 		  break;
519 		case omp_proc_bind_spread:
520 		  if (k == 0)
521 		    {
522 		      /* T <= P.  */
523 		      if (p < rest)
524 			p += s + 1;
525 		      else
526 			p += s;
527 		      if (p == (team->prev_ts.place_partition_off
528 				+ team->prev_ts.place_partition_len))
529 			p = team->prev_ts.place_partition_off;
530 		      place_partition_off = p;
531 		      if (p < rest)
532 			place_partition_len = s + 1;
533 		      else
534 			place_partition_len = s;
535 		    }
536 		  else
537 		    {
538 		      /* T > P.  */
539 		      if (k == s)
540 			{
541 			  ++p;
542 			  if (p == (team->prev_ts.place_partition_off
543 				    + team->prev_ts.place_partition_len))
544 			    p = team->prev_ts.place_partition_off;
545 			  k = 1;
546 			  if (i == nthreads - rest)
547 			    s = 1;
548 			}
549 		      else
550 			++k;
551 		      place_partition_off = p;
552 		      place_partition_len = 1;
553 		    }
554 		  break;
555 		}
556 	      if (affinity_thr != NULL
557 		  || (bind != omp_proc_bind_true
558 		      && pool->threads[i]->place != p + 1)
559 		  || pool->threads[i]->place <= place_partition_off
560 		  || pool->threads[i]->place > (place_partition_off
561 						+ place_partition_len))
562 		{
563 		  unsigned int l;
564 		  force_display = true;
565 		  if (affinity_thr == NULL)
566 		    {
567 		      unsigned int j;
568 
569 		      if (team->prev_ts.place_partition_len > 64)
570 			affinity_thr
571 			  = gomp_malloc (team->prev_ts.place_partition_len
572 					 * sizeof (struct gomp_thread *));
573 		      else
574 			affinity_thr
575 			  = gomp_alloca (team->prev_ts.place_partition_len
576 					 * sizeof (struct gomp_thread *));
577 		      memset (affinity_thr, '\0',
578 			      team->prev_ts.place_partition_len
579 			      * sizeof (struct gomp_thread *));
580 		      for (j = i; j < old_threads_used; j++)
581 			{
582 			  if (pool->threads[j]->place
583 			      > team->prev_ts.place_partition_off
584 			      && (pool->threads[j]->place
585 				  <= (team->prev_ts.place_partition_off
586 				      + team->prev_ts.place_partition_len)))
587 			    {
588 			      l = pool->threads[j]->place - 1
589 				  - team->prev_ts.place_partition_off;
590 			      pool->threads[j]->data = affinity_thr[l];
591 			      affinity_thr[l] = pool->threads[j];
592 			    }
593 			  pool->threads[j] = NULL;
594 			}
595 		      if (nthreads > old_threads_used)
596 			memset (&pool->threads[old_threads_used],
597 				'\0', ((nthreads - old_threads_used)
598 				       * sizeof (struct gomp_thread *)));
599 		      n = nthreads;
600 		      affinity_count = old_threads_used - i;
601 		    }
602 		  if (affinity_count == 0)
603 		    break;
604 		  l = p;
605 		  if (affinity_thr[l - team->prev_ts.place_partition_off]
606 		      == NULL)
607 		    {
608 		      if (bind != omp_proc_bind_true)
609 			continue;
610 		      for (l = place_partition_off;
611 			   l < place_partition_off + place_partition_len;
612 			   l++)
613 			if (affinity_thr[l - team->prev_ts.place_partition_off]
614 			    != NULL)
615 			  break;
616 		      if (l == place_partition_off + place_partition_len)
617 			continue;
618 		    }
619 		  nthr = affinity_thr[l - team->prev_ts.place_partition_off];
620 		  affinity_thr[l - team->prev_ts.place_partition_off]
621 		    = (struct gomp_thread *) nthr->data;
622 		  affinity_count--;
623 		  pool->threads[i] = nthr;
624 		}
625 	      else
626 		nthr = pool->threads[i];
627 	      place = p + 1;
628 	    }
629 	  else
630 	    nthr = pool->threads[i];
631 	  nthr->ts.team = team;
632 	  nthr->ts.work_share = &team->work_shares[0];
633 	  nthr->ts.last_work_share = NULL;
634 	  nthr->ts.team_id = i;
635 	  nthr->ts.level = team->prev_ts.level + 1;
636 	  nthr->ts.active_level = thr->ts.active_level;
637 	  nthr->ts.place_partition_off = place_partition_off;
638 	  nthr->ts.place_partition_len = place_partition_len;
639 #ifdef HAVE_SYNC_BUILTINS
640 	  nthr->ts.single_count = 0;
641 #endif
642 	  nthr->ts.static_trip = 0;
643 	  nthr->task = &team->implicit_task[i];
644 	  nthr->place = place;
645 	  gomp_init_task (nthr->task, task, icv);
646 	  team->implicit_task[i].icv.nthreads_var = nthreads_var;
647 	  team->implicit_task[i].icv.bind_var = bind_var;
648 	  nthr->task->taskgroup = taskgroup;
649 	  nthr->fn = fn;
650 	  nthr->data = data;
651 	  team->ordered_release[i] = &nthr->release;
652 	}
653 
654       if (__builtin_expect (affinity_thr != NULL, 0))
655 	{
656 	  /* If AFFINITY_THR is non-NULL just because we had to
657 	     permute some threads in the pool, but we've managed
658 	     to find exactly as many old threads as we'd find
659 	     without affinity, we don't need to handle this
660 	     specially anymore.  */
661 	  if (nthreads <= old_threads_used
662 	      ? (affinity_count == old_threads_used - nthreads)
663 	      : (i == old_threads_used))
664 	    {
665 	      if (team->prev_ts.place_partition_len > 64)
666 		free (affinity_thr);
667 	      affinity_thr = NULL;
668 	      affinity_count = 0;
669 	    }
670 	  else
671 	    {
672 	      i = 1;
673 	      /* We are going to compute the places/subpartitions
674 		 again from the beginning.  So, we need to reinitialize
675 		 the variables modified by the switch (bind) above inside
676 		 the loop, to the state they had after the initial
677 		 switch (bind).  */
678 	      switch (bind)
679 		{
680 		case omp_proc_bind_true:
681 		case omp_proc_bind_close:
682 		  if (nthreads > thr->ts.place_partition_len)
683 		    /* T > P.  S has been changed, so it needs
684 		       to be recomputed.  */
685 		    s = nthreads / thr->ts.place_partition_len;
686 		  k = 1;
687 		  p = thr->place - 1;
688 		  break;
689 		case omp_proc_bind_master:
690 		  /* No vars have been changed.  */
691 		  break;
692 		case omp_proc_bind_spread:
693 		  p = thr->ts.place_partition_off;
694 		  if (k != 0)
695 		    {
696 		      /* T > P.  */
697 		      s = nthreads / team->prev_ts.place_partition_len;
698 		      k = 1;
699 		    }
700 		  break;
701 		}
702 
703 	      /* Increase the barrier threshold to make sure all new
704 		 threads and all the threads we're going to let die
705 		 arrive before the team is released.  */
706 	      if (affinity_count)
707 		gomp_simple_barrier_reinit (&pool->threads_dock,
708 					    nthreads + affinity_count);
709 	    }
710 	}
711 
712       if (i == nthreads)
713 	goto do_release;
714 
715     }
716 
717   if (__builtin_expect (nthreads + affinity_count > old_threads_used, 0))
718     {
719       long diff = (long) (nthreads + affinity_count) - (long) old_threads_used;
720 
721       if (old_threads_used == 0)
722 	--diff;
723 
724 #ifdef HAVE_SYNC_BUILTINS
725       __sync_fetch_and_add (&gomp_managed_threads, diff);
726 #else
727       gomp_mutex_lock (&gomp_managed_threads_lock);
728       gomp_managed_threads += diff;
729       gomp_mutex_unlock (&gomp_managed_threads_lock);
730 #endif
731     }
732 
733   attr = &gomp_thread_attr;
734   if (__builtin_expect (gomp_places_list != NULL, 0))
735     {
736       size_t stacksize;
737       pthread_attr_init (&thread_attr);
738       if (! pthread_attr_getstacksize (&gomp_thread_attr, &stacksize))
739 	pthread_attr_setstacksize (&thread_attr, stacksize);
740       attr = &thread_attr;
741     }
742 
743   start_data = gomp_alloca (sizeof (struct gomp_thread_start_data)
744 			    * (nthreads - i));
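  /* One start_data slot for each thread still to be created.  The slots live
     on this thread's stack; that is safe because every new thread copies what
     it needs in gomp_thread_start before passing the barrier that this thread
     waits on at do_release below.  */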
745 
746   /* Launch new threads.  */
747   for (; i < nthreads; ++i)
748     {
749       int err;
750 
751       start_data->ts.place_partition_off = thr->ts.place_partition_off;
752       start_data->ts.place_partition_len = thr->ts.place_partition_len;
753       start_data->place = 0;
754       if (__builtin_expect (gomp_places_list != NULL, 0))
755 	{
756 	  switch (bind)
757 	    {
758 	    case omp_proc_bind_true:
759 	    case omp_proc_bind_close:
760 	      if (k == s)
761 		{
762 		  ++p;
763 		  if (p == (team->prev_ts.place_partition_off
764 			    + team->prev_ts.place_partition_len))
765 		    p = team->prev_ts.place_partition_off;
766 		  k = 1;
767 		  if (i == nthreads - rest)
768 		    s = 1;
769 		}
770 	      else
771 		++k;
772 	      break;
773 	    case omp_proc_bind_master:
774 	      break;
775 	    case omp_proc_bind_spread:
776 	      if (k == 0)
777 		{
778 		  /* T <= P.  */
779 		  if (p < rest)
780 		    p += s + 1;
781 		  else
782 		    p += s;
783 		  if (p == (team->prev_ts.place_partition_off
784 			    + team->prev_ts.place_partition_len))
785 		    p = team->prev_ts.place_partition_off;
786 		  start_data->ts.place_partition_off = p;
787 		  if (p < rest)
788 		    start_data->ts.place_partition_len = s + 1;
789 		  else
790 		    start_data->ts.place_partition_len = s;
791 		}
792 	      else
793 		{
794 		  /* T > P.  */
795 		  if (k == s)
796 		    {
797 		      ++p;
798 		      if (p == (team->prev_ts.place_partition_off
799 				+ team->prev_ts.place_partition_len))
800 			p = team->prev_ts.place_partition_off;
801 		      k = 1;
802 		      if (i == nthreads - rest)
803 			s = 1;
804 		    }
805 		  else
806 		    ++k;
807 		  start_data->ts.place_partition_off = p;
808 		  start_data->ts.place_partition_len = 1;
809 		}
810 	      break;
811 	    }
812 	  start_data->place = p + 1;
813 	  if (affinity_thr != NULL && pool->threads[i] != NULL)
814 	    continue;
815 	  gomp_init_thread_affinity (attr, p);
816 	}
817 
818       start_data->fn = fn;
819       start_data->fn_data = data;
820       start_data->ts.team = team;
821       start_data->ts.work_share = &team->work_shares[0];
822       start_data->ts.last_work_share = NULL;
823       start_data->ts.team_id = i;
824       start_data->ts.level = team->prev_ts.level + 1;
825       start_data->ts.active_level = thr->ts.active_level;
826 #ifdef HAVE_SYNC_BUILTINS
827       start_data->ts.single_count = 0;
828 #endif
829       start_data->ts.static_trip = 0;
830       start_data->task = &team->implicit_task[i];
831       gomp_init_task (start_data->task, task, icv);
832       team->implicit_task[i].icv.nthreads_var = nthreads_var;
833       team->implicit_task[i].icv.bind_var = bind_var;
834       start_data->task->taskgroup = taskgroup;
835       start_data->thread_pool = pool;
836       start_data->nested = nested;
837 
838       attr = gomp_adjust_thread_attr (attr, &thread_attr);
839       err = pthread_create (&start_data->handle, attr, gomp_thread_start,
840 			    start_data);
841       start_data++;
842       if (err != 0)
843 	gomp_fatal ("Thread creation failed: %s", strerror (err));
844     }
845 
846   if (__builtin_expect (attr == &thread_attr, 0))
847     pthread_attr_destroy (&thread_attr);
848 
849  do_release:
850   if (nested)
851     gomp_barrier_wait (&team->barrier);
852   else
853     gomp_simple_barrier_wait (&pool->threads_dock);
854 
855   /* Decrease the barrier threshold to match the number of threads
856      that should arrive back at the end of this team.  The extra
857      threads should be exiting.  Note that we arrange for this test
858      to never be true for nested teams.  If AFFINITY_COUNT is non-zero,
859      the barrier as well as gomp_managed_threads was temporarily
860      set to NTHREADS + AFFINITY_COUNT.  For NTHREADS < OLD_THREADS_USED,
861      AFFINITY_COUNT, if non-zero, will always be at least
862      OLD_THREADS_USED - NTHREADS.  */
863   if (__builtin_expect (nthreads < old_threads_used, 0)
864       || __builtin_expect (affinity_count, 0))
865     {
866       long diff = (long) nthreads - (long) old_threads_used;
867 
868       if (affinity_count)
869 	diff = -affinity_count;
870 
871       gomp_simple_barrier_reinit (&pool->threads_dock, nthreads);
872 
873 #ifdef HAVE_SYNC_BUILTINS
874       __sync_fetch_and_add (&gomp_managed_threads, diff);
875 #else
876       gomp_mutex_lock (&gomp_managed_threads_lock);
877       gomp_managed_threads += diff;
878       gomp_mutex_unlock (&gomp_managed_threads_lock);
879 #endif
880     }
881   if (__builtin_expect (gomp_display_affinity_var, 0))
882     {
883       if (nested
884 	  || nthreads != old_threads_used
885 	  || force_display)
886 	{
887 	  gomp_display_affinity_thread (gomp_thread_self (), &thr->ts,
888 					thr->place);
889 	  if (nested)
890 	    {
891 	      start_data -= nthreads - 1;
892 	      for (i = 1; i < nthreads; ++i)
893 		{
894 		  gomp_display_affinity_thread (
895 #ifdef LIBGOMP_USE_PTHREADS
896 						start_data->handle,
897 #else
898 						gomp_thread_self (),
899 #endif
900 						&start_data->ts,
901 						start_data->place);
902 		  start_data++;
903 		}
904 	    }
905 	  else
906 	    {
907 	      for (i = 1; i < nthreads; ++i)
908 		{
909 		  gomp_thread_handle handle
910 		    = gomp_thread_to_pthread_t (pool->threads[i]);
911 		  gomp_display_affinity_thread (handle, &pool->threads[i]->ts,
912 						pool->threads[i]->place);
913 		}
914 	    }
915 	}
916     }
917   if (__builtin_expect (affinity_thr != NULL, 0)
918       && team->prev_ts.place_partition_len > 64)
919     free (affinity_thr);
920 }
921 #endif
922 
923 
924 /* Terminate the current team.  This is only to be called by the master
925    thread.  We assume that we must wait for the other threads.  */
926 
927 void
928 gomp_team_end (void)
929 {
930   struct gomp_thread *thr = gomp_thread ();
931   struct gomp_team *team = thr->ts.team;
932 
933   /* This barrier handles all pending explicit threads.
934      Because #pragma omp cancel parallel might leave the awaited count in
935      team->barrier in an inconsistent state, we need to use a different
936      counter here.  */
937   gomp_team_barrier_wait_final (&team->barrier);
938   if (__builtin_expect (team->team_cancelled, 0))
939     {
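      /* The team was cancelled, so some of its work shares may have been
	 abandoned before being finished; walk the list and finalize each of
	 them here.  */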
940       struct gomp_work_share *ws = team->work_shares_to_free;
941       do
942 	{
943 	  struct gomp_work_share *next_ws = gomp_ptrlock_get (&ws->next_ws);
944 	  if (next_ws == NULL)
945 	    gomp_ptrlock_set (&ws->next_ws, ws);
946 	  gomp_fini_work_share (ws);
947 	  ws = next_ws;
948 	}
949       while (ws != NULL);
950     }
951   else
952     gomp_fini_work_share (thr->ts.work_share);
953 
954   gomp_end_task ();
955   thr->ts = team->prev_ts;
956 
957   if (__builtin_expect (thr->ts.level != 0, 0))
958     {
959 #ifdef HAVE_SYNC_BUILTINS
960       __sync_fetch_and_add (&gomp_managed_threads, 1L - team->nthreads);
961 #else
962       gomp_mutex_lock (&gomp_managed_threads_lock);
963       gomp_managed_threads -= team->nthreads - 1L;
964       gomp_mutex_unlock (&gomp_managed_threads_lock);
965 #endif
966       /* This barrier has gomp_barrier_wait_last counterparts
967 	 and ensures the team can be safely destroyed.  */
968       gomp_barrier_wait (&team->barrier);
969     }
970 
971   if (__builtin_expect (team->work_shares[0].next_alloc != NULL, 0))
972     {
973       struct gomp_work_share *ws = team->work_shares[0].next_alloc;
974       do
975 	{
976 	  struct gomp_work_share *next_ws = ws->next_alloc;
977 	  free (ws);
978 	  ws = next_ws;
979 	}
980       while (ws != NULL);
981     }
982   gomp_sem_destroy (&team->master_release);
983 
984   if (__builtin_expect (thr->ts.team != NULL, 0)
985       || __builtin_expect (team->nthreads == 1, 0))
986     free_team (team);
987   else
988     {
989       struct gomp_thread_pool *pool = thr->thread_pool;
990       if (pool->last_team)
991 	free_team (pool->last_team);
992       pool->last_team = team;
993       gomp_release_thread_pool (pool);
994     }
995 }
996 
997 #ifdef LIBGOMP_USE_PTHREADS
998 
999 /* Constructors for this file.  */
1000 
1001 static void __attribute__((constructor))
1002 initialize_team (void)
1003 {
1004 #if !defined HAVE_TLS && !defined USE_EMUTLS
1005   static struct gomp_thread initial_thread_tls_data;
1006 
1007   pthread_key_create (&gomp_tls_key, NULL);
1008   pthread_setspecific (gomp_tls_key, &initial_thread_tls_data);
1009 #endif
1010 
1011   if (pthread_key_create (&gomp_thread_destructor, gomp_free_thread) != 0)
1012     gomp_fatal ("could not create thread pool destructor.");
1013 }
1014 
1015 static void __attribute__((destructor))
1016 team_destructor (void)
1017 {
1018   /* Without this, dlclose on libgomp could lead to subsequent
1019      crashes.  */
1020   pthread_key_delete (gomp_thread_destructor);
1021 }
1022 
1023 /* Similar to gomp_free_pool_helper, but doesn't detach the thread;
1024    gomp_pause_host will pthread_join those threads instead.  */
1025 
1026 static void
1027 gomp_pause_pool_helper (void *thread_pool)
1028 {
1029   struct gomp_thread *thr = gomp_thread ();
1030   struct gomp_thread_pool *pool
1031     = (struct gomp_thread_pool *) thread_pool;
1032   gomp_simple_barrier_wait_last (&pool->threads_dock);
1033   gomp_sem_destroy (&thr->release);
1034   thr->thread_pool = NULL;
1035   thr->task = NULL;
1036   pthread_exit (NULL);
1037 }
1038 
1039 /* Free a thread pool and release its threads.  Return non-zero on
1040    failure.  */
1041 
1042 int
1043 gomp_pause_host (void)
1044 {
1045   struct gomp_thread *thr = gomp_thread ();
1046   struct gomp_thread_pool *pool = thr->thread_pool;
1047   if (thr->ts.level)
1048     return -1;
1049   if (pool)
1050     {
1051       if (pool->threads_used > 0)
1052 	{
1053 	  int i;
1054 	  pthread_t *thrs
1055 	    = gomp_alloca (sizeof (pthread_t) * pool->threads_used);
1056 	  for (i = 1; i < pool->threads_used; i++)
1057 	    {
1058 	      struct gomp_thread *nthr = pool->threads[i];
1059 	      nthr->fn = gomp_pause_pool_helper;
1060 	      nthr->data = pool;
1061 	      thrs[i] = gomp_thread_to_pthread_t (nthr);
1062 	    }
1063 	  /* This barrier undocks threads docked on pool->threads_dock.  */
1064 	  gomp_simple_barrier_wait (&pool->threads_dock);
1065 	  /* And this waits until all threads have called
1066 	     gomp_simple_barrier_wait_last in gomp_pause_pool_helper.  */
1067 	  gomp_simple_barrier_wait (&pool->threads_dock);
1068 	  /* Now it is safe to destroy the barrier and free the pool.  */
1069 	  gomp_simple_barrier_destroy (&pool->threads_dock);
1070 
1071 #ifdef HAVE_SYNC_BUILTINS
1072 	  __sync_fetch_and_add (&gomp_managed_threads,
1073 				1L - pool->threads_used);
1074 #else
1075 	  gomp_mutex_lock (&gomp_managed_threads_lock);
1076 	  gomp_managed_threads -= pool->threads_used - 1L;
1077 	  gomp_mutex_unlock (&gomp_managed_threads_lock);
1078 #endif
1079 	  for (i = 1; i < pool->threads_used; i++)
1080 	    pthread_join (thrs[i], NULL);
1081 	}
1082       if (pool->last_team)
1083 	free_team (pool->last_team);
1084 #ifndef __nvptx__
1085       team_free (pool->threads);
1086       team_free (pool);
1087 #endif
1088       thr->thread_pool = NULL;
1089     }
1090   return 0;
1091 }
1092 #endif
1093 
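/* Create a fresh implicit task carrying a copy of the global ICVs for a
   thread that libgomp has not seen before, and register the thread destructor
   so that the per-thread state is cleaned up when the thread exits.  */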
1094 struct gomp_task_icv *
1095 gomp_new_icv (void)
1096 {
1097   struct gomp_thread *thr = gomp_thread ();
1098   struct gomp_task *task = gomp_malloc (sizeof (struct gomp_task));
1099   gomp_init_task (task, NULL, &gomp_global_icv);
1100   thr->task = task;
1101 #ifdef LIBGOMP_USE_PTHREADS
1102   pthread_setspecific (gomp_thread_destructor, thr);
1103 #endif
1104   return &task->icv;
1105 }
1106