xref: /netbsd-src/external/gpl3/gcc.old/dist/libgomp/team.c (revision bdc22b2e01993381dcefeff2bc9b56ca75a4235c)
1 /* Copyright (C) 2005-2015 Free Software Foundation, Inc.
2    Contributed by Richard Henderson <rth@redhat.com>.
3 
4    This file is part of the GNU Offloading and Multi Processing Library
5    (libgomp).
6 
7    Libgomp is free software; you can redistribute it and/or modify it
8    under the terms of the GNU General Public License as published by
9    the Free Software Foundation; either version 3, or (at your option)
10    any later version.
11 
12    Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
13    WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
14    FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
15    more details.
16 
17    Under Section 7 of GPL version 3, you are granted additional
18    permissions described in the GCC Runtime Library Exception, version
19    3.1, as published by the Free Software Foundation.
20 
21    You should have received a copy of the GNU General Public License and
22    a copy of the GCC Runtime Library Exception along with this program;
23    see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
24    <http://www.gnu.org/licenses/>.  */
25 
26 /* This file handles the maintenance of threads in response to team
27    creation and termination.  */
28 
29 #include "libgomp.h"
30 #include <stdlib.h>
31 #include <string.h>
32 
33 /* This attribute is used when creating threads; PTHREAD_CREATE_DETACHED is set on it.  */
34 pthread_attr_t gomp_thread_attr;
35 
36 /* This key is for the thread destructor.  */
37 pthread_key_t gomp_thread_destructor;
38 
39 
40 /* This is the libgomp per-thread data structure.  */
41 #if defined HAVE_TLS || defined USE_EMUTLS
42 __thread struct gomp_thread gomp_tls_data;
43 #else
44 pthread_key_t gomp_tls_key;
45 #endif
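
/* When neither native nor emulated TLS is available, gomp_thread () (see
   libgomp.h) falls back to pthread_getspecific (gomp_tls_key) to reach the
   per-thread data; the key is set by gomp_thread_start below and by
   initialize_team for the initial thread.  */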
46 
47 
48 /* This structure is used to communicate across pthread_create.  */
49 
50 struct gomp_thread_start_data
51 {
52   void (*fn) (void *);
53   void *fn_data;
54   struct gomp_team_state ts;
55   struct gomp_task *task;
56   struct gomp_thread_pool *thread_pool;
57   unsigned int place;
58   bool nested;
59 };
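
/* Instances of this structure live on the launching thread's stack (see the
   gomp_alloca call in gomp_team_start); gomp_thread_start copies out the
   fields it needs before waiting on the startup barrier, and the launching
   thread waits on that same barrier before returning, so the block stays
   valid long enough.  */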
60 
61 
62 /* This function is a pthread_create entry point.  This contains the idle
63    loop in which a thread waits to be called up to become part of a team.  */
64 
65 static void *
66 gomp_thread_start (void *xdata)
67 {
68   struct gomp_thread_start_data *data = xdata;
69   struct gomp_thread *thr;
70   struct gomp_thread_pool *pool;
71   void (*local_fn) (void *);
72   void *local_data;
73 
74 #if defined HAVE_TLS || defined USE_EMUTLS
75   thr = &gomp_tls_data;
76 #else
77   struct gomp_thread local_thr;
78   thr = &local_thr;
79   pthread_setspecific (gomp_tls_key, thr);
80 #endif
81   gomp_sem_init (&thr->release, 0);
82 
83   /* Extract what we need from data.  */
84   local_fn = data->fn;
85   local_data = data->fn_data;
86   thr->thread_pool = data->thread_pool;
87   thr->ts = data->ts;
88   thr->task = data->task;
89   thr->place = data->place;
90 
91   thr->ts.team->ordered_release[thr->ts.team_id] = &thr->release;
92 
93   /* Make thread pool local. */
94   pool = thr->thread_pool;
95 
96   if (data->nested)
97     {
98       struct gomp_team *team = thr->ts.team;
99       struct gomp_task *task = thr->task;
100 
101       gomp_barrier_wait (&team->barrier);
102 
103       local_fn (local_data);
104       gomp_team_barrier_wait_final (&team->barrier);
105       gomp_finish_task (task);
106       gomp_barrier_wait_last (&team->barrier);
107     }
108   else
109     {
110       pool->threads[thr->ts.team_id] = thr;
111 
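      /* Non-nested workers park on the pool's threads_dock barrier between
	 parallel regions.  Each time the master releases the dock, run the
	 fn/data it installed and dock again; exit once no new fn has been
	 installed, or run gomp_free_pool_helper and exit when the pool is
	 being torn down.  */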
112       gomp_barrier_wait (&pool->threads_dock);
113       do
114 	{
115 	  struct gomp_team *team = thr->ts.team;
116 	  struct gomp_task *task = thr->task;
117 
118 	  local_fn (local_data);
119 	  gomp_team_barrier_wait_final (&team->barrier);
120 	  gomp_finish_task (task);
121 
122 	  gomp_barrier_wait (&pool->threads_dock);
123 
124 	  local_fn = thr->fn;
125 	  local_data = thr->data;
126 	  thr->fn = NULL;
127 	}
128       while (local_fn);
129     }
130 
131   gomp_sem_destroy (&thr->release);
132   thr->thread_pool = NULL;
133   thr->task = NULL;
134   return NULL;
135 }
136 
137 
138 /* Create a new team data structure.  */
139 
140 struct gomp_team *
141 gomp_new_team (unsigned nthreads)
142 {
143   struct gomp_team *team;
144   size_t size;
145   int i;
146 
147   size = sizeof (*team) + nthreads * (sizeof (team->ordered_release[0])
148 				      + sizeof (team->implicit_task[0]));
149   team = gomp_malloc (size);
150 
151   team->work_share_chunk = 8;
152 #ifdef HAVE_SYNC_BUILTINS
153   team->single_count = 0;
154 #else
155   gomp_mutex_init (&team->work_share_list_free_lock);
156 #endif
157   team->work_shares_to_free = &team->work_shares[0];
158   gomp_init_work_share (&team->work_shares[0], false, nthreads);
159   team->work_shares[0].next_alloc = NULL;
160   team->work_share_list_free = NULL;
161   team->work_share_list_alloc = &team->work_shares[1];
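  /* struct gomp_team embeds its first chunk of work shares (work_shares[8],
     matching the initial work_share_chunk above); work_shares[0] is used
     immediately and the remaining seven are chained onto the allocation
     free list below.  Further work shares are malloc'ed in larger chunks
     (see work.c) once these are exhausted.  */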
162   for (i = 1; i < 7; i++)
163     team->work_shares[i].next_free = &team->work_shares[i + 1];
164   team->work_shares[i].next_free = NULL;
165 
166   team->nthreads = nthreads;
167   gomp_barrier_init (&team->barrier, nthreads);
168 
169   gomp_sem_init (&team->master_release, 0);
170   team->ordered_release = (void *) &team->implicit_task[nthreads];
171   team->ordered_release[0] = &team->master_release;
172 
173   gomp_mutex_init (&team->task_lock);
174   team->task_queue = NULL;
175   team->task_count = 0;
176   team->task_queued_count = 0;
177   team->task_running_count = 0;
178   team->work_share_cancelled = 0;
179   team->team_cancelled = 0;
180 
181   return team;
182 }
183 
184 
185 /* Free a team data structure.  */
186 
187 static void
188 free_team (struct gomp_team *team)
189 {
190   gomp_barrier_destroy (&team->barrier);
191   gomp_mutex_destroy (&team->task_lock);
192   free (team);
193 }
194 
195 /* Allocate and initialize a thread pool. */
196 
197 static struct gomp_thread_pool *gomp_new_thread_pool (void)
198 {
199   struct gomp_thread_pool *pool
200     = gomp_malloc (sizeof (struct gomp_thread_pool));
201   pool->threads = NULL;
202   pool->threads_size = 0;
203   pool->threads_used = 0;
204   pool->last_team = NULL;
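  /* The threads_dock barrier is left uninitialized here; gomp_team_start
     initializes it the first time worker threads are created for this
     pool.  */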
205   return pool;
206 }
207 
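/* This is installed as THR->fn on each docked worker by gomp_free_thread
   below: once released from the dock, the thread makes the final barrier
   arrival, tears down its per-thread state and exits.  */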
208 static void
209 gomp_free_pool_helper (void *thread_pool)
210 {
211   struct gomp_thread *thr = gomp_thread ();
212   struct gomp_thread_pool *pool
213     = (struct gomp_thread_pool *) thread_pool;
214   gomp_barrier_wait_last (&pool->threads_dock);
215   gomp_sem_destroy (&thr->release);
216   thr->thread_pool = NULL;
217   thr->task = NULL;
218   pthread_exit (NULL);
219 }
220 
221 /* Free a thread pool and release its threads. */
222 
223 void
224 gomp_free_thread (void *arg __attribute__((unused)))
225 {
226   struct gomp_thread *thr = gomp_thread ();
227   struct gomp_thread_pool *pool = thr->thread_pool;
228   if (pool)
229     {
230       if (pool->threads_used > 0)
231 	{
232 	  int i;
233 	  for (i = 1; i < pool->threads_used; i++)
234 	    {
235 	      struct gomp_thread *nthr = pool->threads[i];
236 	      nthr->fn = gomp_free_pool_helper;
237 	      nthr->data = pool;
238 	    }
239 	  /* This barrier undocks threads docked on pool->threads_dock.  */
240 	  gomp_barrier_wait (&pool->threads_dock);
241 	  /* And this waits till all threads have called gomp_barrier_wait_last
242 	     in gomp_free_pool_helper.  */
243 	  gomp_barrier_wait (&pool->threads_dock);
244 	  /* Now it is safe to destroy the barrier and free the pool.  */
245 	  gomp_barrier_destroy (&pool->threads_dock);
246 
247 #ifdef HAVE_SYNC_BUILTINS
248 	  __sync_fetch_and_add (&gomp_managed_threads,
249 				1L - pool->threads_used);
250 #else
251 	  gomp_mutex_lock (&gomp_managed_threads_lock);
252 	  gomp_managed_threads -= pool->threads_used - 1L;
253 	  gomp_mutex_unlock (&gomp_managed_threads_lock);
254 #endif
255 	}
256       free (pool->threads);
257       if (pool->last_team)
258 	free_team (pool->last_team);
259       free (pool);
260       thr->thread_pool = NULL;
261     }
262   if (thr->task != NULL)
263     {
264       struct gomp_task *task = thr->task;
265       gomp_end_task ();
266       free (task);
267     }
268 }
269 
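/* For orientation: the usual driver of gomp_new_team, gomp_team_start and
   gomp_team_end is the GOMP_parallel entry point in parallel.c.  Roughly
   (a simplified sketch, not a verbatim copy of that file):

     void
     GOMP_parallel (void (*fn) (void *), void *data, unsigned num_threads,
		    unsigned int flags)
     {
       num_threads = gomp_resolve_num_threads (num_threads, 0);
       gomp_team_start (fn, data, num_threads, flags,
			gomp_new_team (num_threads));
       fn (data);
       GOMP_parallel_end ();
     }

   The master runs FN itself; every other team member runs it from
   gomp_thread_start above.  */
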
270 /* Launch a team.  */
271 
272 void
273 gomp_team_start (void (*fn) (void *), void *data, unsigned nthreads,
274 		 unsigned flags, struct gomp_team *team)
275 {
276   struct gomp_thread_start_data *start_data;
277   struct gomp_thread *thr, *nthr;
278   struct gomp_task *task;
279   struct gomp_task_icv *icv;
280   bool nested;
281   struct gomp_thread_pool *pool;
282   unsigned i, n, old_threads_used = 0;
283   pthread_attr_t thread_attr, *attr;
284   unsigned long nthreads_var;
285   char bind, bind_var;
286   unsigned int s = 0, rest = 0, p = 0, k = 0;
287   unsigned int affinity_count = 0;
288   struct gomp_thread **affinity_thr = NULL;
289 
290   thr = gomp_thread ();
291   nested = thr->ts.team != NULL;
292   if (__builtin_expect (thr->thread_pool == NULL, 0))
293     {
294       thr->thread_pool = gomp_new_thread_pool ();
295       thr->thread_pool->threads_busy = nthreads;
296       pthread_setspecific (gomp_thread_destructor, thr);
297     }
298   pool = thr->thread_pool;
299   task = thr->task;
300   icv = task ? &task->icv : &gomp_global_icv;
301   if (__builtin_expect (gomp_places_list != NULL, 0) && thr->place == 0)
302     gomp_init_affinity ();
303 
304   /* Always save the previous state, even if this isn't a nested team.
305      In particular, we should save any work share state from an outer
306      orphaned work share construct.  */
307   team->prev_ts = thr->ts;
308 
309   thr->ts.team = team;
310   thr->ts.team_id = 0;
311   ++thr->ts.level;
312   if (nthreads > 1)
313     ++thr->ts.active_level;
314   thr->ts.work_share = &team->work_shares[0];
315   thr->ts.last_work_share = NULL;
316 #ifdef HAVE_SYNC_BUILTINS
317   thr->ts.single_count = 0;
318 #endif
319   thr->ts.static_trip = 0;
320   thr->task = &team->implicit_task[0];
321   nthreads_var = icv->nthreads_var;
322   if (__builtin_expect (gomp_nthreads_var_list != NULL, 0)
323       && thr->ts.level < gomp_nthreads_var_list_len)
324     nthreads_var = gomp_nthreads_var_list[thr->ts.level];
325   bind_var = icv->bind_var;
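  /* The low three bits of FLAGS carry the proc_bind clause value for this
     parallel region (an omp_proc_bind_* constant); it overrides the
     bind-var ICV unless either of them is omp_proc_bind_false.  */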
326   if (bind_var != omp_proc_bind_false && (flags & 7) != omp_proc_bind_false)
327     bind_var = flags & 7;
328   bind = bind_var;
329   if (__builtin_expect (gomp_bind_var_list != NULL, 0)
330       && thr->ts.level < gomp_bind_var_list_len)
331     bind_var = gomp_bind_var_list[thr->ts.level];
332   gomp_init_task (thr->task, task, icv);
333   team->implicit_task[0].icv.nthreads_var = nthreads_var;
334   team->implicit_task[0].icv.bind_var = bind_var;
335 
336   if (nthreads == 1)
337     return;
338 
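  /* The master occupies team slot 0; the remaining slots are filled below,
     first by reusing docked pool threads (non-nested teams only) and then
     by creating new threads.  */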
339   i = 1;
340 
341   if (__builtin_expect (gomp_places_list != NULL, 0))
342     {
343       /* Depending on chosen proc_bind model, set subpartition
344 	 for the master thread and initialize helper variables
345 	 P and optionally S, K and/or REST used by later place
346 	 computation for each additional thread.  */
347       p = thr->place - 1;
348       switch (bind)
349 	{
350 	case omp_proc_bind_true:
351 	case omp_proc_bind_close:
352 	  if (nthreads > thr->ts.place_partition_len)
353 	    {
354 	      /* T > P.  S threads will be placed in each place,
355 		 and the final REST threads placed one by one
356 		 into the already occupied places.  */
357 	      s = nthreads / thr->ts.place_partition_len;
358 	      rest = nthreads % thr->ts.place_partition_len;
359 	    }
360 	  else
361 	    s = 1;
362 	  k = 1;
363 	  break;
364 	case omp_proc_bind_master:
365 	  /* Each thread will be bound to master's place.  */
366 	  break;
367 	case omp_proc_bind_spread:
368 	  if (nthreads <= thr->ts.place_partition_len)
369 	    {
370 	      /* T <= P.  Each subpartition will have between s
371 		 and s+1 places (subpartitions starting at or
372 		 after rest will have s places, earlier s+1 places),
373 		 each thread will be bound to the first place in
374 		 its subpartition (except for the master thread
375 		 that can be bound to another place in its
376 		 subpartition).  */
377 	      s = thr->ts.place_partition_len / nthreads;
378 	      rest = thr->ts.place_partition_len % nthreads;
379 	      rest = (s + 1) * rest + thr->ts.place_partition_off;
380 	      if (p < rest)
381 		{
382 		  p -= (p - thr->ts.place_partition_off) % (s + 1);
383 		  thr->ts.place_partition_len = s + 1;
384 		}
385 	      else
386 		{
387 		  p -= (p - rest) % s;
388 		  thr->ts.place_partition_len = s;
389 		}
390 	      thr->ts.place_partition_off = p;
391 	    }
392 	  else
393 	    {
394 	      /* T > P.  Each subpartition will have just a single
395 		 place and we'll place between s and s+1
396 		 threads into each subpartition.  */
397 	      s = nthreads / thr->ts.place_partition_len;
398 	      rest = nthreads % thr->ts.place_partition_len;
399 	      thr->ts.place_partition_off = p;
400 	      thr->ts.place_partition_len = 1;
401 	      k = 1;
402 	    }
403 	  break;
404 	}
405     }
406   else
407     bind = omp_proc_bind_false;
408 
409   /* We only allow the reuse of idle threads for non-nested PARALLEL
410      regions.  This appears to be implied by the semantics of
411      threadprivate variables, but perhaps that's reading too much into
412      things.  Certainly it does prevent any locking problems, since
413      only the initial program thread will modify gomp_threads.  */
414   if (!nested)
415     {
416       old_threads_used = pool->threads_used;
417 
418       if (nthreads <= old_threads_used)
419 	n = nthreads;
420       else if (old_threads_used == 0)
421 	{
422 	  n = 0;
423 	  gomp_barrier_init (&pool->threads_dock, nthreads);
424 	}
425       else
426 	{
427 	  n = old_threads_used;
428 
429 	  /* Increase the barrier threshold to make sure all new
430 	     threads arrive before the team is released.  */
431 	  gomp_barrier_reinit (&pool->threads_dock, nthreads);
432 	}
433 
434       /* Not true yet, but soon will be.  We're going to release all
435 	 threads from the dock, and those that aren't part of the
436 	 team will exit.  */
437       pool->threads_used = nthreads;
438 
439       /* If necessary, expand the size of the pool->threads array.  It is
440 	 expected that changes in the number of threads are rare, thus we
441 	 make no effort to expand pool->threads_size geometrically.  */
442       if (nthreads >= pool->threads_size)
443 	{
444 	  pool->threads_size = nthreads + 1;
445 	  pool->threads
446 	    = gomp_realloc (pool->threads,
447 			    pool->threads_size
448 			    * sizeof (struct gomp_thread *));
449 	}
450 
451       /* Release existing idle threads.  */
452       for (; i < n; ++i)
453 	{
454 	  unsigned int place_partition_off = thr->ts.place_partition_off;
455 	  unsigned int place_partition_len = thr->ts.place_partition_len;
456 	  unsigned int place = 0;
457 	  if (__builtin_expect (gomp_places_list != NULL, 0))
458 	    {
459 	      switch (bind)
460 		{
461 		case omp_proc_bind_true:
462 		case omp_proc_bind_close:
463 		  if (k == s)
464 		    {
465 		      ++p;
466 		      if (p == (team->prev_ts.place_partition_off
467 				+ team->prev_ts.place_partition_len))
468 			p = team->prev_ts.place_partition_off;
469 		      k = 1;
470 		      if (i == nthreads - rest)
471 			s = 1;
472 		    }
473 		  else
474 		    ++k;
475 		  break;
476 		case omp_proc_bind_master:
477 		  break;
478 		case omp_proc_bind_spread:
479 		  if (k == 0)
480 		    {
481 		      /* T <= P.  */
482 		      if (p < rest)
483 			p += s + 1;
484 		      else
485 			p += s;
486 		      if (p == (team->prev_ts.place_partition_off
487 				+ team->prev_ts.place_partition_len))
488 			p = team->prev_ts.place_partition_off;
489 		      place_partition_off = p;
490 		      if (p < rest)
491 			place_partition_len = s + 1;
492 		      else
493 			place_partition_len = s;
494 		    }
495 		  else
496 		    {
497 		      /* T > P.  */
498 		      if (k == s)
499 			{
500 			  ++p;
501 			  if (p == (team->prev_ts.place_partition_off
502 				    + team->prev_ts.place_partition_len))
503 			    p = team->prev_ts.place_partition_off;
504 			  k = 1;
505 			  if (i == nthreads - rest)
506 			    s = 1;
507 			}
508 		      else
509 			++k;
510 		      place_partition_off = p;
511 		      place_partition_len = 1;
512 		    }
513 		  break;
514 		}
515 	      if (affinity_thr != NULL
516 		  || (bind != omp_proc_bind_true
517 		      && pool->threads[i]->place != p + 1)
518 		  || pool->threads[i]->place <= place_partition_off
519 		  || pool->threads[i]->place > (place_partition_off
520 						+ place_partition_len))
521 		{
522 		  unsigned int l;
523 		  if (affinity_thr == NULL)
524 		    {
525 		      unsigned int j;
526 
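		      /* First mismatch: bucket the remaining old pool
			 threads by the place they are already bound to.
			 affinity_thr[place - partition offset] heads a
			 list chained through the threads' data fields, so
			 the code below can hand each wanted place a thread
			 that is already running there.  */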
527 		      if (team->prev_ts.place_partition_len > 64)
528 			affinity_thr
529 			  = gomp_malloc (team->prev_ts.place_partition_len
530 					 * sizeof (struct gomp_thread *));
531 		      else
532 			affinity_thr
533 			  = gomp_alloca (team->prev_ts.place_partition_len
534 					 * sizeof (struct gomp_thread *));
535 		      memset (affinity_thr, '\0',
536 			      team->prev_ts.place_partition_len
537 			      * sizeof (struct gomp_thread *));
538 		      for (j = i; j < old_threads_used; j++)
539 			{
540 			  if (pool->threads[j]->place
541 			      > team->prev_ts.place_partition_off
542 			      && (pool->threads[j]->place
543 				  <= (team->prev_ts.place_partition_off
544 				      + team->prev_ts.place_partition_len)))
545 			    {
546 			      l = pool->threads[j]->place - 1
547 				  - team->prev_ts.place_partition_off;
548 			      pool->threads[j]->data = affinity_thr[l];
549 			      affinity_thr[l] = pool->threads[j];
550 			    }
551 			  pool->threads[j] = NULL;
552 			}
553 		      if (nthreads > old_threads_used)
554 			memset (&pool->threads[old_threads_used],
555 				'\0', ((nthreads - old_threads_used)
556 				       * sizeof (struct gomp_thread *)));
557 		      n = nthreads;
558 		      affinity_count = old_threads_used - i;
559 		    }
560 		  if (affinity_count == 0)
561 		    break;
562 		  l = p;
563 		  if (affinity_thr[l - team->prev_ts.place_partition_off]
564 		      == NULL)
565 		    {
566 		      if (bind != omp_proc_bind_true)
567 			continue;
568 		      for (l = place_partition_off;
569 			   l < place_partition_off + place_partition_len;
570 			   l++)
571 			if (affinity_thr[l - team->prev_ts.place_partition_off]
572 			    != NULL)
573 			  break;
574 		      if (l == place_partition_off + place_partition_len)
575 			continue;
576 		    }
577 		  nthr = affinity_thr[l - team->prev_ts.place_partition_off];
578 		  affinity_thr[l - team->prev_ts.place_partition_off]
579 		    = (struct gomp_thread *) nthr->data;
580 		  affinity_count--;
581 		  pool->threads[i] = nthr;
582 		}
583 	      else
584 		nthr = pool->threads[i];
585 	      place = p + 1;
586 	    }
587 	  else
588 	    nthr = pool->threads[i];
589 	  nthr->ts.team = team;
590 	  nthr->ts.work_share = &team->work_shares[0];
591 	  nthr->ts.last_work_share = NULL;
592 	  nthr->ts.team_id = i;
593 	  nthr->ts.level = team->prev_ts.level + 1;
594 	  nthr->ts.active_level = thr->ts.active_level;
595 	  nthr->ts.place_partition_off = place_partition_off;
596 	  nthr->ts.place_partition_len = place_partition_len;
597 #ifdef HAVE_SYNC_BUILTINS
598 	  nthr->ts.single_count = 0;
599 #endif
600 	  nthr->ts.static_trip = 0;
601 	  nthr->task = &team->implicit_task[i];
602 	  nthr->place = place;
603 	  gomp_init_task (nthr->task, task, icv);
604 	  team->implicit_task[i].icv.nthreads_var = nthreads_var;
605 	  team->implicit_task[i].icv.bind_var = bind_var;
606 	  nthr->fn = fn;
607 	  nthr->data = data;
608 	  team->ordered_release[i] = &nthr->release;
609 	}
610 
611       if (__builtin_expect (affinity_thr != NULL, 0))
612 	{
613 	  /* If AFFINITY_THR is non-NULL just because we had to
614 	     permute some threads in the pool, but we've managed
615 	     to find exactly as many old threads as we'd find
616 	     without affinity, we don't need to handle this
617 	     specially anymore.  */
618 	  if (nthreads <= old_threads_used
619 	      ? (affinity_count == old_threads_used - nthreads)
620 	      : (i == old_threads_used))
621 	    {
622 	      if (team->prev_ts.place_partition_len > 64)
623 		free (affinity_thr);
624 	      affinity_thr = NULL;
625 	      affinity_count = 0;
626 	    }
627 	  else
628 	    {
629 	      i = 1;
630 	      /* We are going to compute the places/subpartitions
631 		 again from the beginning.  So, we need to reinitialize
632 		 vars modified by the switch (bind) above inside
633 		 of the loop, to the state they had after the initial
634 		 switch (bind).  */
635 	      switch (bind)
636 		{
637 		case omp_proc_bind_true:
638 		case omp_proc_bind_close:
639 		  if (nthreads > thr->ts.place_partition_len)
640 		    /* T > P.  S has been changed, so needs
641 		       to be recomputed.  */
642 		    s = nthreads / thr->ts.place_partition_len;
643 		  k = 1;
644 		  p = thr->place - 1;
645 		  break;
646 		case omp_proc_bind_master:
647 		  /* No vars have been changed.  */
648 		  break;
649 		case omp_proc_bind_spread:
650 		  p = thr->ts.place_partition_off;
651 		  if (k != 0)
652 		    {
653 		      /* T > P.  */
654 		      s = nthreads / team->prev_ts.place_partition_len;
655 		      k = 1;
656 		    }
657 		  break;
658 		}
659 
660 	      /* Increase the barrier threshold to make sure all new
661 		 threads and all the threads we're going to let die
662 		 arrive before the team is released.  */
663 	      if (affinity_count)
664 		gomp_barrier_reinit (&pool->threads_dock,
665 				     nthreads + affinity_count);
666 	    }
667 	}
668 
669       if (i == nthreads)
670 	goto do_release;
671 
672     }
673 
674   if (__builtin_expect (nthreads + affinity_count > old_threads_used, 0))
675     {
676       long diff = (long) (nthreads + affinity_count) - (long) old_threads_used;
677 
678       if (old_threads_used == 0)
679 	--diff;
680 
681 #ifdef HAVE_SYNC_BUILTINS
682       __sync_fetch_and_add (&gomp_managed_threads, diff);
683 #else
684       gomp_mutex_lock (&gomp_managed_threads_lock);
685       gomp_managed_threads += diff;
686       gomp_mutex_unlock (&gomp_managed_threads_lock);
687 #endif
688     }
689 
690   attr = &gomp_thread_attr;
691   if (__builtin_expect (gomp_places_list != NULL, 0))
692     {
693       size_t stacksize;
694       pthread_attr_init (&thread_attr);
695       pthread_attr_setdetachstate (&thread_attr, PTHREAD_CREATE_DETACHED);
696       if (! pthread_attr_getstacksize (&gomp_thread_attr, &stacksize))
697 	pthread_attr_setstacksize (&thread_attr, stacksize);
698       attr = &thread_attr;
699     }
700 
701   start_data = gomp_alloca (sizeof (struct gomp_thread_start_data)
702 			    * (nthreads-i));
703 
704   /* Launch new threads.  */
705   for (; i < nthreads; ++i)
706     {
707       pthread_t pt;
708       int err;
709 
710       start_data->ts.place_partition_off = thr->ts.place_partition_off;
711       start_data->ts.place_partition_len = thr->ts.place_partition_len;
712       start_data->place = 0;
713       if (__builtin_expect (gomp_places_list != NULL, 0))
714 	{
715 	  switch (bind)
716 	    {
717 	    case omp_proc_bind_true:
718 	    case omp_proc_bind_close:
719 	      if (k == s)
720 		{
721 		  ++p;
722 		  if (p == (team->prev_ts.place_partition_off
723 			    + team->prev_ts.place_partition_len))
724 		    p = team->prev_ts.place_partition_off;
725 		  k = 1;
726 		  if (i == nthreads - rest)
727 		    s = 1;
728 		}
729 	      else
730 		++k;
731 	      break;
732 	    case omp_proc_bind_master:
733 	      break;
734 	    case omp_proc_bind_spread:
735 	      if (k == 0)
736 		{
737 		  /* T <= P.  */
738 		  if (p < rest)
739 		    p += s + 1;
740 		  else
741 		    p += s;
742 		  if (p == (team->prev_ts.place_partition_off
743 			    + team->prev_ts.place_partition_len))
744 		    p = team->prev_ts.place_partition_off;
745 		  start_data->ts.place_partition_off = p;
746 		  if (p < rest)
747 		    start_data->ts.place_partition_len = s + 1;
748 		  else
749 		    start_data->ts.place_partition_len = s;
750 		}
751 	      else
752 		{
753 		  /* T > P.  */
754 		  if (k == s)
755 		    {
756 		      ++p;
757 		      if (p == (team->prev_ts.place_partition_off
758 				+ team->prev_ts.place_partition_len))
759 			p = team->prev_ts.place_partition_off;
760 		      k = 1;
761 		      if (i == nthreads - rest)
762 			s = 1;
763 		    }
764 		  else
765 		    ++k;
766 		  start_data->ts.place_partition_off = p;
767 		  start_data->ts.place_partition_len = 1;
768 		}
769 	      break;
770 	    }
771 	  start_data->place = p + 1;
772 	  if (affinity_thr != NULL && pool->threads[i] != NULL)
773 	    continue;
774 	  gomp_init_thread_affinity (attr, p);
775 	}
776 
777       start_data->fn = fn;
778       start_data->fn_data = data;
779       start_data->ts.team = team;
780       start_data->ts.work_share = &team->work_shares[0];
781       start_data->ts.last_work_share = NULL;
782       start_data->ts.team_id = i;
783       start_data->ts.level = team->prev_ts.level + 1;
784       start_data->ts.active_level = thr->ts.active_level;
785 #ifdef HAVE_SYNC_BUILTINS
786       start_data->ts.single_count = 0;
787 #endif
788       start_data->ts.static_trip = 0;
789       start_data->task = &team->implicit_task[i];
790       gomp_init_task (start_data->task, task, icv);
791       team->implicit_task[i].icv.nthreads_var = nthreads_var;
792       team->implicit_task[i].icv.bind_var = bind_var;
793       start_data->thread_pool = pool;
794       start_data->nested = nested;
795 
796       err = pthread_create (&pt, attr, gomp_thread_start, start_data++);
797       if (err != 0)
798 	gomp_fatal ("Thread creation failed: %s", strerror (err));
799     }
800 
801   if (__builtin_expect (gomp_places_list != NULL, 0))
802     pthread_attr_destroy (&thread_attr);
803 
804  do_release:
805   gomp_barrier_wait (nested ? &team->barrier : &pool->threads_dock);
806 
807   /* Decrease the barrier threshold to match the number of threads
808      that should arrive back at the end of this team.  The extra
809      threads should be exiting.  Note that we arrange for this test
810      to never be true for nested teams.  If AFFINITY_COUNT is non-zero,
811      the barrier as well as gomp_managed_threads was temporarily
812      set to NTHREADS + AFFINITY_COUNT.  For NTHREADS < OLD_THREADS_USED,
813      AFFINITY_COUNT, if non-zero, will always be at least
814      OLD_THREADS_USED - NTHREADS.  */
815   if (__builtin_expect (nthreads < old_threads_used, 0)
816       || __builtin_expect (affinity_count, 0))
817     {
818       long diff = (long) nthreads - (long) old_threads_used;
819 
820       if (affinity_count)
821 	diff = -affinity_count;
822 
823       gomp_barrier_reinit (&pool->threads_dock, nthreads);
824 
825 #ifdef HAVE_SYNC_BUILTINS
826       __sync_fetch_and_add (&gomp_managed_threads, diff);
827 #else
828       gomp_mutex_lock (&gomp_managed_threads_lock);
829       gomp_managed_threads += diff;
830       gomp_mutex_unlock (&gomp_managed_threads_lock);
831 #endif
832     }
833   if (__builtin_expect (affinity_thr != NULL, 0)
834       && team->prev_ts.place_partition_len > 64)
835     free (affinity_thr);
836 }
837 
838 
839 /* Terminate the current team.  This is only to be called by the master
840    thread.  We assume that we must wait for the other threads.  */
841 
842 void
843 gomp_team_end (void)
844 {
845   struct gomp_thread *thr = gomp_thread ();
846   struct gomp_team *team = thr->ts.team;
847 
848   /* This barrier handles all pending explicit tasks.
849      Because #pragma omp cancel parallel might leave the awaited count in
850      team->barrier in an inconsistent state, we need to use a different
851      counter here.  */
852   gomp_team_barrier_wait_final (&team->barrier);
853   if (__builtin_expect (team->team_cancelled, 0))
854     {
855       struct gomp_work_share *ws = team->work_shares_to_free;
856       do
857 	{
858 	  struct gomp_work_share *next_ws = gomp_ptrlock_get (&ws->next_ws);
859 	  if (next_ws == NULL)
860 	    gomp_ptrlock_set (&ws->next_ws, ws);
861 	  gomp_fini_work_share (ws);
862 	  ws = next_ws;
863 	}
864       while (ws != NULL);
865     }
866   else
867     gomp_fini_work_share (thr->ts.work_share);
868 
869   gomp_end_task ();
870   thr->ts = team->prev_ts;
871 
872   if (__builtin_expect (thr->ts.team != NULL, 0))
873     {
874 #ifdef HAVE_SYNC_BUILTINS
875       __sync_fetch_and_add (&gomp_managed_threads, 1L - team->nthreads);
876 #else
877       gomp_mutex_lock (&gomp_managed_threads_lock);
878       gomp_managed_threads -= team->nthreads - 1L;
879       gomp_mutex_unlock (&gomp_managed_threads_lock);
880 #endif
881       /* This barrier has gomp_barrier_wait_last counterparts
882 	 and ensures the team can be safely destroyed.  */
883       gomp_barrier_wait (&team->barrier);
884     }
885 
886   if (__builtin_expect (team->work_shares[0].next_alloc != NULL, 0))
887     {
888       struct gomp_work_share *ws = team->work_shares[0].next_alloc;
889       do
890 	{
891 	  struct gomp_work_share *next_ws = ws->next_alloc;
892 	  free (ws);
893 	  ws = next_ws;
894 	}
895       while (ws != NULL);
896     }
897   gomp_sem_destroy (&team->master_release);
898 #ifndef HAVE_SYNC_BUILTINS
899   gomp_mutex_destroy (&team->work_share_list_free_lock);
900 #endif
901 
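  /* A nested team, or a team of a single thread, can be freed right away:
     nothing else can still be referencing it.  A non-nested team's
     implicit tasks may still be touched by workers that have not re-docked
     yet, so it is parked in pool->last_team instead and freed the next
     time around (or from gomp_free_thread).  */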
902   if (__builtin_expect (thr->ts.team != NULL, 0)
903       || __builtin_expect (team->nthreads == 1, 0))
904     free_team (team);
905   else
906     {
907       struct gomp_thread_pool *pool = thr->thread_pool;
908       if (pool->last_team)
909 	free_team (pool->last_team);
910       pool->last_team = team;
911     }
912 }
913 
914 
915 /* Constructor and destructor for this file.  */
916 
917 static void __attribute__((constructor))
918 initialize_team (void)
919 {
920 #if !defined HAVE_TLS && !defined USE_EMUTLS
921   static struct gomp_thread initial_thread_tls_data;
922 
923   pthread_key_create (&gomp_tls_key, NULL);
924   pthread_setspecific (gomp_tls_key, &initial_thread_tls_data);
925 #endif
926 
927   if (pthread_key_create (&gomp_thread_destructor, gomp_free_thread) != 0)
928     gomp_fatal ("could not create thread pool destructor.");
929 }
930 
931 static void __attribute__((destructor))
932 team_destructor (void)
933 {
934   /* Without this, dlclose on libgomp could lead to subsequent
935      crashes.  */
936   pthread_key_delete (gomp_thread_destructor);
937 }
938 
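/* Create a task with default ICVs for a thread libgomp has not seen before
   (presumably one calling an OpenMP API routine outside of any parallel
   region; see gomp_icv in libgomp.h).  */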
939 struct gomp_task_icv *
940 gomp_new_icv (void)
941 {
942   struct gomp_thread *thr = gomp_thread ();
943   struct gomp_task *task = gomp_malloc (sizeof (struct gomp_task));
944   gomp_init_task (task, NULL, &gomp_global_icv);
945   thr->task = task;
946   pthread_setspecific (gomp_thread_destructor, thr);
947   return &task->icv;
948 }
949