xref: /netbsd-src/external/gpl3/gcc/dist/libgomp/team.c (revision bdc22b2e01993381dcefeff2bc9b56ca75a4235c)
1 /* Copyright (C) 2005-2016 Free Software Foundation, Inc.
2    Contributed by Richard Henderson <rth@redhat.com>.
3 
4    This file is part of the GNU Offloading and Multi Processing Library
5    (libgomp).
6 
7    Libgomp is free software; you can redistribute it and/or modify it
8    under the terms of the GNU General Public License as published by
9    the Free Software Foundation; either version 3, or (at your option)
10    any later version.
11 
12    Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
13    WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
14    FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
15    more details.
16 
17    Under Section 7 of GPL version 3, you are granted additional
18    permissions described in the GCC Runtime Library Exception, version
19    3.1, as published by the Free Software Foundation.
20 
21    You should have received a copy of the GNU General Public License and
22    a copy of the GCC Runtime Library Exception along with this program;
23    see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
24    <http://www.gnu.org/licenses/>.  */
25 
26 /* This file handles the maintenance of threads in response to team
27    creation and termination.  */
28 
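/* For orientation, a rough sketch (not a definitive contract) of how the
   rest of libgomp drives this file: a caller such as GOMP_parallel in
   parallel.c does approximately

     struct gomp_team *team = gomp_new_team (num_threads);
     gomp_team_start (fn, data, num_threads, flags, team);
     fn (data);           // the master runs its share of the region
     gomp_team_end ();    // wait for the other threads, recycle the team

   while the additional team members execute FN from the idle loop in
   gomp_thread_start below.  */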
29 #include "libgomp.h"
30 #include "pool.h"
31 #include <stdlib.h>
32 #include <string.h>
33 
34 /* This attribute contains PTHREAD_CREATE_DETACHED.  */
35 pthread_attr_t gomp_thread_attr;
36 
37 /* This key is for the thread destructor.  */
38 pthread_key_t gomp_thread_destructor;
39 
40 
41 /* This is the libgomp per-thread data structure.  */
42 #if defined HAVE_TLS || defined USE_EMUTLS
43 __thread struct gomp_thread gomp_tls_data;
44 #else
45 pthread_key_t gomp_tls_key;
46 #endif
47 
48 
49 /* This structure is used to communicate across pthread_create.  */
50 
51 struct gomp_thread_start_data
52 {
53   void (*fn) (void *);
54   void *fn_data;
55   struct gomp_team_state ts;
56   struct gomp_task *task;
57   struct gomp_thread_pool *thread_pool;
58   unsigned int place;
59   bool nested;
60 };
61 
62 
63 /* This function is a pthread_create entry point.  It contains the idle
64    loop in which a thread waits to be called up to become part of a team.  */
65 
66 static void *
67 gomp_thread_start (void *xdata)
68 {
69   struct gomp_thread_start_data *data = xdata;
70   struct gomp_thread *thr;
71   struct gomp_thread_pool *pool;
72   void (*local_fn) (void *);
73   void *local_data;
74 
75 #if defined HAVE_TLS || defined USE_EMUTLS
76   thr = &gomp_tls_data;
77 #else
78   struct gomp_thread local_thr;
79   thr = &local_thr;
80   pthread_setspecific (gomp_tls_key, thr);
81 #endif
82   gomp_sem_init (&thr->release, 0);
83 
84   /* Extract what we need from data.  */
85   local_fn = data->fn;
86   local_data = data->fn_data;
87   thr->thread_pool = data->thread_pool;
88   thr->ts = data->ts;
89   thr->task = data->task;
90   thr->place = data->place;
91 
92   thr->ts.team->ordered_release[thr->ts.team_id] = &thr->release;
93 
94   /* Cache the thread pool in a local variable.  */
95   pool = thr->thread_pool;
96 
97   if (data->nested)
98     {
99       struct gomp_team *team = thr->ts.team;
100       struct gomp_task *task = thr->task;
101 
102       gomp_barrier_wait (&team->barrier);
103 
104       local_fn (local_data);
105       gomp_team_barrier_wait_final (&team->barrier);
106       gomp_finish_task (task);
107       gomp_barrier_wait_last (&team->barrier);
108     }
109   else
110     {
111       pool->threads[thr->ts.team_id] = thr;
112 
113       gomp_barrier_wait (&pool->threads_dock);
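      /* Idle loop: each iteration runs the work for one team.  After
	 re-docking on THREADS_DOCK, the master either hands this thread a
	 new FN/DATA pair or leaves THR->FN NULL, which means exit.  */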
114       do
115 	{
116 	  struct gomp_team *team = thr->ts.team;
117 	  struct gomp_task *task = thr->task;
118 
119 	  local_fn (local_data);
120 	  gomp_team_barrier_wait_final (&team->barrier);
121 	  gomp_finish_task (task);
122 
123 	  gomp_barrier_wait (&pool->threads_dock);
124 
125 	  local_fn = thr->fn;
126 	  local_data = thr->data;
127 	  thr->fn = NULL;
128 	}
129       while (local_fn);
130     }
131 
132   gomp_sem_destroy (&thr->release);
133   thr->thread_pool = NULL;
134   thr->task = NULL;
135   return NULL;
136 }
137 
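/* If the calling thread is not currently part of a team, return the team
   cached in its thread pool, provided that team was created for exactly
   NTHREADS threads; otherwise return NULL.  */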
138 static inline struct gomp_team *
139 get_last_team (unsigned nthreads)
140 {
141   struct gomp_thread *thr = gomp_thread ();
142   if (thr->ts.team == NULL)
143     {
144       struct gomp_thread_pool *pool = gomp_get_thread_pool (thr, nthreads);
145       struct gomp_team *last_team = pool->last_team;
146       if (last_team != NULL && last_team->nthreads == nthreads)
147         {
148           pool->last_team = NULL;
149           return last_team;
150         }
151     }
152   return NULL;
153 }
154 
155 /* Create a new team data structure.  */
156 
157 struct gomp_team *
158 gomp_new_team (unsigned nthreads)
159 {
160   struct gomp_team *team;
161   int i;
162 
163   team = get_last_team (nthreads);
164   if (team == NULL)
165     {
166       size_t extra = sizeof (team->ordered_release[0])
167 		     + sizeof (team->implicit_task[0]);
168       team = gomp_malloc (sizeof (*team) + nthreads * extra);
169 
170 #ifndef HAVE_SYNC_BUILTINS
171       gomp_mutex_init (&team->work_share_list_free_lock);
172 #endif
173       gomp_barrier_init (&team->barrier, nthreads);
174       gomp_mutex_init (&team->task_lock);
175 
176       team->nthreads = nthreads;
177     }
178 
179   team->work_share_chunk = 8;
180 #ifdef HAVE_SYNC_BUILTINS
181   team->single_count = 0;
182 #endif
183   team->work_shares_to_free = &team->work_shares[0];
184   gomp_init_work_share (&team->work_shares[0], false, nthreads);
185   team->work_shares[0].next_alloc = NULL;
186   team->work_share_list_free = NULL;
187   team->work_share_list_alloc = &team->work_shares[1];
188   for (i = 1; i < 7; i++)
189     team->work_shares[i].next_free = &team->work_shares[i + 1];
190   team->work_shares[i].next_free = NULL;
191 
192   gomp_sem_init (&team->master_release, 0);
193   team->ordered_release = (void *) &team->implicit_task[nthreads];
194   team->ordered_release[0] = &team->master_release;
195 
196   priority_queue_init (&team->task_queue);
197   team->task_count = 0;
198   team->task_queued_count = 0;
199   team->task_running_count = 0;
200   team->work_share_cancelled = 0;
201   team->team_cancelled = 0;
202 
203   return team;
204 }
205 
206 
207 /* Free a team data structure.  */
208 
209 static void
210 free_team (struct gomp_team *team)
211 {
212 #ifndef HAVE_SYNC_BUILTINS
213   gomp_mutex_destroy (&team->work_share_list_free_lock);
214 #endif
215   gomp_barrier_destroy (&team->barrier);
216   gomp_mutex_destroy (&team->task_lock);
217   priority_queue_free (&team->task_queue);
218   free (team);
219 }
220 
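/* Helper dispatched through THR->FN to each docked pool thread when the
   pool is being torn down (see gomp_free_thread below): synchronize on
   THREADS_DOCK, release the per-thread state and exit the thread.  */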
221 static void
222 gomp_free_pool_helper (void *thread_pool)
223 {
224   struct gomp_thread *thr = gomp_thread ();
225   struct gomp_thread_pool *pool
226     = (struct gomp_thread_pool *) thread_pool;
227   gomp_barrier_wait_last (&pool->threads_dock);
228   gomp_sem_destroy (&thr->release);
229   thr->thread_pool = NULL;
230   thr->task = NULL;
231   pthread_exit (NULL);
232 }
233 
234 /* Free a thread pool and release its threads.  */
235 
236 void
237 gomp_free_thread (void *arg __attribute__((unused)))
238 {
239   struct gomp_thread *thr = gomp_thread ();
240   struct gomp_thread_pool *pool = thr->thread_pool;
241   if (pool)
242     {
243       if (pool->threads_used > 0)
244 	{
245 	  int i;
246 	  for (i = 1; i < pool->threads_used; i++)
247 	    {
248 	      struct gomp_thread *nthr = pool->threads[i];
249 	      nthr->fn = gomp_free_pool_helper;
250 	      nthr->data = pool;
251 	    }
252 	  /* This barrier undocks threads docked on pool->threads_dock.  */
253 	  gomp_barrier_wait (&pool->threads_dock);
254 	  /* And this waits till all threads have called gomp_barrier_wait_last
255 	     in gomp_free_pool_helper.  */
256 	  gomp_barrier_wait (&pool->threads_dock);
257 	  /* Now it is safe to destroy the barrier and free the pool.  */
258 	  gomp_barrier_destroy (&pool->threads_dock);
259 
260 #ifdef HAVE_SYNC_BUILTINS
261 	  __sync_fetch_and_add (&gomp_managed_threads,
262 				1L - pool->threads_used);
263 #else
264 	  gomp_mutex_lock (&gomp_managed_threads_lock);
265 	  gomp_managed_threads -= pool->threads_used - 1L;
266 	  gomp_mutex_unlock (&gomp_managed_threads_lock);
267 #endif
268 	}
269       free (pool->threads);
270       if (pool->last_team)
271 	free_team (pool->last_team);
272       free (pool);
273       thr->thread_pool = NULL;
274     }
275   if (thr->ts.level == 0 && __builtin_expect (thr->ts.team != NULL, 0))
276     gomp_team_end ();
277   if (thr->task != NULL)
278     {
279       struct gomp_task *task = thr->task;
280       gomp_end_task ();
281       free (task);
282     }
283 }
284 
285 /* Launch a team.  */
286 
287 void
288 gomp_team_start (void (*fn) (void *), void *data, unsigned nthreads,
289 		 unsigned flags, struct gomp_team *team)
290 {
291   struct gomp_thread_start_data *start_data;
292   struct gomp_thread *thr, *nthr;
293   struct gomp_task *task;
294   struct gomp_task_icv *icv;
295   bool nested;
296   struct gomp_thread_pool *pool;
297   unsigned i, n, old_threads_used = 0;
298   pthread_attr_t thread_attr, *attr;
299   unsigned long nthreads_var;
300   char bind, bind_var;
301   unsigned int s = 0, rest = 0, p = 0, k = 0;
302   unsigned int affinity_count = 0;
303   struct gomp_thread **affinity_thr = NULL;
304 
305   thr = gomp_thread ();
306   nested = thr->ts.level;
307   pool = thr->thread_pool;
308   task = thr->task;
309   icv = task ? &task->icv : &gomp_global_icv;
310   if (__builtin_expect (gomp_places_list != NULL, 0) && thr->place == 0)
311     gomp_init_affinity ();
312 
313   /* Always save the previous state, even if this isn't a nested team.
314      In particular, we should save any work share state from an outer
315      orphaned work share construct.  */
316   team->prev_ts = thr->ts;
317 
318   thr->ts.team = team;
319   thr->ts.team_id = 0;
320   ++thr->ts.level;
321   if (nthreads > 1)
322     ++thr->ts.active_level;
323   thr->ts.work_share = &team->work_shares[0];
324   thr->ts.last_work_share = NULL;
325 #ifdef HAVE_SYNC_BUILTINS
326   thr->ts.single_count = 0;
327 #endif
328   thr->ts.static_trip = 0;
329   thr->task = &team->implicit_task[0];
330   nthreads_var = icv->nthreads_var;
331   if (__builtin_expect (gomp_nthreads_var_list != NULL, 0)
332       && thr->ts.level < gomp_nthreads_var_list_len)
333     nthreads_var = gomp_nthreads_var_list[thr->ts.level];
334   bind_var = icv->bind_var;
335   if (bind_var != omp_proc_bind_false && (flags & 7) != omp_proc_bind_false)
336     bind_var = flags & 7;
337   bind = bind_var;
338   if (__builtin_expect (gomp_bind_var_list != NULL, 0)
339       && thr->ts.level < gomp_bind_var_list_len)
340     bind_var = gomp_bind_var_list[thr->ts.level];
341   gomp_init_task (thr->task, task, icv);
342   team->implicit_task[0].icv.nthreads_var = nthreads_var;
343   team->implicit_task[0].icv.bind_var = bind_var;
344 
345   if (nthreads == 1)
346     return;
347 
348   i = 1;
349 
350   if (__builtin_expect (gomp_places_list != NULL, 0))
351     {
352       /* Depending on the chosen proc_bind model, set the subpartition
353 	 for the master thread and initialize helper variables
354 	 P and optionally S, K and/or REST used by later place
355 	 computation for each additional thread.  */
356       p = thr->place - 1;
357       switch (bind)
358 	{
359 	case omp_proc_bind_true:
360 	case omp_proc_bind_close:
361 	  if (nthreads > thr->ts.place_partition_len)
362 	    {
363 	      /* T > P.  S threads will be placed in each place,
364 		 and the final REST threads placed one by one
365 		 into the already occupied places.  */
366 	      s = nthreads / thr->ts.place_partition_len;
367 	      rest = nthreads % thr->ts.place_partition_len;
368 	    }
369 	  else
370 	    s = 1;
371 	  k = 1;
372 	  break;
373 	case omp_proc_bind_master:
374 	  /* Each thread will be bound to master's place.  */
375 	  break;
376 	case omp_proc_bind_spread:
377 	  if (nthreads <= thr->ts.place_partition_len)
378 	    {
379 	      /* T <= P.  Each subpartition will have between s
380 		 and s+1 places (subpartitions starting at or
381 		 after rest will have s places, earlier s+1 places),
382 		 each thread will be bound to the first place in
383 		 its subpartition (except for the master thread
384 		 that can be bound to another place in its
385 		 subpartition).  */
386 	      s = thr->ts.place_partition_len / nthreads;
387 	      rest = thr->ts.place_partition_len % nthreads;
388 	      rest = (s + 1) * rest + thr->ts.place_partition_off;
389 	      if (p < rest)
390 		{
391 		  p -= (p - thr->ts.place_partition_off) % (s + 1);
392 		  thr->ts.place_partition_len = s + 1;
393 		}
394 	      else
395 		{
396 		  p -= (p - rest) % s;
397 		  thr->ts.place_partition_len = s;
398 		}
399 	      thr->ts.place_partition_off = p;
400 	    }
401 	  else
402 	    {
403 	      /* T > P.  Each subpartition will have just a single
404 		 place and we'll place between s and s+1
405 		 threads into each subpartition.  */
406 	      s = nthreads / thr->ts.place_partition_len;
407 	      rest = nthreads % thr->ts.place_partition_len;
408 	      thr->ts.place_partition_off = p;
409 	      thr->ts.place_partition_len = 1;
410 	      k = 1;
411 	    }
412 	  break;
413 	}
414     }
415   else
416     bind = omp_proc_bind_false;
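  /* Illustrative example (numbers assumed purely for the sketch): with
     nthreads == 4, an 8-place partition and proc_bind(spread), the code
     above yields S == 2 and REST == 0, so the additional threads are placed
     two places apart and each gets a 2-place subpartition starting at its
     own place.  */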
417 
418   /* We only allow the reuse of idle threads for non-nested PARALLEL
419      regions.  This appears to be implied by the semantics of
420      threadprivate variables, but perhaps that's reading too much into
421      things.  Certainly it does prevent any locking problems, since
422      only the initial program thread will modify gomp_threads.  */
423   if (!nested)
424     {
425       old_threads_used = pool->threads_used;
426 
427       if (nthreads <= old_threads_used)
428 	n = nthreads;
429       else if (old_threads_used == 0)
430 	{
431 	  n = 0;
432 	  gomp_barrier_init (&pool->threads_dock, nthreads);
433 	}
434       else
435 	{
436 	  n = old_threads_used;
437 
438 	  /* Increase the barrier threshold to make sure all new
439 	     threads arrive before the team is released.  */
440 	  gomp_barrier_reinit (&pool->threads_dock, nthreads);
441 	}
442 
443       /* Not true yet, but soon will be.  We're going to release all
444 	 threads from the dock, and those that aren't part of the
445 	 team will exit.  */
446       pool->threads_used = nthreads;
447 
448       /* If necessary, expand the size of the gomp_threads array.  It is
449 	 expected that changes in the number of threads are rare, so we
450 	 make no effort to expand gomp_threads_size geometrically.  */
451       if (nthreads >= pool->threads_size)
452 	{
453 	  pool->threads_size = nthreads + 1;
454 	  pool->threads
455 	    = gomp_realloc (pool->threads,
456 			    pool->threads_size
457 			    * sizeof (struct gomp_thread *));
458 	}
459 
460       /* Release existing idle threads.  */
461       for (; i < n; ++i)
462 	{
463 	  unsigned int place_partition_off = thr->ts.place_partition_off;
464 	  unsigned int place_partition_len = thr->ts.place_partition_len;
465 	  unsigned int place = 0;
466 	  if (__builtin_expect (gomp_places_list != NULL, 0))
467 	    {
468 	      switch (bind)
469 		{
470 		case omp_proc_bind_true:
471 		case omp_proc_bind_close:
472 		  if (k == s)
473 		    {
474 		      ++p;
475 		      if (p == (team->prev_ts.place_partition_off
476 				+ team->prev_ts.place_partition_len))
477 			p = team->prev_ts.place_partition_off;
478 		      k = 1;
479 		      if (i == nthreads - rest)
480 			s = 1;
481 		    }
482 		  else
483 		    ++k;
484 		  break;
485 		case omp_proc_bind_master:
486 		  break;
487 		case omp_proc_bind_spread:
488 		  if (k == 0)
489 		    {
490 		      /* T <= P.  */
491 		      if (p < rest)
492 			p += s + 1;
493 		      else
494 			p += s;
495 		      if (p == (team->prev_ts.place_partition_off
496 				+ team->prev_ts.place_partition_len))
497 			p = team->prev_ts.place_partition_off;
498 		      place_partition_off = p;
499 		      if (p < rest)
500 			place_partition_len = s + 1;
501 		      else
502 			place_partition_len = s;
503 		    }
504 		  else
505 		    {
506 		      /* T > P.  */
507 		      if (k == s)
508 			{
509 			  ++p;
510 			  if (p == (team->prev_ts.place_partition_off
511 				    + team->prev_ts.place_partition_len))
512 			    p = team->prev_ts.place_partition_off;
513 			  k = 1;
514 			  if (i == nthreads - rest)
515 			    s = 1;
516 			}
517 		      else
518 			++k;
519 		      place_partition_off = p;
520 		      place_partition_len = 1;
521 		    }
522 		  break;
523 		}
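	      /* Reuse the docked thread in slot I only if its place already
		 satisfies the binding computed above.  Otherwise, bucket the
		 remaining old threads by place into AFFINITY_THR (done once)
		 and pick one docked at a suitable place; if none is
		 available, leave the slot NULL so that a new thread gets
		 created later with the desired affinity.  */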
524 	      if (affinity_thr != NULL
525 		  || (bind != omp_proc_bind_true
526 		      && pool->threads[i]->place != p + 1)
527 		  || pool->threads[i]->place <= place_partition_off
528 		  || pool->threads[i]->place > (place_partition_off
529 						+ place_partition_len))
530 		{
531 		  unsigned int l;
532 		  if (affinity_thr == NULL)
533 		    {
534 		      unsigned int j;
535 
536 		      if (team->prev_ts.place_partition_len > 64)
537 			affinity_thr
538 			  = gomp_malloc (team->prev_ts.place_partition_len
539 					 * sizeof (struct gomp_thread *));
540 		      else
541 			affinity_thr
542 			  = gomp_alloca (team->prev_ts.place_partition_len
543 					 * sizeof (struct gomp_thread *));
544 		      memset (affinity_thr, '\0',
545 			      team->prev_ts.place_partition_len
546 			      * sizeof (struct gomp_thread *));
547 		      for (j = i; j < old_threads_used; j++)
548 			{
549 			  if (pool->threads[j]->place
550 			      > team->prev_ts.place_partition_off
551 			      && (pool->threads[j]->place
552 				  <= (team->prev_ts.place_partition_off
553 				      + team->prev_ts.place_partition_len)))
554 			    {
555 			      l = pool->threads[j]->place - 1
556 				  - team->prev_ts.place_partition_off;
557 			      pool->threads[j]->data = affinity_thr[l];
558 			      affinity_thr[l] = pool->threads[j];
559 			    }
560 			  pool->threads[j] = NULL;
561 			}
562 		      if (nthreads > old_threads_used)
563 			memset (&pool->threads[old_threads_used],
564 				'\0', ((nthreads - old_threads_used)
565 				       * sizeof (struct gomp_thread *)));
566 		      n = nthreads;
567 		      affinity_count = old_threads_used - i;
568 		    }
569 		  if (affinity_count == 0)
570 		    break;
571 		  l = p;
572 		  if (affinity_thr[l - team->prev_ts.place_partition_off]
573 		      == NULL)
574 		    {
575 		      if (bind != omp_proc_bind_true)
576 			continue;
577 		      for (l = place_partition_off;
578 			   l < place_partition_off + place_partition_len;
579 			   l++)
580 			if (affinity_thr[l - team->prev_ts.place_partition_off]
581 			    != NULL)
582 			  break;
583 		      if (l == place_partition_off + place_partition_len)
584 			continue;
585 		    }
586 		  nthr = affinity_thr[l - team->prev_ts.place_partition_off];
587 		  affinity_thr[l - team->prev_ts.place_partition_off]
588 		    = (struct gomp_thread *) nthr->data;
589 		  affinity_count--;
590 		  pool->threads[i] = nthr;
591 		}
592 	      else
593 		nthr = pool->threads[i];
594 	      place = p + 1;
595 	    }
596 	  else
597 	    nthr = pool->threads[i];
598 	  nthr->ts.team = team;
599 	  nthr->ts.work_share = &team->work_shares[0];
600 	  nthr->ts.last_work_share = NULL;
601 	  nthr->ts.team_id = i;
602 	  nthr->ts.level = team->prev_ts.level + 1;
603 	  nthr->ts.active_level = thr->ts.active_level;
604 	  nthr->ts.place_partition_off = place_partition_off;
605 	  nthr->ts.place_partition_len = place_partition_len;
606 #ifdef HAVE_SYNC_BUILTINS
607 	  nthr->ts.single_count = 0;
608 #endif
609 	  nthr->ts.static_trip = 0;
610 	  nthr->task = &team->implicit_task[i];
611 	  nthr->place = place;
612 	  gomp_init_task (nthr->task, task, icv);
613 	  team->implicit_task[i].icv.nthreads_var = nthreads_var;
614 	  team->implicit_task[i].icv.bind_var = bind_var;
615 	  nthr->fn = fn;
616 	  nthr->data = data;
617 	  team->ordered_release[i] = &nthr->release;
618 	}
619 
620       if (__builtin_expect (affinity_thr != NULL, 0))
621 	{
622 	  /* If AFFINITY_THR is non-NULL just because we had to
623 	     permute some threads in the pool, but we've managed
624 	     to find exactly as many old threads as we'd find
625 	     without affinity, we don't need to handle this
626 	     specially anymore.  */
627 	  if (nthreads <= old_threads_used
628 	      ? (affinity_count == old_threads_used - nthreads)
629 	      : (i == old_threads_used))
630 	    {
631 	      if (team->prev_ts.place_partition_len > 64)
632 		free (affinity_thr);
633 	      affinity_thr = NULL;
634 	      affinity_count = 0;
635 	    }
636 	  else
637 	    {
638 	      i = 1;
639 	      /* We are going to compute the places/subpartitions
640 		 again from the beginning.  So, we need to reinitialize
641 		 vars modified by the switch (bind) above inside
642 		 the loop, to the state they had after the initial
643 		 switch (bind).  */
644 	      switch (bind)
645 		{
646 		case omp_proc_bind_true:
647 		case omp_proc_bind_close:
648 		  if (nthreads > thr->ts.place_partition_len)
649 		    /* T > P.  S has been changed, so needs
650 		       to be recomputed.  */
651 		    s = nthreads / thr->ts.place_partition_len;
652 		  k = 1;
653 		  p = thr->place - 1;
654 		  break;
655 		case omp_proc_bind_master:
656 		  /* No vars have been changed.  */
657 		  break;
658 		case omp_proc_bind_spread:
659 		  p = thr->ts.place_partition_off;
660 		  if (k != 0)
661 		    {
662 		      /* T > P.  */
663 		      s = nthreads / team->prev_ts.place_partition_len;
664 		      k = 1;
665 		    }
666 		  break;
667 		}
668 
669 	      /* Increase the barrier threshold to make sure all new
670 		 threads and all the threads we're going to let die
671 		 arrive before the team is released.  */
672 	      if (affinity_count)
673 		gomp_barrier_reinit (&pool->threads_dock,
674 				     nthreads + affinity_count);
675 	    }
676 	}
677 
678       if (i == nthreads)
679 	goto do_release;
680 
681     }
682 
683   if (__builtin_expect (nthreads + affinity_count > old_threads_used, 0))
684     {
685       long diff = (long) (nthreads + affinity_count) - (long) old_threads_used;
686 
687       if (old_threads_used == 0)
688 	--diff;
689 
690 #ifdef HAVE_SYNC_BUILTINS
691       __sync_fetch_and_add (&gomp_managed_threads, diff);
692 #else
693       gomp_mutex_lock (&gomp_managed_threads_lock);
694       gomp_managed_threads += diff;
695       gomp_mutex_unlock (&gomp_managed_threads_lock);
696 #endif
697     }
698 
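  /* When binding to places, build a private, detached attribute object
     (inheriting the configured stack size) so that the per-thread
     gomp_init_thread_affinity calls below can adjust it without modifying
     the global gomp_thread_attr.  */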
699   attr = &gomp_thread_attr;
700   if (__builtin_expect (gomp_places_list != NULL, 0))
701     {
702       size_t stacksize;
703       pthread_attr_init (&thread_attr);
704       pthread_attr_setdetachstate (&thread_attr, PTHREAD_CREATE_DETACHED);
705       if (! pthread_attr_getstacksize (&gomp_thread_attr, &stacksize))
706 	pthread_attr_setstacksize (&thread_attr, stacksize);
707       attr = &thread_attr;
708     }
709 
710   start_data = gomp_alloca (sizeof (struct gomp_thread_start_data)
711 			    * (nthreads-i));
712 
713   /* Launch new threads.  */
714   for (; i < nthreads; ++i)
715     {
716       pthread_t pt;
717       int err;
718 
719       start_data->ts.place_partition_off = thr->ts.place_partition_off;
720       start_data->ts.place_partition_len = thr->ts.place_partition_len;
721       start_data->place = 0;
722       if (__builtin_expect (gomp_places_list != NULL, 0))
723 	{
724 	  switch (bind)
725 	    {
726 	    case omp_proc_bind_true:
727 	    case omp_proc_bind_close:
728 	      if (k == s)
729 		{
730 		  ++p;
731 		  if (p == (team->prev_ts.place_partition_off
732 			    + team->prev_ts.place_partition_len))
733 		    p = team->prev_ts.place_partition_off;
734 		  k = 1;
735 		  if (i == nthreads - rest)
736 		    s = 1;
737 		}
738 	      else
739 		++k;
740 	      break;
741 	    case omp_proc_bind_master:
742 	      break;
743 	    case omp_proc_bind_spread:
744 	      if (k == 0)
745 		{
746 		  /* T <= P.  */
747 		  if (p < rest)
748 		    p += s + 1;
749 		  else
750 		    p += s;
751 		  if (p == (team->prev_ts.place_partition_off
752 			    + team->prev_ts.place_partition_len))
753 		    p = team->prev_ts.place_partition_off;
754 		  start_data->ts.place_partition_off = p;
755 		  if (p < rest)
756 		    start_data->ts.place_partition_len = s + 1;
757 		  else
758 		    start_data->ts.place_partition_len = s;
759 		}
760 	      else
761 		{
762 		  /* T > P.  */
763 		  if (k == s)
764 		    {
765 		      ++p;
766 		      if (p == (team->prev_ts.place_partition_off
767 				+ team->prev_ts.place_partition_len))
768 			p = team->prev_ts.place_partition_off;
769 		      k = 1;
770 		      if (i == nthreads - rest)
771 			s = 1;
772 		    }
773 		  else
774 		    ++k;
775 		  start_data->ts.place_partition_off = p;
776 		  start_data->ts.place_partition_len = 1;
777 		}
778 	      break;
779 	    }
780 	  start_data->place = p + 1;
781 	  if (affinity_thr != NULL && pool->threads[i] != NULL)
782 	    continue;
783 	  gomp_init_thread_affinity (attr, p);
784 	}
785 
786       start_data->fn = fn;
787       start_data->fn_data = data;
788       start_data->ts.team = team;
789       start_data->ts.work_share = &team->work_shares[0];
790       start_data->ts.last_work_share = NULL;
791       start_data->ts.team_id = i;
792       start_data->ts.level = team->prev_ts.level + 1;
793       start_data->ts.active_level = thr->ts.active_level;
794 #ifdef HAVE_SYNC_BUILTINS
795       start_data->ts.single_count = 0;
796 #endif
797       start_data->ts.static_trip = 0;
798       start_data->task = &team->implicit_task[i];
799       gomp_init_task (start_data->task, task, icv);
800       team->implicit_task[i].icv.nthreads_var = nthreads_var;
801       team->implicit_task[i].icv.bind_var = bind_var;
802       start_data->thread_pool = pool;
803       start_data->nested = nested;
804 
805       attr = gomp_adjust_thread_attr (attr, &thread_attr);
806       err = pthread_create (&pt, attr, gomp_thread_start, start_data++);
807       if (err != 0)
808 	gomp_fatal ("Thread creation failed: %s", strerror (err));
809     }
810 
811   if (__builtin_expect (attr == &thread_attr, 0))
812     pthread_attr_destroy (&thread_attr);
813 
814  do_release:
815   gomp_barrier_wait (nested ? &team->barrier : &pool->threads_dock);
816 
817   /* Decrease the barrier threshold to match the number of threads
818      that should arrive back at the end of this team.  The extra
819      threads should be exiting.  Note that we arrange for this test
820      to never be true for nested teams.  If AFFINITY_COUNT is non-zero,
821      the barrier as well as gomp_managed_threads was temporarily
822      set to NTHREADS + AFFINITY_COUNT.  For NTHREADS < OLD_THREADS_USED,
823      AFFINITY_COUNT, if non-zero, will always be at least
824      OLD_THREADS_USED - NTHREADS.  */
825   if (__builtin_expect (nthreads < old_threads_used, 0)
826       || __builtin_expect (affinity_count, 0))
827     {
828       long diff = (long) nthreads - (long) old_threads_used;
829 
830       if (affinity_count)
831 	diff = -affinity_count;
832 
833       gomp_barrier_reinit (&pool->threads_dock, nthreads);
834 
835 #ifdef HAVE_SYNC_BUILTINS
836       __sync_fetch_and_add (&gomp_managed_threads, diff);
837 #else
838       gomp_mutex_lock (&gomp_managed_threads_lock);
839       gomp_managed_threads += diff;
840       gomp_mutex_unlock (&gomp_managed_threads_lock);
841 #endif
842     }
843   if (__builtin_expect (affinity_thr != NULL, 0)
844       && team->prev_ts.place_partition_len > 64)
845     free (affinity_thr);
846 }
847 
848 
849 /* Terminate the current team.  This is only to be called by the master
850    thread.  We assume that we must wait for the other threads.  */
851 
852 void
853 gomp_team_end (void)
854 {
855   struct gomp_thread *thr = gomp_thread ();
856   struct gomp_team *team = thr->ts.team;
857 
858   /* This barrier handles all pending explicit threads.
859      Because #pragma omp cancel parallel might leave the awaited count in
860      team->barrier in an inconsistent state, we need to use a different
861      counter here.  */
862   gomp_team_barrier_wait_final (&team->barrier);
863   if (__builtin_expect (team->team_cancelled, 0))
864     {
865       struct gomp_work_share *ws = team->work_shares_to_free;
866       do
867 	{
868 	  struct gomp_work_share *next_ws = gomp_ptrlock_get (&ws->next_ws);
869 	  if (next_ws == NULL)
870 	    gomp_ptrlock_set (&ws->next_ws, ws);
871 	  gomp_fini_work_share (ws);
872 	  ws = next_ws;
873 	}
874       while (ws != NULL);
875     }
876   else
877     gomp_fini_work_share (thr->ts.work_share);
878 
879   gomp_end_task ();
880   thr->ts = team->prev_ts;
881 
882   if (__builtin_expect (thr->ts.team != NULL, 0))
883     {
884 #ifdef HAVE_SYNC_BUILTINS
885       __sync_fetch_and_add (&gomp_managed_threads, 1L - team->nthreads);
886 #else
887       gomp_mutex_lock (&gomp_managed_threads_lock);
888       gomp_managed_threads -= team->nthreads - 1L;
889       gomp_mutex_unlock (&gomp_managed_threads_lock);
890 #endif
891       /* This barrier has gomp_barrier_wait_last counterparts
892 	 and ensures the team can be safely destroyed.  */
893       gomp_barrier_wait (&team->barrier);
894     }
895 
896   if (__builtin_expect (team->work_shares[0].next_alloc != NULL, 0))
897     {
898       struct gomp_work_share *ws = team->work_shares[0].next_alloc;
899       do
900 	{
901 	  struct gomp_work_share *next_ws = ws->next_alloc;
902 	  free (ws);
903 	  ws = next_ws;
904 	}
905       while (ws != NULL);
906     }
907   gomp_sem_destroy (&team->master_release);
908 
909   if (__builtin_expect (thr->ts.team != NULL, 0)
910       || __builtin_expect (team->nthreads == 1, 0))
911     free_team (team);
912   else
913     {
914       struct gomp_thread_pool *pool = thr->thread_pool;
915       if (pool->last_team)
916 	free_team (pool->last_team);
917       pool->last_team = team;
918       gomp_release_thread_pool (pool);
919     }
920 }
921 
922 
923 /* Constructors for this file.  */
924 
925 static void __attribute__((constructor))
926 initialize_team (void)
927 {
928 #if !defined HAVE_TLS && !defined USE_EMUTLS
929   static struct gomp_thread initial_thread_tls_data;
930 
931   pthread_key_create (&gomp_tls_key, NULL);
932   pthread_setspecific (gomp_tls_key, &initial_thread_tls_data);
933 #endif
934 
935   if (pthread_key_create (&gomp_thread_destructor, gomp_free_thread) != 0)
936     gomp_fatal ("could not create thread pool destructor.");
937 }
938 
939 static void __attribute__((destructor))
940 team_destructor (void)
941 {
942   /* Without this, dlclose on libgomp could lead to subsequent
943      crashes.  */
944   pthread_key_delete (gomp_thread_destructor);
945 }
946 
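/* Create a fresh implicit task (and thus a fresh set of ICVs) for the
   calling thread, register the thread destructor for it and return a
   pointer to the new task's ICVs.  Intended for threads that reach
   libgomp without an associated task.  */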
947 struct gomp_task_icv *
948 gomp_new_icv (void)
949 {
950   struct gomp_thread *thr = gomp_thread ();
951   struct gomp_task *task = gomp_malloc (sizeof (struct gomp_task));
952   gomp_init_task (task, NULL, &gomp_global_icv);
953   thr->task = task;
954   pthread_setspecific (gomp_thread_destructor, thr);
955   return &task->icv;
956 }
957