1*e4b17023SJohn Marino // -*- C++ -*- 2*e4b17023SJohn Marino 3*e4b17023SJohn Marino // Copyright (C) 2007, 2008, 2009, 2010 Free Software Foundation, Inc. 4*e4b17023SJohn Marino // 5*e4b17023SJohn Marino // This file is part of the GNU ISO C++ Library. This library is free 6*e4b17023SJohn Marino // software; you can redistribute it and/or modify it under the terms 7*e4b17023SJohn Marino // of the GNU General Public License as published by the Free Software 8*e4b17023SJohn Marino // Foundation; either version 3, or (at your option) any later 9*e4b17023SJohn Marino // version. 10*e4b17023SJohn Marino 11*e4b17023SJohn Marino // This library is distributed in the hope that it will be useful, but 12*e4b17023SJohn Marino // WITHOUT ANY WARRANTY; without even the implied warranty of 13*e4b17023SJohn Marino // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 14*e4b17023SJohn Marino // General Public License for more details. 15*e4b17023SJohn Marino 16*e4b17023SJohn Marino // Under Section 7 of GPL version 3, you are granted additional 17*e4b17023SJohn Marino // permissions described in the GCC Runtime Library Exception, version 18*e4b17023SJohn Marino // 3.1, as published by the Free Software Foundation. 19*e4b17023SJohn Marino 20*e4b17023SJohn Marino // You should have received a copy of the GNU General Public License and 21*e4b17023SJohn Marino // a copy of the GCC Runtime Library Exception along with this program; 22*e4b17023SJohn Marino // see the files COPYING3 and COPYING.RUNTIME respectively. If not, see 23*e4b17023SJohn Marino // <http://www.gnu.org/licenses/>. 24*e4b17023SJohn Marino 25*e4b17023SJohn Marino /** @file parallel/workstealing.h 26*e4b17023SJohn Marino * @brief Parallelization of embarrassingly parallel execution by 27*e4b17023SJohn Marino * means of work-stealing. 28*e4b17023SJohn Marino * 29*e4b17023SJohn Marino * Work stealing is described in 30*e4b17023SJohn Marino * 31*e4b17023SJohn Marino * R. D. Blumofe and C. E. Leiserson. 32*e4b17023SJohn Marino * Scheduling multithreaded computations by work stealing. 33*e4b17023SJohn Marino * Journal of the ACM, 46(5):720–748, 1999. 34*e4b17023SJohn Marino * 35*e4b17023SJohn Marino * This file is a GNU parallel extension to the Standard C++ Library. 36*e4b17023SJohn Marino */ 37*e4b17023SJohn Marino 38*e4b17023SJohn Marino // Written by Felix Putze. 39*e4b17023SJohn Marino 40*e4b17023SJohn Marino #ifndef _GLIBCXX_PARALLEL_WORKSTEALING_H 41*e4b17023SJohn Marino #define _GLIBCXX_PARALLEL_WORKSTEALING_H 1 42*e4b17023SJohn Marino 43*e4b17023SJohn Marino #include <parallel/parallel.h> 44*e4b17023SJohn Marino #include <parallel/random_number.h> 45*e4b17023SJohn Marino #include <parallel/compatibility.h> 46*e4b17023SJohn Marino 47*e4b17023SJohn Marino namespace __gnu_parallel 48*e4b17023SJohn Marino { 49*e4b17023SJohn Marino 50*e4b17023SJohn Marino #define _GLIBCXX_JOB_VOLATILE volatile 51*e4b17023SJohn Marino 52*e4b17023SJohn Marino /** @brief One __job for a certain thread. */ 53*e4b17023SJohn Marino template<typename _DifferenceTp> 54*e4b17023SJohn Marino struct _Job 55*e4b17023SJohn Marino { 56*e4b17023SJohn Marino typedef _DifferenceTp _DifferenceType; 57*e4b17023SJohn Marino 58*e4b17023SJohn Marino /** @brief First element. 59*e4b17023SJohn Marino * 60*e4b17023SJohn Marino * Changed by owning and stealing thread. By stealing thread, 61*e4b17023SJohn Marino * always incremented. */ 62*e4b17023SJohn Marino _GLIBCXX_JOB_VOLATILE _DifferenceType _M_first; 63*e4b17023SJohn Marino 64*e4b17023SJohn Marino /** @brief Last element. 65*e4b17023SJohn Marino * 66*e4b17023SJohn Marino * Changed by owning thread only. */ 67*e4b17023SJohn Marino _GLIBCXX_JOB_VOLATILE _DifferenceType _M_last; 68*e4b17023SJohn Marino 69*e4b17023SJohn Marino /** @brief Number of elements, i.e. @c _M_last-_M_first+1. 70*e4b17023SJohn Marino * 71*e4b17023SJohn Marino * Changed by owning thread only. */ 72*e4b17023SJohn Marino _GLIBCXX_JOB_VOLATILE _DifferenceType _M_load; 73*e4b17023SJohn Marino }; 74*e4b17023SJohn Marino 75*e4b17023SJohn Marino /** @brief Work stealing algorithm for random access iterators. 76*e4b17023SJohn Marino * 77*e4b17023SJohn Marino * Uses O(1) additional memory. Synchronization at job lists is 78*e4b17023SJohn Marino * done with atomic operations. 79*e4b17023SJohn Marino * @param __begin Begin iterator of element sequence. 80*e4b17023SJohn Marino * @param __end End iterator of element sequence. 81*e4b17023SJohn Marino * @param __op User-supplied functor (comparator, predicate, adding 82*e4b17023SJohn Marino * functor, ...). 83*e4b17023SJohn Marino * @param __f Functor to @a process an element with __op (depends on 84*e4b17023SJohn Marino * desired functionality, e. g. for std::for_each(), ...). 85*e4b17023SJohn Marino * @param __r Functor to @a add a single __result to the already 86*e4b17023SJohn Marino * processed elements (depends on functionality). 87*e4b17023SJohn Marino * @param __base Base value for reduction. 88*e4b17023SJohn Marino * @param __output Pointer to position where final result is written to 89*e4b17023SJohn Marino * @param __bound Maximum number of elements processed (e. g. for 90*e4b17023SJohn Marino * std::count_n()). 91*e4b17023SJohn Marino * @return User-supplied functor (that may contain a part of the result). 92*e4b17023SJohn Marino */ 93*e4b17023SJohn Marino template<typename _RAIter, 94*e4b17023SJohn Marino typename _Op, 95*e4b17023SJohn Marino typename _Fu, 96*e4b17023SJohn Marino typename _Red, 97*e4b17023SJohn Marino typename _Result> 98*e4b17023SJohn Marino _Op __for_each_template_random_access_workstealing(_RAIter __begin,_RAIter __end,_Op __op,_Fu & __f,_Red __r,_Result __base,_Result & __output,typename std::iterator_traits<_RAIter>::difference_type __bound)99*e4b17023SJohn Marino __for_each_template_random_access_workstealing(_RAIter __begin, 100*e4b17023SJohn Marino _RAIter __end, _Op __op, 101*e4b17023SJohn Marino _Fu& __f, _Red __r, 102*e4b17023SJohn Marino _Result __base, 103*e4b17023SJohn Marino _Result& __output, 104*e4b17023SJohn Marino typename std::iterator_traits<_RAIter>::difference_type __bound) 105*e4b17023SJohn Marino { 106*e4b17023SJohn Marino _GLIBCXX_CALL(__end - __begin) 107*e4b17023SJohn Marino 108*e4b17023SJohn Marino typedef std::iterator_traits<_RAIter> _TraitsType; 109*e4b17023SJohn Marino typedef typename _TraitsType::difference_type _DifferenceType; 110*e4b17023SJohn Marino 111*e4b17023SJohn Marino const _Settings& __s = _Settings::get(); 112*e4b17023SJohn Marino 113*e4b17023SJohn Marino _DifferenceType __chunk_size = 114*e4b17023SJohn Marino static_cast<_DifferenceType>(__s.workstealing_chunk_size); 115*e4b17023SJohn Marino 116*e4b17023SJohn Marino // How many jobs? 117*e4b17023SJohn Marino _DifferenceType __length = (__bound < 0) ? (__end - __begin) : __bound; 118*e4b17023SJohn Marino 119*e4b17023SJohn Marino // To avoid false sharing in a cache line. 120*e4b17023SJohn Marino const int __stride = (__s.cache_line_size * 10 121*e4b17023SJohn Marino / sizeof(_Job<_DifferenceType>) + 1); 122*e4b17023SJohn Marino 123*e4b17023SJohn Marino // Total number of threads currently working. 124*e4b17023SJohn Marino _ThreadIndex __busy = 0; 125*e4b17023SJohn Marino 126*e4b17023SJohn Marino _Job<_DifferenceType> *__job; 127*e4b17023SJohn Marino 128*e4b17023SJohn Marino omp_lock_t __output_lock; 129*e4b17023SJohn Marino omp_init_lock(&__output_lock); 130*e4b17023SJohn Marino 131*e4b17023SJohn Marino // Write base value to output. 132*e4b17023SJohn Marino __output = __base; 133*e4b17023SJohn Marino 134*e4b17023SJohn Marino // No more threads than jobs, at least one thread. 135*e4b17023SJohn Marino _ThreadIndex __num_threads = __gnu_parallel::max<_ThreadIndex> 136*e4b17023SJohn Marino (1, __gnu_parallel::min<_DifferenceType>(__length, 137*e4b17023SJohn Marino __get_max_threads())); 138*e4b17023SJohn Marino 139*e4b17023SJohn Marino # pragma omp parallel shared(__busy) num_threads(__num_threads) 140*e4b17023SJohn Marino { 141*e4b17023SJohn Marino # pragma omp single 142*e4b17023SJohn Marino { 143*e4b17023SJohn Marino __num_threads = omp_get_num_threads(); 144*e4b17023SJohn Marino 145*e4b17023SJohn Marino // Create job description array. 146*e4b17023SJohn Marino __job = new _Job<_DifferenceType>[__num_threads * __stride]; 147*e4b17023SJohn Marino } 148*e4b17023SJohn Marino 149*e4b17023SJohn Marino // Initialization phase. 150*e4b17023SJohn Marino 151*e4b17023SJohn Marino // Flags for every thread if it is doing productive work. 152*e4b17023SJohn Marino bool __iam_working = false; 153*e4b17023SJohn Marino 154*e4b17023SJohn Marino // Thread id. 155*e4b17023SJohn Marino _ThreadIndex __iam = omp_get_thread_num(); 156*e4b17023SJohn Marino 157*e4b17023SJohn Marino // This job. 158*e4b17023SJohn Marino _Job<_DifferenceType>& __my_job = __job[__iam * __stride]; 159*e4b17023SJohn Marino 160*e4b17023SJohn Marino // Random number (for work stealing). 161*e4b17023SJohn Marino _ThreadIndex __victim; 162*e4b17023SJohn Marino 163*e4b17023SJohn Marino // Local value for reduction. 164*e4b17023SJohn Marino _Result __result = _Result(); 165*e4b17023SJohn Marino 166*e4b17023SJohn Marino // Number of elements to steal in one attempt. 167*e4b17023SJohn Marino _DifferenceType __steal; 168*e4b17023SJohn Marino 169*e4b17023SJohn Marino // Every thread has its own random number generator 170*e4b17023SJohn Marino // (modulo __num_threads). 171*e4b17023SJohn Marino _RandomNumber __rand_gen(__iam, __num_threads); 172*e4b17023SJohn Marino 173*e4b17023SJohn Marino // This thread is currently working. 174*e4b17023SJohn Marino # pragma omp atomic 175*e4b17023SJohn Marino ++__busy; 176*e4b17023SJohn Marino 177*e4b17023SJohn Marino __iam_working = true; 178*e4b17023SJohn Marino 179*e4b17023SJohn Marino // How many jobs per thread? last thread gets the rest. 180*e4b17023SJohn Marino __my_job._M_first = static_cast<_DifferenceType> 181*e4b17023SJohn Marino (__iam * (__length / __num_threads)); 182*e4b17023SJohn Marino 183*e4b17023SJohn Marino __my_job._M_last = (__iam == (__num_threads - 1) 184*e4b17023SJohn Marino ? (__length - 1) 185*e4b17023SJohn Marino : ((__iam + 1) * (__length / __num_threads) - 1)); 186*e4b17023SJohn Marino __my_job._M_load = __my_job._M_last - __my_job._M_first + 1; 187*e4b17023SJohn Marino 188*e4b17023SJohn Marino // Init result with _M_first value (to have a base value for reduction) 189*e4b17023SJohn Marino if (__my_job._M_first <= __my_job._M_last) 190*e4b17023SJohn Marino { 191*e4b17023SJohn Marino // Cannot use volatile variable directly. 192*e4b17023SJohn Marino _DifferenceType __my_first = __my_job._M_first; 193*e4b17023SJohn Marino __result = __f(__op, __begin + __my_first); 194*e4b17023SJohn Marino ++__my_job._M_first; 195*e4b17023SJohn Marino --__my_job._M_load; 196*e4b17023SJohn Marino } 197*e4b17023SJohn Marino 198*e4b17023SJohn Marino _RAIter __current; 199*e4b17023SJohn Marino 200*e4b17023SJohn Marino # pragma omp barrier 201*e4b17023SJohn Marino 202*e4b17023SJohn Marino // Actual work phase 203*e4b17023SJohn Marino // Work on own or stolen current start 204*e4b17023SJohn Marino while (__busy > 0) 205*e4b17023SJohn Marino { 206*e4b17023SJohn Marino // Work until no productive thread left. 207*e4b17023SJohn Marino # pragma omp flush(__busy) 208*e4b17023SJohn Marino 209*e4b17023SJohn Marino // Thread has own work to do 210*e4b17023SJohn Marino while (__my_job._M_first <= __my_job._M_last) 211*e4b17023SJohn Marino { 212*e4b17023SJohn Marino // fetch-and-add call 213*e4b17023SJohn Marino // Reserve current job block (size __chunk_size) in my queue. 214*e4b17023SJohn Marino _DifferenceType __current_job = 215*e4b17023SJohn Marino __fetch_and_add<_DifferenceType>(&(__my_job._M_first), 216*e4b17023SJohn Marino __chunk_size); 217*e4b17023SJohn Marino 218*e4b17023SJohn Marino // Update _M_load, to make the three values consistent, 219*e4b17023SJohn Marino // _M_first might have been changed in the meantime 220*e4b17023SJohn Marino __my_job._M_load = __my_job._M_last - __my_job._M_first + 1; 221*e4b17023SJohn Marino for (_DifferenceType __job_counter = 0; 222*e4b17023SJohn Marino __job_counter < __chunk_size 223*e4b17023SJohn Marino && __current_job <= __my_job._M_last; 224*e4b17023SJohn Marino ++__job_counter) 225*e4b17023SJohn Marino { 226*e4b17023SJohn Marino // Yes: process it! 227*e4b17023SJohn Marino __current = __begin + __current_job; 228*e4b17023SJohn Marino ++__current_job; 229*e4b17023SJohn Marino 230*e4b17023SJohn Marino // Do actual work. 231*e4b17023SJohn Marino __result = __r(__result, __f(__op, __current)); 232*e4b17023SJohn Marino } 233*e4b17023SJohn Marino 234*e4b17023SJohn Marino # pragma omp flush(__busy) 235*e4b17023SJohn Marino } 236*e4b17023SJohn Marino 237*e4b17023SJohn Marino // After reaching this point, a thread's __job list is empty. 238*e4b17023SJohn Marino if (__iam_working) 239*e4b17023SJohn Marino { 240*e4b17023SJohn Marino // This thread no longer has work. 241*e4b17023SJohn Marino # pragma omp atomic 242*e4b17023SJohn Marino --__busy; 243*e4b17023SJohn Marino 244*e4b17023SJohn Marino __iam_working = false; 245*e4b17023SJohn Marino } 246*e4b17023SJohn Marino 247*e4b17023SJohn Marino _DifferenceType __supposed_first, __supposed_last, 248*e4b17023SJohn Marino __supposed_load; 249*e4b17023SJohn Marino do 250*e4b17023SJohn Marino { 251*e4b17023SJohn Marino // Find random nonempty deque (not own), do consistency check. 252*e4b17023SJohn Marino __yield(); 253*e4b17023SJohn Marino # pragma omp flush(__busy) 254*e4b17023SJohn Marino __victim = __rand_gen(); 255*e4b17023SJohn Marino __supposed_first = __job[__victim * __stride]._M_first; 256*e4b17023SJohn Marino __supposed_last = __job[__victim * __stride]._M_last; 257*e4b17023SJohn Marino __supposed_load = __job[__victim * __stride]._M_load; 258*e4b17023SJohn Marino } 259*e4b17023SJohn Marino while (__busy > 0 260*e4b17023SJohn Marino && ((__supposed_load <= 0) 261*e4b17023SJohn Marino || ((__supposed_first + __supposed_load - 1) 262*e4b17023SJohn Marino != __supposed_last))); 263*e4b17023SJohn Marino 264*e4b17023SJohn Marino if (__busy == 0) 265*e4b17023SJohn Marino break; 266*e4b17023SJohn Marino 267*e4b17023SJohn Marino if (__supposed_load > 0) 268*e4b17023SJohn Marino { 269*e4b17023SJohn Marino // Has work and work to do. 270*e4b17023SJohn Marino // Number of elements to steal (at least one). 271*e4b17023SJohn Marino __steal = (__supposed_load < 2) ? 1 : __supposed_load / 2; 272*e4b17023SJohn Marino 273*e4b17023SJohn Marino // Push __victim's current start forward. 274*e4b17023SJohn Marino _DifferenceType __stolen_first = 275*e4b17023SJohn Marino __fetch_and_add<_DifferenceType> 276*e4b17023SJohn Marino (&(__job[__victim * __stride]._M_first), __steal); 277*e4b17023SJohn Marino _DifferenceType __stolen_try = (__stolen_first + __steal 278*e4b17023SJohn Marino - _DifferenceType(1)); 279*e4b17023SJohn Marino 280*e4b17023SJohn Marino __my_job._M_first = __stolen_first; 281*e4b17023SJohn Marino __my_job._M_last = __gnu_parallel::min(__stolen_try, 282*e4b17023SJohn Marino __supposed_last); 283*e4b17023SJohn Marino __my_job._M_load = __my_job._M_last - __my_job._M_first + 1; 284*e4b17023SJohn Marino 285*e4b17023SJohn Marino // Has potential work again. 286*e4b17023SJohn Marino # pragma omp atomic 287*e4b17023SJohn Marino ++__busy; 288*e4b17023SJohn Marino __iam_working = true; 289*e4b17023SJohn Marino 290*e4b17023SJohn Marino # pragma omp flush(__busy) 291*e4b17023SJohn Marino } 292*e4b17023SJohn Marino # pragma omp flush(__busy) 293*e4b17023SJohn Marino } // end while __busy > 0 294*e4b17023SJohn Marino // Add accumulated result to output. 295*e4b17023SJohn Marino omp_set_lock(&__output_lock); 296*e4b17023SJohn Marino __output = __r(__output, __result); 297*e4b17023SJohn Marino omp_unset_lock(&__output_lock); 298*e4b17023SJohn Marino } 299*e4b17023SJohn Marino 300*e4b17023SJohn Marino delete[] __job; 301*e4b17023SJohn Marino 302*e4b17023SJohn Marino // Points to last element processed (needed as return value for 303*e4b17023SJohn Marino // some algorithms like transform) 304*e4b17023SJohn Marino __f._M_finish_iterator = __begin + __length; 305*e4b17023SJohn Marino 306*e4b17023SJohn Marino omp_destroy_lock(&__output_lock); 307*e4b17023SJohn Marino 308*e4b17023SJohn Marino return __op; 309*e4b17023SJohn Marino } 310*e4b17023SJohn Marino } // end namespace 311*e4b17023SJohn Marino 312*e4b17023SJohn Marino #endif /* _GLIBCXX_PARALLEL_WORKSTEALING_H */ 313