1*e4b17023SJohn Marino // -*- C++ -*- 2*e4b17023SJohn Marino 3*e4b17023SJohn Marino // Copyright (C) 2007, 2008, 2009, 2010 Free Software Foundation, Inc. 4*e4b17023SJohn Marino // 5*e4b17023SJohn Marino // This file is part of the GNU ISO C++ Library. This library is free 6*e4b17023SJohn Marino // software; you can redistribute it and/or modify it under the terms 7*e4b17023SJohn Marino // of the GNU General Public License as published by the Free Software 8*e4b17023SJohn Marino // Foundation; either version 3, or (at your option) any later 9*e4b17023SJohn Marino // version. 10*e4b17023SJohn Marino 11*e4b17023SJohn Marino // This library is distributed in the hope that it will be useful, but 12*e4b17023SJohn Marino // WITHOUT ANY WARRANTY; without even the implied warranty of 13*e4b17023SJohn Marino // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 14*e4b17023SJohn Marino // General Public License for more details. 15*e4b17023SJohn Marino 16*e4b17023SJohn Marino // Under Section 7 of GPL version 3, you are granted additional 17*e4b17023SJohn Marino // permissions described in the GCC Runtime Library Exception, version 18*e4b17023SJohn Marino // 3.1, as published by the Free Software Foundation. 19*e4b17023SJohn Marino 20*e4b17023SJohn Marino // You should have received a copy of the GNU General Public License and 21*e4b17023SJohn Marino // a copy of the GCC Runtime Library Exception along with this program; 22*e4b17023SJohn Marino // see the files COPYING3 and COPYING.RUNTIME respectively. If not, see 23*e4b17023SJohn Marino // <http://www.gnu.org/licenses/>. 24*e4b17023SJohn Marino 25*e4b17023SJohn Marino /** @file parallel/settings.h 26*e4b17023SJohn Marino * @brief Runtime settings and tuning parameters, heuristics to decide 27*e4b17023SJohn Marino * whether to use parallelized algorithms. 28*e4b17023SJohn Marino * This file is a GNU parallel extension to the Standard C++ Library. 29*e4b17023SJohn Marino * 30*e4b17023SJohn Marino * @section parallelization_decision 31*e4b17023SJohn Marino * The decision whether to run an algorithm in parallel. 32*e4b17023SJohn Marino * 33*e4b17023SJohn Marino * There are several ways the user can switch on and __off the parallel 34*e4b17023SJohn Marino * execution of an algorithm, both at compile- and run-time. 35*e4b17023SJohn Marino * 36*e4b17023SJohn Marino * Only sequential execution can be forced at compile-time. This 37*e4b17023SJohn Marino * reduces code size and protects code parts that have 38*e4b17023SJohn Marino * non-thread-safe side effects. 39*e4b17023SJohn Marino * 40*e4b17023SJohn Marino * Ultimately, forcing parallel execution at compile-time makes 41*e4b17023SJohn Marino * sense. Often, the sequential algorithm implementation is used as 42*e4b17023SJohn Marino * a subroutine, so no reduction in code size can be achieved. Also, 43*e4b17023SJohn Marino * the machine the program is run on might have only one processor 44*e4b17023SJohn Marino * core, so to avoid overhead, the algorithm is executed 45*e4b17023SJohn Marino * sequentially. 46*e4b17023SJohn Marino * 47*e4b17023SJohn Marino * To force sequential execution of an algorithm ultimately at 48*e4b17023SJohn Marino * compile-time, the user must add the tag 49*e4b17023SJohn Marino * gnu_parallel::sequential_tag() to the end of the parameter list, 50*e4b17023SJohn Marino * e. g. 51*e4b17023SJohn Marino * 52*e4b17023SJohn Marino * \code 53*e4b17023SJohn Marino * std::sort(__v.begin(), __v.end(), __gnu_parallel::sequential_tag()); 54*e4b17023SJohn Marino * \endcode 55*e4b17023SJohn Marino * 56*e4b17023SJohn Marino * This is compatible with all overloaded algorithm variants. No 57*e4b17023SJohn Marino * additional code will be instantiated, at all. The same holds for 58*e4b17023SJohn Marino * most algorithm calls with iterators not providing random access. 59*e4b17023SJohn Marino * 60*e4b17023SJohn Marino * If the algorithm call is not forced to be executed sequentially 61*e4b17023SJohn Marino * at compile-time, the decision is made at run-time. 62*e4b17023SJohn Marino * The global variable __gnu_parallel::_Settings::algorithm_strategy 63*e4b17023SJohn Marino * is checked. _It is a tristate variable corresponding to: 64*e4b17023SJohn Marino * 65*e4b17023SJohn Marino * a. force_sequential, meaning the sequential algorithm is executed. 66*e4b17023SJohn Marino * b. force_parallel, meaning the parallel algorithm is executed. 67*e4b17023SJohn Marino * c. heuristic 68*e4b17023SJohn Marino * 69*e4b17023SJohn Marino * For heuristic, the parallel algorithm implementation is called 70*e4b17023SJohn Marino * only if the input size is sufficiently large. For most 71*e4b17023SJohn Marino * algorithms, the input size is the (combined) length of the input 72*e4b17023SJohn Marino * sequence(__s). The threshold can be set by the user, individually 73*e4b17023SJohn Marino * for each algorithm. The according variables are called 74*e4b17023SJohn Marino * gnu_parallel::_Settings::[algorithm]_minimal_n . 75*e4b17023SJohn Marino * 76*e4b17023SJohn Marino * For some of the algorithms, there are even more tuning options, 77*e4b17023SJohn Marino * e. g. the ability to choose from multiple algorithm variants. See 78*e4b17023SJohn Marino * below for details. 79*e4b17023SJohn Marino */ 80*e4b17023SJohn Marino 81*e4b17023SJohn Marino // Written by Johannes Singler and Felix Putze. 82*e4b17023SJohn Marino 83*e4b17023SJohn Marino #ifndef _GLIBCXX_PARALLEL_SETTINGS_H 84*e4b17023SJohn Marino #define _GLIBCXX_PARALLEL_SETTINGS_H 1 85*e4b17023SJohn Marino 86*e4b17023SJohn Marino #include <parallel/types.h> 87*e4b17023SJohn Marino 88*e4b17023SJohn Marino /** 89*e4b17023SJohn Marino * @brief Determine at compile(?)-time if the parallel variant of an 90*e4b17023SJohn Marino * algorithm should be called. 91*e4b17023SJohn Marino * @param __c A condition that is convertible to bool that is overruled by 92*e4b17023SJohn Marino * __gnu_parallel::_Settings::algorithm_strategy. Usually a decision 93*e4b17023SJohn Marino * based on the input size. 94*e4b17023SJohn Marino */ 95*e4b17023SJohn Marino #define _GLIBCXX_PARALLEL_CONDITION(__c) \ 96*e4b17023SJohn Marino (__gnu_parallel::_Settings::get().algorithm_strategy \ 97*e4b17023SJohn Marino != __gnu_parallel::force_sequential \ 98*e4b17023SJohn Marino && ((__gnu_parallel::__get_max_threads() > 1 && (__c)) \ 99*e4b17023SJohn Marino || __gnu_parallel::_Settings::get().algorithm_strategy \ 100*e4b17023SJohn Marino == __gnu_parallel::force_parallel)) 101*e4b17023SJohn Marino 102*e4b17023SJohn Marino /* 103*e4b17023SJohn Marino inline bool 104*e4b17023SJohn Marino parallel_condition(bool __c) 105*e4b17023SJohn Marino { 106*e4b17023SJohn Marino bool ret = false; 107*e4b17023SJohn Marino const _Settings& __s = _Settings::get(); 108*e4b17023SJohn Marino if (__s.algorithm_strategy != force_seqential) 109*e4b17023SJohn Marino { 110*e4b17023SJohn Marino if (__s.algorithm_strategy == force_parallel) 111*e4b17023SJohn Marino ret = true; 112*e4b17023SJohn Marino else 113*e4b17023SJohn Marino ret = __get_max_threads() > 1 && __c; 114*e4b17023SJohn Marino } 115*e4b17023SJohn Marino return ret; 116*e4b17023SJohn Marino } 117*e4b17023SJohn Marino */ 118*e4b17023SJohn Marino 119*e4b17023SJohn Marino namespace __gnu_parallel 120*e4b17023SJohn Marino { 121*e4b17023SJohn Marino /// class _Settings 122*e4b17023SJohn Marino /// Run-time settings for the parallel mode including all tunable parameters. 123*e4b17023SJohn Marino struct _Settings 124*e4b17023SJohn Marino { 125*e4b17023SJohn Marino _AlgorithmStrategy algorithm_strategy; 126*e4b17023SJohn Marino 127*e4b17023SJohn Marino _SortAlgorithm sort_algorithm; 128*e4b17023SJohn Marino _PartialSumAlgorithm partial_sum_algorithm; 129*e4b17023SJohn Marino _MultiwayMergeAlgorithm multiway_merge_algorithm; 130*e4b17023SJohn Marino _FindAlgorithm find_algorithm; 131*e4b17023SJohn Marino 132*e4b17023SJohn Marino _SplittingAlgorithm sort_splitting; 133*e4b17023SJohn Marino _SplittingAlgorithm merge_splitting; 134*e4b17023SJohn Marino _SplittingAlgorithm multiway_merge_splitting; 135*e4b17023SJohn Marino 136*e4b17023SJohn Marino // Per-algorithm settings. 137*e4b17023SJohn Marino 138*e4b17023SJohn Marino /// Minimal input size for accumulate. 139*e4b17023SJohn Marino _SequenceIndex accumulate_minimal_n; 140*e4b17023SJohn Marino 141*e4b17023SJohn Marino /// Minimal input size for adjacent_difference. 142*e4b17023SJohn Marino unsigned int adjacent_difference_minimal_n; 143*e4b17023SJohn Marino 144*e4b17023SJohn Marino /// Minimal input size for count and count_if. 145*e4b17023SJohn Marino _SequenceIndex count_minimal_n; 146*e4b17023SJohn Marino 147*e4b17023SJohn Marino /// Minimal input size for fill. 148*e4b17023SJohn Marino _SequenceIndex fill_minimal_n; 149*e4b17023SJohn Marino 150*e4b17023SJohn Marino /// Block size increase factor for find. 151*e4b17023SJohn Marino double find_increasing_factor; 152*e4b17023SJohn Marino 153*e4b17023SJohn Marino /// Initial block size for find. 154*e4b17023SJohn Marino _SequenceIndex find_initial_block_size; 155*e4b17023SJohn Marino 156*e4b17023SJohn Marino /// Maximal block size for find. 157*e4b17023SJohn Marino _SequenceIndex find_maximum_block_size; 158*e4b17023SJohn Marino 159*e4b17023SJohn Marino /// Start with looking for this many elements sequentially, for find. 160*e4b17023SJohn Marino _SequenceIndex find_sequential_search_size; 161*e4b17023SJohn Marino 162*e4b17023SJohn Marino /// Minimal input size for for_each. 163*e4b17023SJohn Marino _SequenceIndex for_each_minimal_n; 164*e4b17023SJohn Marino 165*e4b17023SJohn Marino /// Minimal input size for generate. 166*e4b17023SJohn Marino _SequenceIndex generate_minimal_n; 167*e4b17023SJohn Marino 168*e4b17023SJohn Marino /// Minimal input size for max_element. 169*e4b17023SJohn Marino _SequenceIndex max_element_minimal_n; 170*e4b17023SJohn Marino 171*e4b17023SJohn Marino /// Minimal input size for merge. 172*e4b17023SJohn Marino _SequenceIndex merge_minimal_n; 173*e4b17023SJohn Marino 174*e4b17023SJohn Marino /// Oversampling factor for merge. 175*e4b17023SJohn Marino unsigned int merge_oversampling; 176*e4b17023SJohn Marino 177*e4b17023SJohn Marino /// Minimal input size for min_element. 178*e4b17023SJohn Marino _SequenceIndex min_element_minimal_n; 179*e4b17023SJohn Marino 180*e4b17023SJohn Marino /// Minimal input size for multiway_merge. 181*e4b17023SJohn Marino _SequenceIndex multiway_merge_minimal_n; 182*e4b17023SJohn Marino 183*e4b17023SJohn Marino /// Oversampling factor for multiway_merge. 184*e4b17023SJohn Marino int multiway_merge_minimal_k; 185*e4b17023SJohn Marino 186*e4b17023SJohn Marino /// Oversampling factor for multiway_merge. 187*e4b17023SJohn Marino unsigned int multiway_merge_oversampling; 188*e4b17023SJohn Marino 189*e4b17023SJohn Marino /// Minimal input size for nth_element. 190*e4b17023SJohn Marino _SequenceIndex nth_element_minimal_n; 191*e4b17023SJohn Marino 192*e4b17023SJohn Marino /// Chunk size for partition. 193*e4b17023SJohn Marino _SequenceIndex partition_chunk_size; 194*e4b17023SJohn Marino 195*e4b17023SJohn Marino /// Chunk size for partition, relative to input size. If > 0.0, 196*e4b17023SJohn Marino /// this value overrides partition_chunk_size. 197*e4b17023SJohn Marino double partition_chunk_share; 198*e4b17023SJohn Marino 199*e4b17023SJohn Marino /// Minimal input size for partition. 200*e4b17023SJohn Marino _SequenceIndex partition_minimal_n; 201*e4b17023SJohn Marino 202*e4b17023SJohn Marino /// Minimal input size for partial_sort. 203*e4b17023SJohn Marino _SequenceIndex partial_sort_minimal_n; 204*e4b17023SJohn Marino 205*e4b17023SJohn Marino /// Ratio for partial_sum. Assume "sum and write result" to be 206*e4b17023SJohn Marino /// this factor slower than just "sum". 207*e4b17023SJohn Marino float partial_sum_dilation; 208*e4b17023SJohn Marino 209*e4b17023SJohn Marino /// Minimal input size for partial_sum. 210*e4b17023SJohn Marino unsigned int partial_sum_minimal_n; 211*e4b17023SJohn Marino 212*e4b17023SJohn Marino /// Minimal input size for random_shuffle. 213*e4b17023SJohn Marino unsigned int random_shuffle_minimal_n; 214*e4b17023SJohn Marino 215*e4b17023SJohn Marino /// Minimal input size for replace and replace_if. 216*e4b17023SJohn Marino _SequenceIndex replace_minimal_n; 217*e4b17023SJohn Marino 218*e4b17023SJohn Marino /// Minimal input size for set_difference. 219*e4b17023SJohn Marino _SequenceIndex set_difference_minimal_n; 220*e4b17023SJohn Marino 221*e4b17023SJohn Marino /// Minimal input size for set_intersection. 222*e4b17023SJohn Marino _SequenceIndex set_intersection_minimal_n; 223*e4b17023SJohn Marino 224*e4b17023SJohn Marino /// Minimal input size for set_symmetric_difference. 225*e4b17023SJohn Marino _SequenceIndex set_symmetric_difference_minimal_n; 226*e4b17023SJohn Marino 227*e4b17023SJohn Marino /// Minimal input size for set_union. 228*e4b17023SJohn Marino _SequenceIndex set_union_minimal_n; 229*e4b17023SJohn Marino 230*e4b17023SJohn Marino /// Minimal input size for parallel sorting. 231*e4b17023SJohn Marino _SequenceIndex sort_minimal_n; 232*e4b17023SJohn Marino 233*e4b17023SJohn Marino /// Oversampling factor for parallel std::sort (MWMS). 234*e4b17023SJohn Marino unsigned int sort_mwms_oversampling; 235*e4b17023SJohn Marino 236*e4b17023SJohn Marino /// Such many samples to take to find a good pivot (quicksort). 237*e4b17023SJohn Marino unsigned int sort_qs_num_samples_preset; 238*e4b17023SJohn Marino 239*e4b17023SJohn Marino /// Maximal subsequence __length to switch to unbalanced __base case. 240*e4b17023SJohn Marino /// Applies to std::sort with dynamically load-balanced quicksort. 241*e4b17023SJohn Marino _SequenceIndex sort_qsb_base_case_maximal_n; 242*e4b17023SJohn Marino 243*e4b17023SJohn Marino /// Minimal input size for parallel std::transform. 244*e4b17023SJohn Marino _SequenceIndex transform_minimal_n; 245*e4b17023SJohn Marino 246*e4b17023SJohn Marino /// Minimal input size for unique_copy. 247*e4b17023SJohn Marino _SequenceIndex unique_copy_minimal_n; 248*e4b17023SJohn Marino 249*e4b17023SJohn Marino _SequenceIndex workstealing_chunk_size; 250*e4b17023SJohn Marino 251*e4b17023SJohn Marino // Hardware dependent tuning parameters. 252*e4b17023SJohn Marino 253*e4b17023SJohn Marino /// size of the L1 cache in bytes (underestimation). 254*e4b17023SJohn Marino unsigned long long L1_cache_size; 255*e4b17023SJohn Marino 256*e4b17023SJohn Marino /// size of the L2 cache in bytes (underestimation). 257*e4b17023SJohn Marino unsigned long long L2_cache_size; 258*e4b17023SJohn Marino 259*e4b17023SJohn Marino /// size of the Translation Lookaside Buffer (underestimation). 260*e4b17023SJohn Marino unsigned int TLB_size; 261*e4b17023SJohn Marino 262*e4b17023SJohn Marino /// Overestimation of cache line size. Used to avoid false 263*e4b17023SJohn Marino /// sharing, i.e. elements of different threads are at least this 264*e4b17023SJohn Marino /// amount apart. 265*e4b17023SJohn Marino unsigned int cache_line_size; 266*e4b17023SJohn Marino 267*e4b17023SJohn Marino // Statistics. 268*e4b17023SJohn Marino 269*e4b17023SJohn Marino /// The number of stolen ranges in load-balanced quicksort. 270*e4b17023SJohn Marino _SequenceIndex qsb_steals; 271*e4b17023SJohn Marino 272*e4b17023SJohn Marino /// Minimal input size for search and search_n. 273*e4b17023SJohn Marino _SequenceIndex search_minimal_n; 274*e4b17023SJohn Marino 275*e4b17023SJohn Marino /// Block size scale-down factor with respect to current position. 276*e4b17023SJohn Marino float find_scale_factor; 277*e4b17023SJohn Marino 278*e4b17023SJohn Marino /// Get the global settings. 279*e4b17023SJohn Marino _GLIBCXX_CONST static const _Settings& 280*e4b17023SJohn Marino get() throw(); 281*e4b17023SJohn Marino 282*e4b17023SJohn Marino /// Set the global settings. 283*e4b17023SJohn Marino static void 284*e4b17023SJohn Marino set(_Settings&) throw(); 285*e4b17023SJohn Marino 286*e4b17023SJohn Marino explicit _Settings_Settings287*e4b17023SJohn Marino _Settings() : 288*e4b17023SJohn Marino algorithm_strategy(heuristic), 289*e4b17023SJohn Marino sort_algorithm(MWMS), 290*e4b17023SJohn Marino partial_sum_algorithm(LINEAR), 291*e4b17023SJohn Marino multiway_merge_algorithm(LOSER_TREE), 292*e4b17023SJohn Marino find_algorithm(CONSTANT_SIZE_BLOCKS), 293*e4b17023SJohn Marino sort_splitting(EXACT), 294*e4b17023SJohn Marino merge_splitting(EXACT), 295*e4b17023SJohn Marino multiway_merge_splitting(EXACT), 296*e4b17023SJohn Marino accumulate_minimal_n(1000), 297*e4b17023SJohn Marino adjacent_difference_minimal_n(1000), 298*e4b17023SJohn Marino count_minimal_n(1000), 299*e4b17023SJohn Marino fill_minimal_n(1000), 300*e4b17023SJohn Marino find_increasing_factor(2.0), 301*e4b17023SJohn Marino find_initial_block_size(256), 302*e4b17023SJohn Marino find_maximum_block_size(8192), 303*e4b17023SJohn Marino find_sequential_search_size(256), 304*e4b17023SJohn Marino for_each_minimal_n(1000), 305*e4b17023SJohn Marino generate_minimal_n(1000), 306*e4b17023SJohn Marino max_element_minimal_n(1000), 307*e4b17023SJohn Marino merge_minimal_n(1000), 308*e4b17023SJohn Marino merge_oversampling(10), 309*e4b17023SJohn Marino min_element_minimal_n(1000), 310*e4b17023SJohn Marino multiway_merge_minimal_n(1000), 311*e4b17023SJohn Marino multiway_merge_minimal_k(2), multiway_merge_oversampling(10), 312*e4b17023SJohn Marino nth_element_minimal_n(1000), 313*e4b17023SJohn Marino partition_chunk_size(1000), 314*e4b17023SJohn Marino partition_chunk_share(0.0), 315*e4b17023SJohn Marino partition_minimal_n(1000), 316*e4b17023SJohn Marino partial_sort_minimal_n(1000), 317*e4b17023SJohn Marino partial_sum_dilation(1.0f), 318*e4b17023SJohn Marino partial_sum_minimal_n(1000), 319*e4b17023SJohn Marino random_shuffle_minimal_n(1000), 320*e4b17023SJohn Marino replace_minimal_n(1000), 321*e4b17023SJohn Marino set_difference_minimal_n(1000), 322*e4b17023SJohn Marino set_intersection_minimal_n(1000), 323*e4b17023SJohn Marino set_symmetric_difference_minimal_n(1000), 324*e4b17023SJohn Marino set_union_minimal_n(1000), 325*e4b17023SJohn Marino sort_minimal_n(1000), 326*e4b17023SJohn Marino sort_mwms_oversampling(10), 327*e4b17023SJohn Marino sort_qs_num_samples_preset(100), 328*e4b17023SJohn Marino sort_qsb_base_case_maximal_n(100), 329*e4b17023SJohn Marino transform_minimal_n(1000), 330*e4b17023SJohn Marino unique_copy_minimal_n(10000), 331*e4b17023SJohn Marino workstealing_chunk_size(100), 332*e4b17023SJohn Marino L1_cache_size(16 << 10), 333*e4b17023SJohn Marino L2_cache_size(256 << 10), 334*e4b17023SJohn Marino TLB_size(128), 335*e4b17023SJohn Marino cache_line_size(64), 336*e4b17023SJohn Marino qsb_steals(0), 337*e4b17023SJohn Marino search_minimal_n(1000), 338*e4b17023SJohn Marino find_scale_factor(0.01f) 339*e4b17023SJohn Marino { } 340*e4b17023SJohn Marino }; 341*e4b17023SJohn Marino } 342*e4b17023SJohn Marino 343*e4b17023SJohn Marino #endif /* _GLIBCXX_PARALLEL_SETTINGS_H */ 344