/* Array prefetching.
   Copyright (C) 2005, 2007, 2008, 2009, 2010, 2011
   Free Software Foundation, Inc.

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it
under the terms of the GNU General Public License as published by the
Free Software Foundation; either version 3, or (at your option) any
later version.

GCC is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */

#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "tm.h"
#include "tree.h"
#include "tm_p.h"
#include "basic-block.h"
#include "output.h"
#include "tree-pretty-print.h"
#include "tree-flow.h"
#include "tree-dump.h"
#include "timevar.h"
#include "cfgloop.h"
#include "tree-pass.h"
#include "insn-config.h"
#include "recog.h"
#include "hashtab.h"
#include "tree-chrec.h"
#include "tree-scalar-evolution.h"
#include "diagnostic-core.h"
#include "params.h"
#include "langhooks.h"
#include "tree-inline.h"
#include "tree-data-ref.h"


/* FIXME: Needed for optabs, but this should all be moved to a TBD interface
   between the GIMPLE and RTL worlds.  */
#include "expr.h"
#include "optabs.h"

/* This pass inserts prefetch instructions to optimize cache usage during
   accesses to arrays in loops.  It processes loops sequentially and:

   1) Gathers all memory references in the single loop.
   2) For each of the references it decides when it is profitable to prefetch
      it.  To do it, we evaluate the reuse among the accesses, and determine
      two values: PREFETCH_BEFORE (meaning that it only makes sense to do
      prefetching in the first PREFETCH_BEFORE iterations of the loop) and
      PREFETCH_MOD (meaning that it only makes sense to prefetch in the
      iterations of the loop that are zero modulo PREFETCH_MOD).  For example
      (assuming cache line size is 64 bytes, char has size 1 byte and there
      is no hardware sequential prefetch):

      char *a;
      for (i = 0; i < max; i++)
        {
          a[255] = ...;         (0)
          a[i] = ...;           (1)
          a[i + 64] = ...;      (2)
          a[16*i] = ...;        (3)
          a[187*i] = ...;       (4)
          a[187*i + 50] = ...;  (5)
        }

      (0) obviously has PREFETCH_BEFORE 1
      (1) has PREFETCH_BEFORE 64, since (2) accesses the same memory
          location 64 iterations before it, and PREFETCH_MOD 64 (since
          it hits the same cache line otherwise).
      (2) has PREFETCH_MOD 64
      (3) has PREFETCH_MOD 4
      (4) has PREFETCH_MOD 1.  We do not set PREFETCH_BEFORE here, since
          the cache line accessed by (5) is the same with probability only
          7/32.
      (5) has PREFETCH_MOD 1 as well.

      Additionally, we use data dependence analysis to determine for each
      reference the distance until the first reuse; this information is used
      to determine the temporality of the issued prefetch instruction.

   3) We determine how much ahead we need to prefetch.  The number of
      iterations needed is time to fetch / time spent in one iteration of
      the loop.  The problem is that we do not know either of these values,
      so we just make a heuristic guess based on a magic (possibly)
      target-specific constant and size of the loop.

   4) Determine which of the references we prefetch.  We take into account
      that there is a maximum number of simultaneous prefetches (provided
      by machine description).  We issue as many prefetches as possible
      while still within this bound (starting with those with lowest
      prefetch_mod, since they are responsible for most of the cache
      misses).

   5) We unroll and peel loops so that we are able to satisfy PREFETCH_MOD
      and PREFETCH_BEFORE requirements (within some bounds), and to avoid
      prefetching nonaccessed memory.
      TODO -- actually implement peeling.

   6) We actually emit the prefetch instructions.  ??? Perhaps emit the
      prefetch instructions with guards in cases where 5) was not sufficient
      to satisfy the constraints?

   A cost model is implemented to determine whether or not prefetching is
   profitable for a given loop.  The cost model has three heuristics:
   1. Function trip_count_to_ahead_ratio_too_small_p implements a
      heuristic that determines whether or not the loop has too few
      iterations (compared to ahead).  Prefetching is not likely to be
      beneficial if the trip count to ahead ratio is below a certain
      minimum.

   2. Function mem_ref_count_reasonable_p implements a heuristic that
      determines whether the given loop has enough CPU ops that can be
      overlapped with cache missing memory ops.  If not, the loop
      won't benefit from prefetching.  In the implementation,
      prefetching is not considered beneficial if the ratio between
      the instruction count and the mem ref count is below a certain
      minimum.

   3. Function insn_to_prefetch_ratio_too_small_p implements a
      heuristic that disables prefetching in a loop if the prefetching
      cost is above a certain limit.  The relative prefetching cost is
      estimated by taking the ratio between the prefetch count and the
      total instruction count (this models the I-cache cost).

   The limits used in these heuristics are defined as parameters with
   reasonable default values.  Machine-specific default values will be
   added later.

   Some other TODO:
      -- write and use more general reuse analysis (that could be also used
         in other cache-aimed loop optimizations)
      -- make it behave sanely together with the prefetches given by user
         (now we just ignore them; at the very least we should avoid
         optimizing loops in which the user put their own prefetches)
      -- we assume cache line size alignment of arrays; this could be
         improved.  */

/* Magic constants follow.  These should be replaced by machine specific
   numbers.  */

/* True if write can be prefetched by a read prefetch.  */

#ifndef WRITE_CAN_USE_READ_PREFETCH
#define WRITE_CAN_USE_READ_PREFETCH 1
#endif

/* True if read can be prefetched by a write prefetch.  */

#ifndef READ_CAN_USE_WRITE_PREFETCH
#define READ_CAN_USE_WRITE_PREFETCH 0
#endif

/* The size of the block loaded by a single prefetch.  Usually, this is
   the same as cache line size (at the moment, we only consider one level
   of cache hierarchy).  */

#ifndef PREFETCH_BLOCK
#define PREFETCH_BLOCK L1_CACHE_LINE_SIZE
#endif

/* Do we have forward hardware sequential prefetching?  */
#ifndef HAVE_FORWARD_PREFETCH
#define HAVE_FORWARD_PREFETCH 0
#endif

/* Do we have backward hardware sequential prefetching?  */

#ifndef HAVE_BACKWARD_PREFETCH
#define HAVE_BACKWARD_PREFETCH 0
#endif

/* In some cases we are only able to determine that there is a certain
   probability that the two accesses hit the same cache line.  In this
   case, we issue the prefetches for both of them if this probability
   is less than (1000 - ACCEPTABLE_MISS_RATE) per thousand.  */

#ifndef ACCEPTABLE_MISS_RATE
#define ACCEPTABLE_MISS_RATE 50
#endif

#ifndef HAVE_prefetch
#define HAVE_prefetch 0
#endif

#define L1_CACHE_SIZE_BYTES ((unsigned) (L1_CACHE_SIZE * 1024))
#define L2_CACHE_SIZE_BYTES ((unsigned) (L2_CACHE_SIZE * 1024))

/* We consider a memory access nontemporal if it is not reused sooner than
   after L2_CACHE_SIZE_BYTES of memory are accessed.  However, we ignore
   accesses closer than L1_CACHE_SIZE_BYTES / NONTEMPORAL_FRACTION,
   so that we use nontemporal prefetches e.g. if a single memory location
   is accessed several times in a single iteration of the loop.  */
#define NONTEMPORAL_FRACTION 16

/* In case we have to emit a memory fence instruction after the loop that
   uses nontemporal stores, this defines the builtin to use.  */

#ifndef FENCE_FOLLOWING_MOVNT
#define FENCE_FOLLOWING_MOVNT NULL_TREE
#endif

/* It is not profitable to prefetch when the trip count is not at
   least TRIP_COUNT_TO_AHEAD_RATIO times the prefetch ahead distance.
   For example, in a loop with a prefetch ahead distance of 10,
   supposing that TRIP_COUNT_TO_AHEAD_RATIO is equal to 4, it is
   profitable to prefetch when the trip count is greater than or equal
   to 40.  In that case, 30 out of the 40 iterations will benefit from
   prefetching.  */

#ifndef TRIP_COUNT_TO_AHEAD_RATIO
#define TRIP_COUNT_TO_AHEAD_RATIO 4
#endif

/* A group of references between which reuse may occur.  */

struct mem_ref_group
{
  tree base;                    /* Base of the reference.  */
  tree step;                    /* Step of the reference.  */
  struct mem_ref *refs;         /* References in the group.  */
  struct mem_ref_group *next;   /* Next group of references.  */
};

/* Assigned to PREFETCH_BEFORE when all iterations are to be prefetched.  */

#define PREFETCH_ALL (~(unsigned HOST_WIDE_INT) 0)

/* Do not generate a prefetch if the unroll factor is significantly less
   than what is required by the prefetch.  This is to avoid redundant
   prefetches.  For example, when prefetch_mod is 16 and unroll_factor is
   2, prefetching requires unrolling the loop 16 times, but
   the loop is actually unrolled twice.  In this case (ratio = 8),
   prefetching is not likely to be beneficial.  */

#ifndef PREFETCH_MOD_TO_UNROLL_FACTOR_RATIO
#define PREFETCH_MOD_TO_UNROLL_FACTOR_RATIO 4
#endif

/* Some of the prefetch computations have quadratic complexity.  We want to
   avoid huge compile times and, therefore, want to limit the amount of
   memory references per loop where we consider prefetching.  */

#ifndef PREFETCH_MAX_MEM_REFS_PER_LOOP
#define PREFETCH_MAX_MEM_REFS_PER_LOOP 200
#endif

/* The memory reference.  */

struct mem_ref
{
  gimple stmt;                  /* Statement in which the reference appears.  */
  tree mem;                     /* The reference.  */
  HOST_WIDE_INT delta;          /* Constant offset of the reference.  */
  struct mem_ref_group *group;  /* The group of references it belongs to.  */
  unsigned HOST_WIDE_INT prefetch_mod;
                                /* Prefetch only each PREFETCH_MOD-th
                                   iteration.  */
  unsigned HOST_WIDE_INT prefetch_before;
                                /* Prefetch only first PREFETCH_BEFORE
                                   iterations.  */
  unsigned reuse_distance;      /* The amount of data accessed before the first
                                   reuse of this value.  */
  struct mem_ref *next;         /* The next reference in the group.  */
  unsigned write_p : 1;         /* Is it a write?  */
  unsigned independent_p : 1;   /* True if the reference is independent of
                                   all other references inside the loop.  */
  unsigned issue_prefetch_p : 1; /* Should we really issue the prefetch?  */
  unsigned storent_p : 1;       /* True if we changed the store to a
                                   nontemporal one.  */
};

/* Dumps information about reference REF to FILE.  */

static void
dump_mem_ref (FILE *file, struct mem_ref *ref)
{
  fprintf (file, "Reference %p:\n", (void *) ref);

  fprintf (file, " group %p (base ", (void *) ref->group);
  print_generic_expr (file, ref->group->base, TDF_SLIM);
  fprintf (file, ", step ");
  if (cst_and_fits_in_hwi (ref->group->step))
    fprintf (file, HOST_WIDE_INT_PRINT_DEC, int_cst_value (ref->group->step));
  else
    print_generic_expr (file, ref->group->step, TDF_TREE);
  fprintf (file, ")\n");

  fprintf (file, " delta ");
  fprintf (file, HOST_WIDE_INT_PRINT_DEC, ref->delta);
  fprintf (file, "\n");

  fprintf (file, " %s\n", ref->write_p ? "write" : "read");

  fprintf (file, "\n");
}

/* Finds a group with BASE and STEP in GROUPS, or creates one if it does not
   exist.  */

static struct mem_ref_group *
find_or_create_group (struct mem_ref_group **groups, tree base, tree step)
{
  struct mem_ref_group *group;

  for (; *groups; groups = &(*groups)->next)
    {
      if (operand_equal_p ((*groups)->step, step, 0)
          && operand_equal_p ((*groups)->base, base, 0))
        return *groups;

      /* If step is an integer constant, keep the list of groups sorted
         by decreasing step.  */
      if (cst_and_fits_in_hwi ((*groups)->step) && cst_and_fits_in_hwi (step)
          && int_cst_value ((*groups)->step) < int_cst_value (step))
        break;
    }

  group = XNEW (struct mem_ref_group);
  group->base = base;
  group->step = step;
  group->refs = NULL;
  group->next = *groups;
  *groups = group;

  return group;
}

/* Records a memory reference MEM in GROUP with offset DELTA and write status
   WRITE_P.  The reference occurs in statement STMT.  */

static void
record_ref (struct mem_ref_group *group, gimple stmt, tree mem,
            HOST_WIDE_INT delta, bool write_p)
{
  struct mem_ref **aref;

  /* Do not record the same address twice.  */
  for (aref = &group->refs; *aref; aref = &(*aref)->next)
    {
      /* A write reference need not be able to reuse a read prefetch, or
         vice versa.  */
      if (!WRITE_CAN_USE_READ_PREFETCH
          && write_p
          && !(*aref)->write_p)
        continue;
      if (!READ_CAN_USE_WRITE_PREFETCH
          && !write_p
          && (*aref)->write_p)
        continue;

      if ((*aref)->delta == delta)
        return;
    }

  (*aref) = XNEW (struct mem_ref);
  (*aref)->stmt = stmt;
  (*aref)->mem = mem;
  (*aref)->delta = delta;
  (*aref)->write_p = write_p;
  (*aref)->prefetch_before = PREFETCH_ALL;
  (*aref)->prefetch_mod = 1;
  (*aref)->reuse_distance = 0;
  (*aref)->issue_prefetch_p = false;
  (*aref)->group = group;
  (*aref)->next = NULL;
  (*aref)->independent_p = false;
  (*aref)->storent_p = false;

  if (dump_file && (dump_flags & TDF_DETAILS))
    dump_mem_ref (dump_file, *aref);
}

/* Release memory references in GROUPS.  */

static void
release_mem_refs (struct mem_ref_group *groups)
{
  struct mem_ref_group *next_g;
  struct mem_ref *ref, *next_r;

  for (; groups; groups = next_g)
    {
      next_g = groups->next;
      for (ref = groups->refs; ref; ref = next_r)
        {
          next_r = ref->next;
          free (ref);
        }
      free (groups);
    }
}

/* A structure used to pass arguments to idx_analyze_ref.  */

struct ar_data
{
  struct loop *loop;            /* Loop of the reference.  */
  gimple stmt;                  /* Statement of the reference.  */
  tree *step;                   /* Step of the memory reference.  */
  HOST_WIDE_INT *delta;         /* Offset of the memory reference.  */
};

/* Analyzes a single INDEX of a memory reference to obtain information
   described at analyze_ref.  Callback for for_each_index.  */

static bool
idx_analyze_ref (tree base, tree *index, void *data)
{
  struct ar_data *ar_data = (struct ar_data *) data;
  tree ibase, step, stepsize;
  HOST_WIDE_INT idelta = 0, imult = 1;
  affine_iv iv;

  if (!simple_iv (ar_data->loop, loop_containing_stmt (ar_data->stmt),
                  *index, &iv, true))
    return false;
  ibase = iv.base;
  step = iv.step;

  if (TREE_CODE (ibase) == POINTER_PLUS_EXPR
      && cst_and_fits_in_hwi (TREE_OPERAND (ibase, 1)))
    {
      idelta = int_cst_value (TREE_OPERAND (ibase, 1));
      ibase = TREE_OPERAND (ibase, 0);
    }
  if (cst_and_fits_in_hwi (ibase))
    {
      idelta += int_cst_value (ibase);
      ibase = build_int_cst (TREE_TYPE (ibase), 0);
    }

  if (TREE_CODE (base) == ARRAY_REF)
    {
      stepsize = array_ref_element_size (base);
      if (!cst_and_fits_in_hwi (stepsize))
        return false;
      imult = int_cst_value (stepsize);
      step = fold_build2 (MULT_EXPR, sizetype,
                          fold_convert (sizetype, step),
                          fold_convert (sizetype, stepsize));
      idelta *= imult;
    }

  if (*ar_data->step == NULL_TREE)
    *ar_data->step = step;
  else
    *ar_data->step = fold_build2 (PLUS_EXPR, sizetype,
                                  fold_convert (sizetype, *ar_data->step),
                                  fold_convert (sizetype, step));
  *ar_data->delta += idelta;
  *index = ibase;

  return true;
}

/* Tries to express REF_P in shape &BASE + STEP * iter + DELTA, where DELTA and
   STEP are integer constants and iter is number of iterations of LOOP.  The
   reference occurs in statement STMT.  Strips nonaddressable component
   references from REF_P.  */

static bool
analyze_ref (struct loop *loop, tree *ref_p, tree *base,
             tree *step, HOST_WIDE_INT *delta,
             gimple stmt)
{
  struct ar_data ar_data;
  tree off;
  HOST_WIDE_INT bit_offset;
  tree ref = *ref_p;

  *step = NULL_TREE;
  *delta = 0;

  /* First strip off the component references.  Ignore bitfields.
     Also strip off the real and imaginary parts of a complex, so that
     they can have the same base.  */
  if (TREE_CODE (ref) == REALPART_EXPR
      || TREE_CODE (ref) == IMAGPART_EXPR
      || (TREE_CODE (ref) == COMPONENT_REF
          && DECL_NONADDRESSABLE_P (TREE_OPERAND (ref, 1))))
    {
      if (TREE_CODE (ref) == IMAGPART_EXPR)
        *delta += int_size_in_bytes (TREE_TYPE (ref));
      ref = TREE_OPERAND (ref, 0);
    }

  *ref_p = ref;

  for (; TREE_CODE (ref) == COMPONENT_REF; ref = TREE_OPERAND (ref, 0))
    {
      off = DECL_FIELD_BIT_OFFSET (TREE_OPERAND (ref, 1));
      bit_offset = TREE_INT_CST_LOW (off);
      gcc_assert (bit_offset % BITS_PER_UNIT == 0);

      *delta += bit_offset / BITS_PER_UNIT;
    }

  *base = unshare_expr (ref);
  ar_data.loop = loop;
  ar_data.stmt = stmt;
  ar_data.step = step;
  ar_data.delta = delta;
  return for_each_index (base, idx_analyze_ref, &ar_data);
}

/* Record a memory reference REF to the list REFS.  The reference occurs in
   LOOP in statement STMT and it is a write if WRITE_P.  Returns true if the
   reference was recorded, false otherwise.  */

static bool
gather_memory_references_ref (struct loop *loop, struct mem_ref_group **refs,
                              tree ref, bool write_p, gimple stmt)
{
  tree base, step;
  HOST_WIDE_INT delta;
  struct mem_ref_group *agrp;

  if (get_base_address (ref) == NULL)
    return false;

  if (!analyze_ref (loop, &ref, &base, &step, &delta, stmt))
    return false;
  /* If analyze_ref fails the default is a NULL_TREE.  We can stop here.  */
  if (step == NULL_TREE)
    return false;

  /* Stop if the address of BASE could not be taken.  */
  if (may_be_nonaddressable_p (base))
    return false;

  /* Limit non-constant step prefetching only to the innermost loops.  */
  if (!cst_and_fits_in_hwi (step) && loop->inner != NULL)
    return false;

  /* Now we know that REF = &BASE + STEP * iter + DELTA, where DELTA and STEP
     are integer constants.  */
  agrp = find_or_create_group (refs, base, step);
  record_ref (agrp, stmt, ref, delta, write_p);

  return true;
}

/* Record the suitable memory references in LOOP.  NO_OTHER_REFS is set to
   true if there are no other memory references inside the loop.  */

static struct mem_ref_group *
gather_memory_references (struct loop *loop, bool *no_other_refs, unsigned *ref_count)
{
  basic_block *body = get_loop_body_in_dom_order (loop);
  basic_block bb;
  unsigned i;
  gimple_stmt_iterator bsi;
  gimple stmt;
  tree lhs, rhs;
  struct mem_ref_group *refs = NULL;

  *no_other_refs = true;
  *ref_count = 0;

  /* Scan the loop body in order, so that the former references precede the
     later ones.  */
  for (i = 0; i < loop->num_nodes; i++)
    {
      bb = body[i];
      if (bb->loop_father != loop)
        continue;

      for (bsi = gsi_start_bb (bb); !gsi_end_p (bsi); gsi_next (&bsi))
        {
          stmt = gsi_stmt (bsi);

          if (gimple_code (stmt) != GIMPLE_ASSIGN)
            {
              if (gimple_vuse (stmt)
                  || (is_gimple_call (stmt)
                      && !(gimple_call_flags (stmt) & ECF_CONST)))
                *no_other_refs = false;
              continue;
            }

          lhs = gimple_assign_lhs (stmt);
          rhs = gimple_assign_rhs1 (stmt);

          if (REFERENCE_CLASS_P (rhs))
            {
              *no_other_refs &= gather_memory_references_ref (loop, &refs,
                                                              rhs, false, stmt);
              *ref_count += 1;
            }
          if (REFERENCE_CLASS_P (lhs))
            {
              *no_other_refs &= gather_memory_references_ref (loop, &refs,
                                                              lhs, true, stmt);
              *ref_count += 1;
            }
        }
    }
  free (body);

  return refs;
}

/* Prune the prefetch candidate REF using the self-reuse.  */

static void
prune_ref_by_self_reuse (struct mem_ref *ref)
{
  HOST_WIDE_INT step;
  bool backward;
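  /* For instance (illustrative values only): a unit-step char access such
     as a[i] with a 64-byte PREFETCH_BLOCK and no hardware sequential
     prefetch ends up below with prefetch_mod == 64 / 1 == 64, matching
     cases (1) and (2) in the overview comment at the top of this file.  */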
  /* If the step size is non constant, we cannot calculate prefetch_mod.  */
  if (!cst_and_fits_in_hwi (ref->group->step))
    return;

  step = int_cst_value (ref->group->step);

  backward = step < 0;

  if (step == 0)
    {
      /* Prefetch references to invariant address just once.  */
      ref->prefetch_before = 1;
      return;
    }

  if (backward)
    step = -step;

  if (step > PREFETCH_BLOCK)
    return;

  if ((backward && HAVE_BACKWARD_PREFETCH)
      || (!backward && HAVE_FORWARD_PREFETCH))
    {
      ref->prefetch_before = 1;
      return;
    }

  ref->prefetch_mod = PREFETCH_BLOCK / step;
}

/* Divides X by BY, rounding down.  */

static HOST_WIDE_INT
ddown (HOST_WIDE_INT x, unsigned HOST_WIDE_INT by)
{
  gcc_assert (by > 0);

  if (x >= 0)
    return x / by;
  else
    return (x + by - 1) / by;
}

/* Given a CACHE_LINE_SIZE and two inductive memory references
   with a common STEP greater than CACHE_LINE_SIZE and an address
   difference DELTA, compute the probability that they will fall
   in different cache lines.  Return true if the computed miss rate
   is not greater than the ACCEPTABLE_MISS_RATE.  DISTINCT_ITERS is the
   number of distinct iterations after which the pattern repeats itself.
   ALIGN_UNIT is the unit of alignment in bytes.  */

static bool
is_miss_rate_acceptable (unsigned HOST_WIDE_INT cache_line_size,
                         HOST_WIDE_INT step, HOST_WIDE_INT delta,
                         unsigned HOST_WIDE_INT distinct_iters,
                         int align_unit)
{
  unsigned align, iter;
  int total_positions, miss_positions, max_allowed_miss_positions;
  int address1, address2, cache_line1, cache_line2;
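  /* Worked example (illustrative values only): with cache_line_size == 64,
     align_unit == 4 and distinct_iters == 2, the loops below test
     64 / 4 * 2 == 32 (alignment, iteration) pairs, and with
     ACCEPTABLE_MISS_RATE == 50 at most 50 * 32 / 1000 == 1 of those pairs
     may have its two addresses in different cache lines.  */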
  /* It always misses if delta is greater than or equal to the cache
     line size.  */
  if (delta >= (HOST_WIDE_INT) cache_line_size)
    return false;

  miss_positions = 0;
  total_positions = (cache_line_size / align_unit) * distinct_iters;
  max_allowed_miss_positions = (ACCEPTABLE_MISS_RATE * total_positions) / 1000;

  /* Iterate through all possible alignments of the first
     memory reference within its cache line.  */
  for (align = 0; align < cache_line_size; align += align_unit)

    /* Iterate through all distinct iterations.  */
    for (iter = 0; iter < distinct_iters; iter++)
      {
        address1 = align + step * iter;
        address2 = address1 + delta;
        cache_line1 = address1 / cache_line_size;
        cache_line2 = address2 / cache_line_size;
        if (cache_line1 != cache_line2)
          {
            miss_positions += 1;
            if (miss_positions > max_allowed_miss_positions)
              return false;
          }
      }
  return true;
}

/* Prune the prefetch candidate REF using the reuse with BY.
   If BY_IS_BEFORE is true, BY is before REF in the loop.  */

static void
prune_ref_by_group_reuse (struct mem_ref *ref, struct mem_ref *by,
                          bool by_is_before)
{
  HOST_WIDE_INT step;
  bool backward;
  HOST_WIDE_INT delta_r = ref->delta, delta_b = by->delta;
  HOST_WIDE_INT delta = delta_b - delta_r;
  HOST_WIDE_INT hit_from;
  unsigned HOST_WIDE_INT prefetch_before, prefetch_block;
  HOST_WIDE_INT reduced_step;
  unsigned HOST_WIDE_INT reduced_prefetch_block;
  tree ref_type;
  int align_unit;

  /* If the step is non constant we cannot calculate prefetch_before.  */
  if (!cst_and_fits_in_hwi (ref->group->step))
    return;

  step = int_cst_value (ref->group->step);

  backward = step < 0;

  if (delta == 0)
    {
      /* If the references have the same address, only prefetch the
         former.  */
      if (by_is_before)
        ref->prefetch_before = 0;

      return;
    }

  if (!step)
    {
      /* If the reference addresses are invariant and fall into the
         same cache line, prefetch just the first one.  */
      if (!by_is_before)
        return;

      if (ddown (ref->delta, PREFETCH_BLOCK)
          != ddown (by->delta, PREFETCH_BLOCK))
        return;

      ref->prefetch_before = 0;
      return;
    }

  /* Only prune the reference that is behind in the array.  */
  if (backward)
    {
      if (delta > 0)
        return;

      /* Transform the data so that we may assume that the accesses
         are forward.  */
      delta = - delta;
      step = -step;
      delta_r = PREFETCH_BLOCK - 1 - delta_r;
      delta_b = PREFETCH_BLOCK - 1 - delta_b;
    }
  else
    {
      if (delta < 0)
        return;
    }

  /* Check whether the two references are likely to hit the same cache
     line, and how far apart the iterations in which that occurs are from
     each other.  */

  if (step <= PREFETCH_BLOCK)
    {
      /* The accesses are sure to meet.  Let us check when.  */
      hit_from = ddown (delta_b, PREFETCH_BLOCK) * PREFETCH_BLOCK;
      prefetch_before = (hit_from - delta_r + step - 1) / step;

      /* Do not reduce prefetch_before if we meet beyond cache size.  */
      if (prefetch_before > absu_hwi (L2_CACHE_SIZE_BYTES / step))
        prefetch_before = PREFETCH_ALL;
      if (prefetch_before < ref->prefetch_before)
        ref->prefetch_before = prefetch_before;

      return;
    }

  /* A more complicated case with step > prefetch_block.  First reduce
     the ratio between the step and the cache line size to its simplest
     terms.  The resulting denominator will then represent the number of
     distinct iterations after which each address will go back to its
     initial location within the cache line.  This computation assumes
     that PREFETCH_BLOCK is a power of two.  */
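  /* For instance (illustrative values only): with PREFETCH_BLOCK == 64 and
     step == 96, the loop below reduces the pair (96, 64) to (3, 2), i.e.
     the placement of the accesses within a cache line repeats after two
     distinct iterations.  */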
  prefetch_block = PREFETCH_BLOCK;
  reduced_prefetch_block = prefetch_block;
  reduced_step = step;
  while ((reduced_step & 1) == 0
         && reduced_prefetch_block > 1)
    {
      reduced_step >>= 1;
      reduced_prefetch_block >>= 1;
    }

  prefetch_before = delta / step;
  delta %= step;
  ref_type = TREE_TYPE (ref->mem);
  align_unit = TYPE_ALIGN (ref_type) / 8;
  if (is_miss_rate_acceptable (prefetch_block, step, delta,
                               reduced_prefetch_block, align_unit))
    {
      /* Do not reduce prefetch_before if we meet beyond cache size.  */
      if (prefetch_before > L2_CACHE_SIZE_BYTES / PREFETCH_BLOCK)
        prefetch_before = PREFETCH_ALL;
      if (prefetch_before < ref->prefetch_before)
        ref->prefetch_before = prefetch_before;

      return;
    }

  /* Try also the following iteration.  */
  prefetch_before++;
  delta = step - delta;
  if (is_miss_rate_acceptable (prefetch_block, step, delta,
                               reduced_prefetch_block, align_unit))
    {
      if (prefetch_before < ref->prefetch_before)
        ref->prefetch_before = prefetch_before;

      return;
    }

  /* The ref probably does not reuse by.  */
  return;
}

/* Prune the prefetch candidate REF using the reuses with other references
   in REFS.  */

static void
prune_ref_by_reuse (struct mem_ref *ref, struct mem_ref *refs)
{
  struct mem_ref *prune_by;
  bool before = true;

  prune_ref_by_self_reuse (ref);

  for (prune_by = refs; prune_by; prune_by = prune_by->next)
    {
      if (prune_by == ref)
        {
          before = false;
          continue;
        }

      if (!WRITE_CAN_USE_READ_PREFETCH
          && ref->write_p
          && !prune_by->write_p)
        continue;
      if (!READ_CAN_USE_WRITE_PREFETCH
          && !ref->write_p
          && prune_by->write_p)
        continue;

      prune_ref_by_group_reuse (ref, prune_by, before);
    }
}

/* Prune the prefetch candidates in GROUP using the reuse analysis.  */

static void
prune_group_by_reuse (struct mem_ref_group *group)
{
  struct mem_ref *ref_pruned;

  for (ref_pruned = group->refs; ref_pruned; ref_pruned = ref_pruned->next)
    {
      prune_ref_by_reuse (ref_pruned, group->refs);

      if (dump_file && (dump_flags & TDF_DETAILS))
        {
          fprintf (dump_file, "Reference %p:", (void *) ref_pruned);

          if (ref_pruned->prefetch_before == PREFETCH_ALL
              && ref_pruned->prefetch_mod == 1)
            fprintf (dump_file, " no restrictions");
          else if (ref_pruned->prefetch_before == 0)
            fprintf (dump_file, " do not prefetch");
          else if (ref_pruned->prefetch_before <= ref_pruned->prefetch_mod)
            fprintf (dump_file, " prefetch once");
          else
            {
              if (ref_pruned->prefetch_before != PREFETCH_ALL)
                {
                  fprintf (dump_file, " prefetch before ");
                  fprintf (dump_file, HOST_WIDE_INT_PRINT_DEC,
                           ref_pruned->prefetch_before);
                }
              if (ref_pruned->prefetch_mod != 1)
                {
                  fprintf (dump_file, " prefetch mod ");
                  fprintf (dump_file, HOST_WIDE_INT_PRINT_DEC,
                           ref_pruned->prefetch_mod);
                }
            }
          fprintf (dump_file, "\n");
        }
    }
}

/* Prune the list of prefetch candidates GROUPS using the reuse analysis.  */

static void
prune_by_reuse (struct mem_ref_group *groups)
{
  for (; groups; groups = groups->next)
    prune_group_by_reuse (groups);
}

/* Returns true if we should issue prefetch for REF.  */

static bool
should_issue_prefetch_p (struct mem_ref *ref)
{
  /* For now do not issue prefetches for references that would only be
     prefetched in the first few iterations.  */
  if (ref->prefetch_before != PREFETCH_ALL)
    {
      if (dump_file && (dump_flags & TDF_DETAILS))
        fprintf (dump_file, "Ignoring %p due to prefetch_before\n",
                 (void *) ref);
      return false;
    }

  /* Do not prefetch nontemporal stores.  */
  if (ref->storent_p)
    {
      if (dump_file && (dump_flags & TDF_DETAILS))
        fprintf (dump_file, "Ignoring nontemporal store %p\n", (void *) ref);
      return false;
    }

  return true;
}

/* Decide which of the prefetch candidates in GROUPS to prefetch.
   AHEAD is the number of iterations to prefetch ahead (which corresponds
   to the number of simultaneous instances of one prefetch running at a
   time).  UNROLL_FACTOR is the factor by which the loop is going to be
   unrolled.  Returns true if there is anything to prefetch.  */

static bool
schedule_prefetches (struct mem_ref_group *groups, unsigned unroll_factor,
                     unsigned ahead)
{
  unsigned remaining_prefetch_slots, n_prefetches, prefetch_slots;
  unsigned slots_per_prefetch;
  struct mem_ref *ref;
  bool any = false;

  /* At most SIMULTANEOUS_PREFETCHES should be running at the same time.  */
  remaining_prefetch_slots = SIMULTANEOUS_PREFETCHES;

  /* The prefetch will run for AHEAD iterations of the original loop, i.e.,
     AHEAD / UNROLL_FACTOR iterations of the unrolled loop.  In each iteration,
     it will need a prefetch slot.  */
  slots_per_prefetch = (ahead + unroll_factor / 2) / unroll_factor;
  if (dump_file && (dump_flags & TDF_DETAILS))
    fprintf (dump_file, "Each prefetch instruction takes %u prefetch slots.\n",
             slots_per_prefetch);

  /* For now we just take memory references one by one and issue
     prefetches for as many as possible.  The groups are sorted
     starting with the largest step, since the references with
     large step are more likely to cause many cache misses.  */

  for (; groups; groups = groups->next)
    for (ref = groups->refs; ref; ref = ref->next)
      {
        if (!should_issue_prefetch_p (ref))
          continue;

        /* The loop is far from being sufficiently unrolled for this
           prefetch.  Do not generate a prefetch, to avoid many redundant
           prefetches.  */
        if (ref->prefetch_mod / unroll_factor > PREFETCH_MOD_TO_UNROLL_FACTOR_RATIO)
          continue;

        /* If we need to prefetch the reference each PREFETCH_MOD iterations,
           and we unroll the loop UNROLL_FACTOR times, we need to insert
           ceil (UNROLL_FACTOR / PREFETCH_MOD) instructions in each
           iteration.  */
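        /* For instance (illustrative values only): with unroll_factor == 4
           and prefetch_mod == 3, the computation below gives
           n_prefetches == (4 + 3 - 1) / 3 == 2 prefetch instructions per
           unrolled loop body.  */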
        n_prefetches = ((unroll_factor + ref->prefetch_mod - 1)
                        / ref->prefetch_mod);
        prefetch_slots = n_prefetches * slots_per_prefetch;

        /* If more than half of the prefetches would be lost anyway, do not
           issue the prefetch.  */
        if (2 * remaining_prefetch_slots < prefetch_slots)
          continue;

        ref->issue_prefetch_p = true;

        if (remaining_prefetch_slots <= prefetch_slots)
          return true;
        remaining_prefetch_slots -= prefetch_slots;
        any = true;
      }

  return any;
}

/* Return TRUE if no prefetch is going to be generated in the given
   GROUPS.  */

static bool
nothing_to_prefetch_p (struct mem_ref_group *groups)
{
  struct mem_ref *ref;

  for (; groups; groups = groups->next)
    for (ref = groups->refs; ref; ref = ref->next)
      if (should_issue_prefetch_p (ref))
        return false;

  return true;
}

/* Estimate the number of prefetches in the given GROUPS.
   UNROLL_FACTOR is the factor by which LOOP was unrolled.  */

static int
estimate_prefetch_count (struct mem_ref_group *groups, unsigned unroll_factor)
{
  struct mem_ref *ref;
  unsigned n_prefetches;
  int prefetch_count = 0;

  for (; groups; groups = groups->next)
    for (ref = groups->refs; ref; ref = ref->next)
      if (should_issue_prefetch_p (ref))
        {
          n_prefetches = ((unroll_factor + ref->prefetch_mod - 1)
                          / ref->prefetch_mod);
          prefetch_count += n_prefetches;
        }

  return prefetch_count;
}

/* Issue prefetches for the reference REF into loop as decided before.
   AHEAD is the number of iterations to prefetch ahead.  UNROLL_FACTOR
   is the factor by which LOOP was unrolled.  */

static void
issue_prefetch_ref (struct mem_ref *ref, unsigned unroll_factor, unsigned ahead)
{
  HOST_WIDE_INT delta;
  tree addr, addr_base, write_p, local, forward;
  gimple prefetch;
  gimple_stmt_iterator bsi;
  unsigned n_prefetches, ap;
  bool nontemporal = ref->reuse_distance >= L2_CACHE_SIZE_BYTES;

  if (dump_file && (dump_flags & TDF_DETAILS))
    fprintf (dump_file, "Issued%s prefetch for %p.\n",
             nontemporal ? " nontemporal" : "",
             (void *) ref);

  bsi = gsi_for_stmt (ref->stmt);

  n_prefetches = ((unroll_factor + ref->prefetch_mod - 1)
                  / ref->prefetch_mod);
  addr_base = build_fold_addr_expr_with_type (ref->mem, ptr_type_node);
  addr_base = force_gimple_operand_gsi (&bsi, unshare_expr (addr_base),
                                        true, NULL, true, GSI_SAME_STMT);
  write_p = ref->write_p ? integer_one_node : integer_zero_node;
  local = nontemporal ? integer_zero_node : integer_three_node;

  for (ap = 0; ap < n_prefetches; ap++)
    {
      if (cst_and_fits_in_hwi (ref->group->step))
        {
          /* Determine the address to prefetch.  */
          delta = (ahead + ap * ref->prefetch_mod) *
                  int_cst_value (ref->group->step);
          addr = fold_build_pointer_plus_hwi (addr_base, delta);
          addr = force_gimple_operand_gsi (&bsi, unshare_expr (addr), true, NULL,
                                           true, GSI_SAME_STMT);
        }
      else
        {
          /* The step size is non-constant but loop-invariant.  As a
             heuristic, we simply prefetch AHEAD iterations ahead.  */
          forward = fold_build2 (MULT_EXPR, sizetype,
                                 fold_convert (sizetype, ref->group->step),
                                 fold_convert (sizetype, size_int (ahead)));
          addr = fold_build_pointer_plus (addr_base, forward);
          addr = force_gimple_operand_gsi (&bsi, unshare_expr (addr), true,
                                           NULL, true, GSI_SAME_STMT);
        }
      /* Create the prefetch instruction.  */
      prefetch = gimple_build_call (builtin_decl_explicit (BUILT_IN_PREFETCH),
                                    3, addr, write_p, local);
      gsi_insert_before (&bsi, prefetch, GSI_SAME_STMT);
    }
}

/* Issue prefetches for the references in GROUPS into loop as decided before.
   AHEAD is the number of iterations to prefetch ahead.  UNROLL_FACTOR is the
   factor by which LOOP was unrolled.  */
*/ 1129*e4b17023SJohn Marino 1130*e4b17023SJohn Marino static void 1131*e4b17023SJohn Marino issue_prefetches (struct mem_ref_group *groups, 1132*e4b17023SJohn Marino unsigned unroll_factor, unsigned ahead) 1133*e4b17023SJohn Marino { 1134*e4b17023SJohn Marino struct mem_ref *ref; 1135*e4b17023SJohn Marino 1136*e4b17023SJohn Marino for (; groups; groups = groups->next) 1137*e4b17023SJohn Marino for (ref = groups->refs; ref; ref = ref->next) 1138*e4b17023SJohn Marino if (ref->issue_prefetch_p) 1139*e4b17023SJohn Marino issue_prefetch_ref (ref, unroll_factor, ahead); 1140*e4b17023SJohn Marino } 1141*e4b17023SJohn Marino 1142*e4b17023SJohn Marino /* Returns true if REF is a memory write for that a nontemporal store insn 1143*e4b17023SJohn Marino can be used. */ 1144*e4b17023SJohn Marino 1145*e4b17023SJohn Marino static bool 1146*e4b17023SJohn Marino nontemporal_store_p (struct mem_ref *ref) 1147*e4b17023SJohn Marino { 1148*e4b17023SJohn Marino enum machine_mode mode; 1149*e4b17023SJohn Marino enum insn_code code; 1150*e4b17023SJohn Marino 1151*e4b17023SJohn Marino /* REF must be a write that is not reused. We require it to be independent 1152*e4b17023SJohn Marino on all other memory references in the loop, as the nontemporal stores may 1153*e4b17023SJohn Marino be reordered with respect to other memory references. */ 1154*e4b17023SJohn Marino if (!ref->write_p 1155*e4b17023SJohn Marino || !ref->independent_p 1156*e4b17023SJohn Marino || ref->reuse_distance < L2_CACHE_SIZE_BYTES) 1157*e4b17023SJohn Marino return false; 1158*e4b17023SJohn Marino 1159*e4b17023SJohn Marino /* Check that we have the storent instruction for the mode. */ 1160*e4b17023SJohn Marino mode = TYPE_MODE (TREE_TYPE (ref->mem)); 1161*e4b17023SJohn Marino if (mode == BLKmode) 1162*e4b17023SJohn Marino return false; 1163*e4b17023SJohn Marino 1164*e4b17023SJohn Marino code = optab_handler (storent_optab, mode); 1165*e4b17023SJohn Marino return code != CODE_FOR_nothing; 1166*e4b17023SJohn Marino } 1167*e4b17023SJohn Marino 1168*e4b17023SJohn Marino /* If REF is a nontemporal store, we mark the corresponding modify statement 1169*e4b17023SJohn Marino and return true. Otherwise, we return false. */ 1170*e4b17023SJohn Marino 1171*e4b17023SJohn Marino static bool 1172*e4b17023SJohn Marino mark_nontemporal_store (struct mem_ref *ref) 1173*e4b17023SJohn Marino { 1174*e4b17023SJohn Marino if (!nontemporal_store_p (ref)) 1175*e4b17023SJohn Marino return false; 1176*e4b17023SJohn Marino 1177*e4b17023SJohn Marino if (dump_file && (dump_flags & TDF_DETAILS)) 1178*e4b17023SJohn Marino fprintf (dump_file, "Marked reference %p as a nontemporal store.\n", 1179*e4b17023SJohn Marino (void *) ref); 1180*e4b17023SJohn Marino 1181*e4b17023SJohn Marino gimple_assign_set_nontemporal_move (ref->stmt, true); 1182*e4b17023SJohn Marino ref->storent_p = true; 1183*e4b17023SJohn Marino 1184*e4b17023SJohn Marino return true; 1185*e4b17023SJohn Marino } 1186*e4b17023SJohn Marino 1187*e4b17023SJohn Marino /* Issue a memory fence instruction after LOOP. 
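   Nontemporal stores may be weakly ordered with respect to other memory
   accesses, so on targets that define FENCE_FOLLOWING_MOVNT a call to that
   function is placed on every exit edge of the loop.  Conceptually (a
   hand-written sketch, not literal output of this pass):

     for (i = 0; i < n; i++)
       a[i] = 0;          (stores marked nontemporal)
     fence ();            (call to FENCE_FOLLOWING_MOVNT on the exit edge)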
*/ 1188*e4b17023SJohn Marino 1189*e4b17023SJohn Marino static void 1190*e4b17023SJohn Marino emit_mfence_after_loop (struct loop *loop) 1191*e4b17023SJohn Marino { 1192*e4b17023SJohn Marino VEC (edge, heap) *exits = get_loop_exit_edges (loop); 1193*e4b17023SJohn Marino edge exit; 1194*e4b17023SJohn Marino gimple call; 1195*e4b17023SJohn Marino gimple_stmt_iterator bsi; 1196*e4b17023SJohn Marino unsigned i; 1197*e4b17023SJohn Marino 1198*e4b17023SJohn Marino FOR_EACH_VEC_ELT (edge, exits, i, exit) 1199*e4b17023SJohn Marino { 1200*e4b17023SJohn Marino call = gimple_build_call (FENCE_FOLLOWING_MOVNT, 0); 1201*e4b17023SJohn Marino 1202*e4b17023SJohn Marino if (!single_pred_p (exit->dest) 1203*e4b17023SJohn Marino /* If possible, we prefer not to insert the fence on other paths 1204*e4b17023SJohn Marino in cfg. */ 1205*e4b17023SJohn Marino && !(exit->flags & EDGE_ABNORMAL)) 1206*e4b17023SJohn Marino split_loop_exit_edge (exit); 1207*e4b17023SJohn Marino bsi = gsi_after_labels (exit->dest); 1208*e4b17023SJohn Marino 1209*e4b17023SJohn Marino gsi_insert_before (&bsi, call, GSI_NEW_STMT); 1210*e4b17023SJohn Marino mark_virtual_ops_for_renaming (call); 1211*e4b17023SJohn Marino } 1212*e4b17023SJohn Marino 1213*e4b17023SJohn Marino VEC_free (edge, heap, exits); 1214*e4b17023SJohn Marino update_ssa (TODO_update_ssa_only_virtuals); 1215*e4b17023SJohn Marino } 1216*e4b17023SJohn Marino 1217*e4b17023SJohn Marino /* Returns true if we can use storent in loop, false otherwise. */ 1218*e4b17023SJohn Marino 1219*e4b17023SJohn Marino static bool 1220*e4b17023SJohn Marino may_use_storent_in_loop_p (struct loop *loop) 1221*e4b17023SJohn Marino { 1222*e4b17023SJohn Marino bool ret = true; 1223*e4b17023SJohn Marino 1224*e4b17023SJohn Marino if (loop->inner != NULL) 1225*e4b17023SJohn Marino return false; 1226*e4b17023SJohn Marino 1227*e4b17023SJohn Marino /* If we must issue a mfence insn after using storent, check that there 1228*e4b17023SJohn Marino is a suitable place for it at each of the loop exits. */ 1229*e4b17023SJohn Marino if (FENCE_FOLLOWING_MOVNT != NULL_TREE) 1230*e4b17023SJohn Marino { 1231*e4b17023SJohn Marino VEC (edge, heap) *exits = get_loop_exit_edges (loop); 1232*e4b17023SJohn Marino unsigned i; 1233*e4b17023SJohn Marino edge exit; 1234*e4b17023SJohn Marino 1235*e4b17023SJohn Marino FOR_EACH_VEC_ELT (edge, exits, i, exit) 1236*e4b17023SJohn Marino if ((exit->flags & EDGE_ABNORMAL) 1237*e4b17023SJohn Marino && exit->dest == EXIT_BLOCK_PTR) 1238*e4b17023SJohn Marino ret = false; 1239*e4b17023SJohn Marino 1240*e4b17023SJohn Marino VEC_free (edge, heap, exits); 1241*e4b17023SJohn Marino } 1242*e4b17023SJohn Marino 1243*e4b17023SJohn Marino return ret; 1244*e4b17023SJohn Marino } 1245*e4b17023SJohn Marino 1246*e4b17023SJohn Marino /* Marks nontemporal stores in LOOP. GROUPS contains the description of memory 1247*e4b17023SJohn Marino references in the loop. 
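   For instance (a made-up example), in

     for (i = 0; i < n; i++)
       a[i] = x;

   with N large enough that the array does not fit in the L2 cache and with
   A not read again inside the loop, the stores can be marked nontemporal;
   if any reference was marked and the target requires it, a memory fence
   is then emitted after the loop.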
*/
1248*e4b17023SJohn Marino
1249*e4b17023SJohn Marino static void
1250*e4b17023SJohn Marino mark_nontemporal_stores (struct loop *loop, struct mem_ref_group *groups)
1251*e4b17023SJohn Marino {
1252*e4b17023SJohn Marino   struct mem_ref *ref;
1253*e4b17023SJohn Marino   bool any = false;
1254*e4b17023SJohn Marino
1255*e4b17023SJohn Marino   if (!may_use_storent_in_loop_p (loop))
1256*e4b17023SJohn Marino     return;
1257*e4b17023SJohn Marino
1258*e4b17023SJohn Marino   for (; groups; groups = groups->next)
1259*e4b17023SJohn Marino     for (ref = groups->refs; ref; ref = ref->next)
1260*e4b17023SJohn Marino       any |= mark_nontemporal_store (ref);
1261*e4b17023SJohn Marino
1262*e4b17023SJohn Marino   if (any && FENCE_FOLLOWING_MOVNT != NULL_TREE)
1263*e4b17023SJohn Marino     emit_mfence_after_loop (loop);
1264*e4b17023SJohn Marino }
1265*e4b17023SJohn Marino
1266*e4b17023SJohn Marino /* Determines whether we can profitably unroll LOOP FACTOR times, and if
1267*e4b17023SJohn Marino    this is the case, fill in DESC with the description of the number of
1268*e4b17023SJohn Marino    iterations.  */
1269*e4b17023SJohn Marino
1270*e4b17023SJohn Marino static bool
1271*e4b17023SJohn Marino should_unroll_loop_p (struct loop *loop, struct tree_niter_desc *desc,
1272*e4b17023SJohn Marino                       unsigned factor)
1273*e4b17023SJohn Marino {
1274*e4b17023SJohn Marino   if (!can_unroll_loop_p (loop, factor, desc))
1275*e4b17023SJohn Marino     return false;
1276*e4b17023SJohn Marino
1277*e4b17023SJohn Marino   /* We only consider loops without control flow for unrolling.  This is not
1278*e4b17023SJohn Marino      a hard restriction -- tree_unroll_loop works with arbitrary loops
1279*e4b17023SJohn Marino      as well; but the unrolling/prefetching is usually more profitable for
1280*e4b17023SJohn Marino      loops consisting of a single basic block, and we want to limit the
1281*e4b17023SJohn Marino      code growth.  */
1282*e4b17023SJohn Marino   if (loop->num_nodes > 2)
1283*e4b17023SJohn Marino     return false;
1284*e4b17023SJohn Marino
1285*e4b17023SJohn Marino   return true;
1286*e4b17023SJohn Marino }
1287*e4b17023SJohn Marino
1288*e4b17023SJohn Marino /* Determine the factor by which to unroll LOOP, from the information
1289*e4b17023SJohn Marino    contained in the list of memory references REFS.  A description of the
1290*e4b17023SJohn Marino    number of iterations of LOOP is stored to DESC.  NINSNS is the number of
1291*e4b17023SJohn Marino    insns of the LOOP.  EST_NITER is the estimated number of iterations of
1292*e4b17023SJohn Marino    the loop, or -1 if no estimate is available.  */
1293*e4b17023SJohn Marino
1294*e4b17023SJohn Marino static unsigned
1295*e4b17023SJohn Marino determine_unroll_factor (struct loop *loop, struct mem_ref_group *refs,
1296*e4b17023SJohn Marino                          unsigned ninsns, struct tree_niter_desc *desc,
1297*e4b17023SJohn Marino                          HOST_WIDE_INT est_niter)
1298*e4b17023SJohn Marino {
1299*e4b17023SJohn Marino   unsigned upper_bound;
1300*e4b17023SJohn Marino   unsigned nfactor, factor, mod_constraint;
1301*e4b17023SJohn Marino   struct mem_ref_group *agp;
1302*e4b17023SJohn Marino   struct mem_ref *ref;
1303*e4b17023SJohn Marino
1304*e4b17023SJohn Marino   /* First check whether the loop is not too large to unroll.  We ignore
1305*e4b17023SJohn Marino      PARAM_MAX_UNROLL_TIMES, because for small loops, it prevented us
1306*e4b17023SJohn Marino      from unrolling them enough to make exactly one cache line covered by each
1307*e4b17023SJohn Marino      iteration.
Also, the goal of PARAM_MAX_UNROLL_TIMES is to prevent
1308*e4b17023SJohn Marino      us from unrolling the loops too many times in cases where we only expect
1309*e4b17023SJohn Marino      gains from better scheduling and decreasing loop overhead, which is not
1310*e4b17023SJohn Marino      the case here.  */
1311*e4b17023SJohn Marino   upper_bound = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / ninsns;
1312*e4b17023SJohn Marino
1313*e4b17023SJohn Marino   /* If we unrolled the loop more times than it iterates, the unrolled version
1314*e4b17023SJohn Marino      of the loop would never be entered.  */
1315*e4b17023SJohn Marino   if (est_niter >= 0 && est_niter < (HOST_WIDE_INT) upper_bound)
1316*e4b17023SJohn Marino     upper_bound = est_niter;
1317*e4b17023SJohn Marino
1318*e4b17023SJohn Marino   if (upper_bound <= 1)
1319*e4b17023SJohn Marino     return 1;
1320*e4b17023SJohn Marino
1321*e4b17023SJohn Marino   /* Choose the factor so that we may prefetch each cache line just once,
1322*e4b17023SJohn Marino      but bound the unrolling by UPPER_BOUND.  */
1323*e4b17023SJohn Marino   factor = 1;
1324*e4b17023SJohn Marino   for (agp = refs; agp; agp = agp->next)
1325*e4b17023SJohn Marino     for (ref = agp->refs; ref; ref = ref->next)
1326*e4b17023SJohn Marino       if (should_issue_prefetch_p (ref))
1327*e4b17023SJohn Marino         {
1328*e4b17023SJohn Marino           mod_constraint = ref->prefetch_mod;
1329*e4b17023SJohn Marino           nfactor = least_common_multiple (mod_constraint, factor);
1330*e4b17023SJohn Marino           if (nfactor <= upper_bound)
1331*e4b17023SJohn Marino             factor = nfactor;
1332*e4b17023SJohn Marino         }
1333*e4b17023SJohn Marino
1334*e4b17023SJohn Marino   if (!should_unroll_loop_p (loop, desc, factor))
1335*e4b17023SJohn Marino     return 1;
1336*e4b17023SJohn Marino
1337*e4b17023SJohn Marino   return factor;
1338*e4b17023SJohn Marino }
1339*e4b17023SJohn Marino
1340*e4b17023SJohn Marino /* Returns the total volume of the memory references REFS, taking into account
1341*e4b17023SJohn Marino    reuses in the innermost loop and cache line size.  TODO -- we should also
1342*e4b17023SJohn Marino    take into account reuses across the iterations of the loops in the loop
1343*e4b17023SJohn Marino    nest.  */
1344*e4b17023SJohn Marino
1345*e4b17023SJohn Marino static unsigned
1346*e4b17023SJohn Marino volume_of_references (struct mem_ref_group *refs)
1347*e4b17023SJohn Marino {
1348*e4b17023SJohn Marino   unsigned volume = 0;
1349*e4b17023SJohn Marino   struct mem_ref_group *gr;
1350*e4b17023SJohn Marino   struct mem_ref *ref;
1351*e4b17023SJohn Marino
1352*e4b17023SJohn Marino   for (gr = refs; gr; gr = gr->next)
1353*e4b17023SJohn Marino     for (ref = gr->refs; ref; ref = ref->next)
1354*e4b17023SJohn Marino       {
1355*e4b17023SJohn Marino         /* Almost always reuses another value?  */
1356*e4b17023SJohn Marino         if (ref->prefetch_before != PREFETCH_ALL)
1357*e4b17023SJohn Marino           continue;
1358*e4b17023SJohn Marino
1359*e4b17023SJohn Marino         /* If several iterations access the same cache line, use the size of
1360*e4b17023SJohn Marino            the line divided by this number.  Otherwise, a cache line is
1361*e4b17023SJohn Marino            accessed in each iteration.  TODO -- in the latter case, we should
1362*e4b17023SJohn Marino            take the size of the reference into account, rounding it up to a
1363*e4b17023SJohn Marino            multiple of the cache line size.
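           For example (with a hypothetical 64-byte cache line), a reference
           with PREFETCH_MOD 16 contributes 64 / 16 = 4 bytes per iteration to
           the volume, while a reference with PREFETCH_MOD 1 contributes the
           full 64 bytes.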
*/ 1364*e4b17023SJohn Marino volume += L1_CACHE_LINE_SIZE / ref->prefetch_mod; 1365*e4b17023SJohn Marino } 1366*e4b17023SJohn Marino return volume; 1367*e4b17023SJohn Marino } 1368*e4b17023SJohn Marino 1369*e4b17023SJohn Marino /* Returns the volume of memory references accessed across VEC iterations of 1370*e4b17023SJohn Marino loops, whose sizes are described in the LOOP_SIZES array. N is the number 1371*e4b17023SJohn Marino of the loops in the nest (length of VEC and LOOP_SIZES vectors). */ 1372*e4b17023SJohn Marino 1373*e4b17023SJohn Marino static unsigned 1374*e4b17023SJohn Marino volume_of_dist_vector (lambda_vector vec, unsigned *loop_sizes, unsigned n) 1375*e4b17023SJohn Marino { 1376*e4b17023SJohn Marino unsigned i; 1377*e4b17023SJohn Marino 1378*e4b17023SJohn Marino for (i = 0; i < n; i++) 1379*e4b17023SJohn Marino if (vec[i] != 0) 1380*e4b17023SJohn Marino break; 1381*e4b17023SJohn Marino 1382*e4b17023SJohn Marino if (i == n) 1383*e4b17023SJohn Marino return 0; 1384*e4b17023SJohn Marino 1385*e4b17023SJohn Marino gcc_assert (vec[i] > 0); 1386*e4b17023SJohn Marino 1387*e4b17023SJohn Marino /* We ignore the parts of the distance vector in subloops, since usually 1388*e4b17023SJohn Marino the numbers of iterations are much smaller. */ 1389*e4b17023SJohn Marino return loop_sizes[i] * vec[i]; 1390*e4b17023SJohn Marino } 1391*e4b17023SJohn Marino 1392*e4b17023SJohn Marino /* Add the steps of ACCESS_FN multiplied by STRIDE to the array STRIDE 1393*e4b17023SJohn Marino at the position corresponding to the loop of the step. N is the depth 1394*e4b17023SJohn Marino of the considered loop nest, and, LOOP is its innermost loop. */ 1395*e4b17023SJohn Marino 1396*e4b17023SJohn Marino static void 1397*e4b17023SJohn Marino add_subscript_strides (tree access_fn, unsigned stride, 1398*e4b17023SJohn Marino HOST_WIDE_INT *strides, unsigned n, struct loop *loop) 1399*e4b17023SJohn Marino { 1400*e4b17023SJohn Marino struct loop *aloop; 1401*e4b17023SJohn Marino tree step; 1402*e4b17023SJohn Marino HOST_WIDE_INT astep; 1403*e4b17023SJohn Marino unsigned min_depth = loop_depth (loop) - n; 1404*e4b17023SJohn Marino 1405*e4b17023SJohn Marino while (TREE_CODE (access_fn) == POLYNOMIAL_CHREC) 1406*e4b17023SJohn Marino { 1407*e4b17023SJohn Marino aloop = get_chrec_loop (access_fn); 1408*e4b17023SJohn Marino step = CHREC_RIGHT (access_fn); 1409*e4b17023SJohn Marino access_fn = CHREC_LEFT (access_fn); 1410*e4b17023SJohn Marino 1411*e4b17023SJohn Marino if ((unsigned) loop_depth (aloop) <= min_depth) 1412*e4b17023SJohn Marino continue; 1413*e4b17023SJohn Marino 1414*e4b17023SJohn Marino if (host_integerp (step, 0)) 1415*e4b17023SJohn Marino astep = tree_low_cst (step, 0); 1416*e4b17023SJohn Marino else 1417*e4b17023SJohn Marino astep = L1_CACHE_LINE_SIZE; 1418*e4b17023SJohn Marino 1419*e4b17023SJohn Marino strides[n - 1 - loop_depth (loop) + loop_depth (aloop)] += astep * stride; 1420*e4b17023SJohn Marino 1421*e4b17023SJohn Marino } 1422*e4b17023SJohn Marino } 1423*e4b17023SJohn Marino 1424*e4b17023SJohn Marino /* Returns the volume of memory references accessed between two consecutive 1425*e4b17023SJohn Marino self-reuses of the reference DR. We consider the subscripts of DR in N 1426*e4b17023SJohn Marino loops, and LOOP_SIZES contains the volumes of accesses in each of the 1427*e4b17023SJohn Marino loops. LOOP is the innermost loop of the current loop nest. 
*/ 1428*e4b17023SJohn Marino 1429*e4b17023SJohn Marino static unsigned 1430*e4b17023SJohn Marino self_reuse_distance (data_reference_p dr, unsigned *loop_sizes, unsigned n, 1431*e4b17023SJohn Marino struct loop *loop) 1432*e4b17023SJohn Marino { 1433*e4b17023SJohn Marino tree stride, access_fn; 1434*e4b17023SJohn Marino HOST_WIDE_INT *strides, astride; 1435*e4b17023SJohn Marino VEC (tree, heap) *access_fns; 1436*e4b17023SJohn Marino tree ref = DR_REF (dr); 1437*e4b17023SJohn Marino unsigned i, ret = ~0u; 1438*e4b17023SJohn Marino 1439*e4b17023SJohn Marino /* In the following example: 1440*e4b17023SJohn Marino 1441*e4b17023SJohn Marino for (i = 0; i < N; i++) 1442*e4b17023SJohn Marino for (j = 0; j < N; j++) 1443*e4b17023SJohn Marino use (a[j][i]); 1444*e4b17023SJohn Marino the same cache line is accessed each N steps (except if the change from 1445*e4b17023SJohn Marino i to i + 1 crosses the boundary of the cache line). Thus, for self-reuse, 1446*e4b17023SJohn Marino we cannot rely purely on the results of the data dependence analysis. 1447*e4b17023SJohn Marino 1448*e4b17023SJohn Marino Instead, we compute the stride of the reference in each loop, and consider 1449*e4b17023SJohn Marino the innermost loop in that the stride is less than cache size. */ 1450*e4b17023SJohn Marino 1451*e4b17023SJohn Marino strides = XCNEWVEC (HOST_WIDE_INT, n); 1452*e4b17023SJohn Marino access_fns = DR_ACCESS_FNS (dr); 1453*e4b17023SJohn Marino 1454*e4b17023SJohn Marino FOR_EACH_VEC_ELT (tree, access_fns, i, access_fn) 1455*e4b17023SJohn Marino { 1456*e4b17023SJohn Marino /* Keep track of the reference corresponding to the subscript, so that we 1457*e4b17023SJohn Marino know its stride. */ 1458*e4b17023SJohn Marino while (handled_component_p (ref) && TREE_CODE (ref) != ARRAY_REF) 1459*e4b17023SJohn Marino ref = TREE_OPERAND (ref, 0); 1460*e4b17023SJohn Marino 1461*e4b17023SJohn Marino if (TREE_CODE (ref) == ARRAY_REF) 1462*e4b17023SJohn Marino { 1463*e4b17023SJohn Marino stride = TYPE_SIZE_UNIT (TREE_TYPE (ref)); 1464*e4b17023SJohn Marino if (host_integerp (stride, 1)) 1465*e4b17023SJohn Marino astride = tree_low_cst (stride, 1); 1466*e4b17023SJohn Marino else 1467*e4b17023SJohn Marino astride = L1_CACHE_LINE_SIZE; 1468*e4b17023SJohn Marino 1469*e4b17023SJohn Marino ref = TREE_OPERAND (ref, 0); 1470*e4b17023SJohn Marino } 1471*e4b17023SJohn Marino else 1472*e4b17023SJohn Marino astride = 1; 1473*e4b17023SJohn Marino 1474*e4b17023SJohn Marino add_subscript_strides (access_fn, astride, strides, n, loop); 1475*e4b17023SJohn Marino } 1476*e4b17023SJohn Marino 1477*e4b17023SJohn Marino for (i = n; i-- > 0; ) 1478*e4b17023SJohn Marino { 1479*e4b17023SJohn Marino unsigned HOST_WIDE_INT s; 1480*e4b17023SJohn Marino 1481*e4b17023SJohn Marino s = strides[i] < 0 ? -strides[i] : strides[i]; 1482*e4b17023SJohn Marino 1483*e4b17023SJohn Marino if (s < (unsigned) L1_CACHE_LINE_SIZE 1484*e4b17023SJohn Marino && (loop_sizes[i] 1485*e4b17023SJohn Marino > (unsigned) (L1_CACHE_SIZE_BYTES / NONTEMPORAL_FRACTION))) 1486*e4b17023SJohn Marino { 1487*e4b17023SJohn Marino ret = loop_sizes[i]; 1488*e4b17023SJohn Marino break; 1489*e4b17023SJohn Marino } 1490*e4b17023SJohn Marino } 1491*e4b17023SJohn Marino 1492*e4b17023SJohn Marino free (strides); 1493*e4b17023SJohn Marino return ret; 1494*e4b17023SJohn Marino } 1495*e4b17023SJohn Marino 1496*e4b17023SJohn Marino /* Determines the distance till the first reuse of each reference in REFS 1497*e4b17023SJohn Marino in the loop nest of LOOP. 
NO_OTHER_REFS is true if there are no other 1498*e4b17023SJohn Marino memory references in the loop. */ 1499*e4b17023SJohn Marino 1500*e4b17023SJohn Marino static void 1501*e4b17023SJohn Marino determine_loop_nest_reuse (struct loop *loop, struct mem_ref_group *refs, 1502*e4b17023SJohn Marino bool no_other_refs) 1503*e4b17023SJohn Marino { 1504*e4b17023SJohn Marino struct loop *nest, *aloop; 1505*e4b17023SJohn Marino VEC (data_reference_p, heap) *datarefs = NULL; 1506*e4b17023SJohn Marino VEC (ddr_p, heap) *dependences = NULL; 1507*e4b17023SJohn Marino struct mem_ref_group *gr; 1508*e4b17023SJohn Marino struct mem_ref *ref, *refb; 1509*e4b17023SJohn Marino VEC (loop_p, heap) *vloops = NULL; 1510*e4b17023SJohn Marino unsigned *loop_data_size; 1511*e4b17023SJohn Marino unsigned i, j, n; 1512*e4b17023SJohn Marino unsigned volume, dist, adist; 1513*e4b17023SJohn Marino HOST_WIDE_INT vol; 1514*e4b17023SJohn Marino data_reference_p dr; 1515*e4b17023SJohn Marino ddr_p dep; 1516*e4b17023SJohn Marino 1517*e4b17023SJohn Marino if (loop->inner) 1518*e4b17023SJohn Marino return; 1519*e4b17023SJohn Marino 1520*e4b17023SJohn Marino /* Find the outermost loop of the loop nest of loop (we require that 1521*e4b17023SJohn Marino there are no sibling loops inside the nest). */ 1522*e4b17023SJohn Marino nest = loop; 1523*e4b17023SJohn Marino while (1) 1524*e4b17023SJohn Marino { 1525*e4b17023SJohn Marino aloop = loop_outer (nest); 1526*e4b17023SJohn Marino 1527*e4b17023SJohn Marino if (aloop == current_loops->tree_root 1528*e4b17023SJohn Marino || aloop->inner->next) 1529*e4b17023SJohn Marino break; 1530*e4b17023SJohn Marino 1531*e4b17023SJohn Marino nest = aloop; 1532*e4b17023SJohn Marino } 1533*e4b17023SJohn Marino 1534*e4b17023SJohn Marino /* For each loop, determine the amount of data accessed in each iteration. 1535*e4b17023SJohn Marino We use this to estimate whether the reference is evicted from the 1536*e4b17023SJohn Marino cache before its reuse. */ 1537*e4b17023SJohn Marino find_loop_nest (nest, &vloops); 1538*e4b17023SJohn Marino n = VEC_length (loop_p, vloops); 1539*e4b17023SJohn Marino loop_data_size = XNEWVEC (unsigned, n); 1540*e4b17023SJohn Marino volume = volume_of_references (refs); 1541*e4b17023SJohn Marino i = n; 1542*e4b17023SJohn Marino while (i-- != 0) 1543*e4b17023SJohn Marino { 1544*e4b17023SJohn Marino loop_data_size[i] = volume; 1545*e4b17023SJohn Marino /* Bound the volume by the L2 cache size, since above this bound, 1546*e4b17023SJohn Marino all dependence distances are equivalent. */ 1547*e4b17023SJohn Marino if (volume > L2_CACHE_SIZE_BYTES) 1548*e4b17023SJohn Marino continue; 1549*e4b17023SJohn Marino 1550*e4b17023SJohn Marino aloop = VEC_index (loop_p, vloops, i); 1551*e4b17023SJohn Marino vol = max_stmt_executions_int (aloop, false); 1552*e4b17023SJohn Marino if (vol < 0) 1553*e4b17023SJohn Marino vol = expected_loop_iterations (aloop); 1554*e4b17023SJohn Marino volume *= vol; 1555*e4b17023SJohn Marino } 1556*e4b17023SJohn Marino 1557*e4b17023SJohn Marino /* Prepare the references in the form suitable for data dependence 1558*e4b17023SJohn Marino analysis. We ignore unanalyzable data references (the results 1559*e4b17023SJohn Marino are used just as a heuristics to estimate temporality of the 1560*e4b17023SJohn Marino references, hence we do not need to worry about correctness). 
*/ 1561*e4b17023SJohn Marino for (gr = refs; gr; gr = gr->next) 1562*e4b17023SJohn Marino for (ref = gr->refs; ref; ref = ref->next) 1563*e4b17023SJohn Marino { 1564*e4b17023SJohn Marino dr = create_data_ref (nest, loop_containing_stmt (ref->stmt), 1565*e4b17023SJohn Marino ref->mem, ref->stmt, !ref->write_p); 1566*e4b17023SJohn Marino 1567*e4b17023SJohn Marino if (dr) 1568*e4b17023SJohn Marino { 1569*e4b17023SJohn Marino ref->reuse_distance = volume; 1570*e4b17023SJohn Marino dr->aux = ref; 1571*e4b17023SJohn Marino VEC_safe_push (data_reference_p, heap, datarefs, dr); 1572*e4b17023SJohn Marino } 1573*e4b17023SJohn Marino else 1574*e4b17023SJohn Marino no_other_refs = false; 1575*e4b17023SJohn Marino } 1576*e4b17023SJohn Marino 1577*e4b17023SJohn Marino FOR_EACH_VEC_ELT (data_reference_p, datarefs, i, dr) 1578*e4b17023SJohn Marino { 1579*e4b17023SJohn Marino dist = self_reuse_distance (dr, loop_data_size, n, loop); 1580*e4b17023SJohn Marino ref = (struct mem_ref *) dr->aux; 1581*e4b17023SJohn Marino if (ref->reuse_distance > dist) 1582*e4b17023SJohn Marino ref->reuse_distance = dist; 1583*e4b17023SJohn Marino 1584*e4b17023SJohn Marino if (no_other_refs) 1585*e4b17023SJohn Marino ref->independent_p = true; 1586*e4b17023SJohn Marino } 1587*e4b17023SJohn Marino 1588*e4b17023SJohn Marino compute_all_dependences (datarefs, &dependences, vloops, true); 1589*e4b17023SJohn Marino 1590*e4b17023SJohn Marino FOR_EACH_VEC_ELT (ddr_p, dependences, i, dep) 1591*e4b17023SJohn Marino { 1592*e4b17023SJohn Marino if (DDR_ARE_DEPENDENT (dep) == chrec_known) 1593*e4b17023SJohn Marino continue; 1594*e4b17023SJohn Marino 1595*e4b17023SJohn Marino ref = (struct mem_ref *) DDR_A (dep)->aux; 1596*e4b17023SJohn Marino refb = (struct mem_ref *) DDR_B (dep)->aux; 1597*e4b17023SJohn Marino 1598*e4b17023SJohn Marino if (DDR_ARE_DEPENDENT (dep) == chrec_dont_know 1599*e4b17023SJohn Marino || DDR_NUM_DIST_VECTS (dep) == 0) 1600*e4b17023SJohn Marino { 1601*e4b17023SJohn Marino /* If the dependence cannot be analyzed, assume that there might be 1602*e4b17023SJohn Marino a reuse. */ 1603*e4b17023SJohn Marino dist = 0; 1604*e4b17023SJohn Marino 1605*e4b17023SJohn Marino ref->independent_p = false; 1606*e4b17023SJohn Marino refb->independent_p = false; 1607*e4b17023SJohn Marino } 1608*e4b17023SJohn Marino else 1609*e4b17023SJohn Marino { 1610*e4b17023SJohn Marino /* The distance vectors are normalized to be always lexicographically 1611*e4b17023SJohn Marino positive, hence we cannot tell just from them whether DDR_A comes 1612*e4b17023SJohn Marino before DDR_B or vice versa. However, it is not important, 1613*e4b17023SJohn Marino anyway -- if DDR_A is close to DDR_B, then it is either reused in 1614*e4b17023SJohn Marino DDR_B (and it is not nontemporal), or it reuses the value of DDR_B 1615*e4b17023SJohn Marino in cache (and marking it as nontemporal would not affect 1616*e4b17023SJohn Marino anything). 
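	   As an illustration (made-up sizes), in a two-loop nest whose loop
	   sizes are { 8192, 64 } (outermost first), a distance vector (1, 3)
	   is costed as 1 * 8192 bytes of intervening data, since only the
	   outermost nonzero entry of the vector is taken into account,
	   whereas (0, 3) is costed as 3 * 64 bytes.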
*/ 1617*e4b17023SJohn Marino 1618*e4b17023SJohn Marino dist = volume; 1619*e4b17023SJohn Marino for (j = 0; j < DDR_NUM_DIST_VECTS (dep); j++) 1620*e4b17023SJohn Marino { 1621*e4b17023SJohn Marino adist = volume_of_dist_vector (DDR_DIST_VECT (dep, j), 1622*e4b17023SJohn Marino loop_data_size, n); 1623*e4b17023SJohn Marino 1624*e4b17023SJohn Marino /* If this is a dependence in the innermost loop (i.e., the 1625*e4b17023SJohn Marino distances in all superloops are zero) and it is not 1626*e4b17023SJohn Marino the trivial self-dependence with distance zero, record that 1627*e4b17023SJohn Marino the references are not completely independent. */ 1628*e4b17023SJohn Marino if (lambda_vector_zerop (DDR_DIST_VECT (dep, j), n - 1) 1629*e4b17023SJohn Marino && (ref != refb 1630*e4b17023SJohn Marino || DDR_DIST_VECT (dep, j)[n-1] != 0)) 1631*e4b17023SJohn Marino { 1632*e4b17023SJohn Marino ref->independent_p = false; 1633*e4b17023SJohn Marino refb->independent_p = false; 1634*e4b17023SJohn Marino } 1635*e4b17023SJohn Marino 1636*e4b17023SJohn Marino /* Ignore accesses closer than 1637*e4b17023SJohn Marino L1_CACHE_SIZE_BYTES / NONTEMPORAL_FRACTION, 1638*e4b17023SJohn Marino so that we use nontemporal prefetches e.g. if single memory 1639*e4b17023SJohn Marino location is accessed several times in a single iteration of 1640*e4b17023SJohn Marino the loop. */ 1641*e4b17023SJohn Marino if (adist < L1_CACHE_SIZE_BYTES / NONTEMPORAL_FRACTION) 1642*e4b17023SJohn Marino continue; 1643*e4b17023SJohn Marino 1644*e4b17023SJohn Marino if (adist < dist) 1645*e4b17023SJohn Marino dist = adist; 1646*e4b17023SJohn Marino } 1647*e4b17023SJohn Marino } 1648*e4b17023SJohn Marino 1649*e4b17023SJohn Marino if (ref->reuse_distance > dist) 1650*e4b17023SJohn Marino ref->reuse_distance = dist; 1651*e4b17023SJohn Marino if (refb->reuse_distance > dist) 1652*e4b17023SJohn Marino refb->reuse_distance = dist; 1653*e4b17023SJohn Marino } 1654*e4b17023SJohn Marino 1655*e4b17023SJohn Marino free_dependence_relations (dependences); 1656*e4b17023SJohn Marino free_data_refs (datarefs); 1657*e4b17023SJohn Marino free (loop_data_size); 1658*e4b17023SJohn Marino 1659*e4b17023SJohn Marino if (dump_file && (dump_flags & TDF_DETAILS)) 1660*e4b17023SJohn Marino { 1661*e4b17023SJohn Marino fprintf (dump_file, "Reuse distances:\n"); 1662*e4b17023SJohn Marino for (gr = refs; gr; gr = gr->next) 1663*e4b17023SJohn Marino for (ref = gr->refs; ref; ref = ref->next) 1664*e4b17023SJohn Marino fprintf (dump_file, " ref %p distance %u\n", 1665*e4b17023SJohn Marino (void *) ref, ref->reuse_distance); 1666*e4b17023SJohn Marino } 1667*e4b17023SJohn Marino } 1668*e4b17023SJohn Marino 1669*e4b17023SJohn Marino /* Determine whether or not the trip count to ahead ratio is too small based 1670*e4b17023SJohn Marino on prefitablility consideration. 1671*e4b17023SJohn Marino AHEAD: the iteration ahead distance, 1672*e4b17023SJohn Marino EST_NITER: the estimated trip count. */ 1673*e4b17023SJohn Marino 1674*e4b17023SJohn Marino static bool 1675*e4b17023SJohn Marino trip_count_to_ahead_ratio_too_small_p (unsigned ahead, HOST_WIDE_INT est_niter) 1676*e4b17023SJohn Marino { 1677*e4b17023SJohn Marino /* Assume trip count to ahead ratio is big enough if the trip count could not 1678*e4b17023SJohn Marino be estimated at compile time. 
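     For example (with made-up values), if AHEAD is 8 and
     TRIP_COUNT_TO_AHEAD_RATIO is 4, any loop estimated to roll fewer than
     32 times is rejected: most of the prefetched data would only arrive
     after the loop has already finished.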
*/
1679*e4b17023SJohn Marino   if (est_niter < 0)
1680*e4b17023SJohn Marino     return false;
1681*e4b17023SJohn Marino
1682*e4b17023SJohn Marino   if (est_niter < (HOST_WIDE_INT) (TRIP_COUNT_TO_AHEAD_RATIO * ahead))
1683*e4b17023SJohn Marino     {
1684*e4b17023SJohn Marino       if (dump_file && (dump_flags & TDF_DETAILS))
1685*e4b17023SJohn Marino         fprintf (dump_file,
1686*e4b17023SJohn Marino                  "Not prefetching -- loop estimated to roll only %d times\n",
1687*e4b17023SJohn Marino                  (int) est_niter);
1688*e4b17023SJohn Marino       return true;
1689*e4b17023SJohn Marino     }
1690*e4b17023SJohn Marino
1691*e4b17023SJohn Marino   return false;
1692*e4b17023SJohn Marino }
1693*e4b17023SJohn Marino
1694*e4b17023SJohn Marino /* Determine whether or not the number of memory references in the loop is
1695*e4b17023SJohn Marino    reasonable based on the profitability and compilation time considerations.
1696*e4b17023SJohn Marino    NINSNS: estimated number of instructions in the loop,
1697*e4b17023SJohn Marino    MEM_REF_COUNT: total number of memory references in the loop.  */
1698*e4b17023SJohn Marino
1699*e4b17023SJohn Marino static bool
1700*e4b17023SJohn Marino mem_ref_count_reasonable_p (unsigned ninsns, unsigned mem_ref_count)
1701*e4b17023SJohn Marino {
1702*e4b17023SJohn Marino   int insn_to_mem_ratio;
1703*e4b17023SJohn Marino
1704*e4b17023SJohn Marino   if (mem_ref_count == 0)
1705*e4b17023SJohn Marino     return false;
1706*e4b17023SJohn Marino
1707*e4b17023SJohn Marino   /* Miss rate computation (is_miss_rate_acceptable) and dependence analysis
1708*e4b17023SJohn Marino      (compute_all_dependences) have high costs based on quadratic complexity.
1709*e4b17023SJohn Marino      To avoid huge compilation time, we give up prefetching if mem_ref_count
1710*e4b17023SJohn Marino      is too large.  */
1711*e4b17023SJohn Marino   if (mem_ref_count > PREFETCH_MAX_MEM_REFS_PER_LOOP)
1712*e4b17023SJohn Marino     return false;
1713*e4b17023SJohn Marino
1714*e4b17023SJohn Marino   /* Prefetching improves performance by overlapping cache-missing
1715*e4b17023SJohn Marino      memory accesses with CPU operations.  If the loop does not have
1716*e4b17023SJohn Marino      enough CPU operations to overlap with memory operations, prefetching
1717*e4b17023SJohn Marino      won't give a significant benefit.  One approximate way of checking
1718*e4b17023SJohn Marino      this is to require the ratio of instructions to memory references to
1719*e4b17023SJohn Marino      be above a certain limit.  This approximation works well in practice.
1720*e4b17023SJohn Marino      TODO: Implement a more precise computation by estimating the time
1721*e4b17023SJohn Marino      for each CPU or memory op in the loop.  Time estimates for memory ops
1722*e4b17023SJohn Marino      should account for cache misses.  */
1723*e4b17023SJohn Marino   insn_to_mem_ratio = ninsns / mem_ref_count;
1724*e4b17023SJohn Marino
1725*e4b17023SJohn Marino   if (insn_to_mem_ratio < PREFETCH_MIN_INSN_TO_MEM_RATIO)
1726*e4b17023SJohn Marino     {
1727*e4b17023SJohn Marino       if (dump_file && (dump_flags & TDF_DETAILS))
1728*e4b17023SJohn Marino         fprintf (dump_file,
1729*e4b17023SJohn Marino                  "Not prefetching -- instruction to memory reference ratio (%d) too small\n",
1730*e4b17023SJohn Marino                  insn_to_mem_ratio);
1731*e4b17023SJohn Marino       return false;
1732*e4b17023SJohn Marino     }
1733*e4b17023SJohn Marino
1734*e4b17023SJohn Marino   return true;
1735*e4b17023SJohn Marino }
1736*e4b17023SJohn Marino
1737*e4b17023SJohn Marino /* Determine whether or not the instruction to prefetch ratio in the loop is
1738*e4b17023SJohn Marino    too small based on the profitability consideration.
1739*e4b17023SJohn Marino NINSNS: estimated number of instructions in the loop, 1740*e4b17023SJohn Marino PREFETCH_COUNT: an estimate of the number of prefetches, 1741*e4b17023SJohn Marino UNROLL_FACTOR: the factor to unroll the loop if prefetching. */ 1742*e4b17023SJohn Marino 1743*e4b17023SJohn Marino static bool 1744*e4b17023SJohn Marino insn_to_prefetch_ratio_too_small_p (unsigned ninsns, unsigned prefetch_count, 1745*e4b17023SJohn Marino unsigned unroll_factor) 1746*e4b17023SJohn Marino { 1747*e4b17023SJohn Marino int insn_to_prefetch_ratio; 1748*e4b17023SJohn Marino 1749*e4b17023SJohn Marino /* Prefetching most likely causes performance degradation when the instruction 1750*e4b17023SJohn Marino to prefetch ratio is too small. Too many prefetch instructions in a loop 1751*e4b17023SJohn Marino may reduce the I-cache performance. 1752*e4b17023SJohn Marino (unroll_factor * ninsns) is used to estimate the number of instructions in 1753*e4b17023SJohn Marino the unrolled loop. This implementation is a bit simplistic -- the number 1754*e4b17023SJohn Marino of issued prefetch instructions is also affected by unrolling. So, 1755*e4b17023SJohn Marino prefetch_mod and the unroll factor should be taken into account when 1756*e4b17023SJohn Marino determining prefetch_count. Also, the number of insns of the unrolled 1757*e4b17023SJohn Marino loop will usually be significantly smaller than the number of insns of the 1758*e4b17023SJohn Marino original loop * unroll_factor (at least the induction variable increases 1759*e4b17023SJohn Marino and the exit branches will get eliminated), so it might be better to use 1760*e4b17023SJohn Marino tree_estimate_loop_size + estimated_unrolled_size. */ 1761*e4b17023SJohn Marino insn_to_prefetch_ratio = (unroll_factor * ninsns) / prefetch_count; 1762*e4b17023SJohn Marino if (insn_to_prefetch_ratio < MIN_INSN_TO_PREFETCH_RATIO) 1763*e4b17023SJohn Marino { 1764*e4b17023SJohn Marino if (dump_file && (dump_flags & TDF_DETAILS)) 1765*e4b17023SJohn Marino fprintf (dump_file, 1766*e4b17023SJohn Marino "Not prefetching -- instruction to prefetch ratio (%d) too small\n", 1767*e4b17023SJohn Marino insn_to_prefetch_ratio); 1768*e4b17023SJohn Marino return true; 1769*e4b17023SJohn Marino } 1770*e4b17023SJohn Marino 1771*e4b17023SJohn Marino return false; 1772*e4b17023SJohn Marino } 1773*e4b17023SJohn Marino 1774*e4b17023SJohn Marino 1775*e4b17023SJohn Marino /* Issue prefetch instructions for array references in LOOP. Returns 1776*e4b17023SJohn Marino true if the LOOP was unrolled. 
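   As a rough illustration of the overall effect (a hand-written sketch with
   made-up numbers, not actual output of this pass), a loop such as

     for (i = 0; i < n; i++)
       sum += a[i];

   may end up unrolled by some factor with a prefetch of a later cache line
   inserted into the body, conceptually

     for (i = 0; i < n; i += 8)
       {
         __builtin_prefetch (&a[i + 64], 0, 3);
         sum += a[i];
         ...
         sum += a[i + 7];
       }

   The actual unroll factor, prefetch distance and locality hint depend on
   the target parameters and on the profitability checks described above.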
*/ 1777*e4b17023SJohn Marino 1778*e4b17023SJohn Marino static bool 1779*e4b17023SJohn Marino loop_prefetch_arrays (struct loop *loop) 1780*e4b17023SJohn Marino { 1781*e4b17023SJohn Marino struct mem_ref_group *refs; 1782*e4b17023SJohn Marino unsigned ahead, ninsns, time, unroll_factor; 1783*e4b17023SJohn Marino HOST_WIDE_INT est_niter; 1784*e4b17023SJohn Marino struct tree_niter_desc desc; 1785*e4b17023SJohn Marino bool unrolled = false, no_other_refs; 1786*e4b17023SJohn Marino unsigned prefetch_count; 1787*e4b17023SJohn Marino unsigned mem_ref_count; 1788*e4b17023SJohn Marino 1789*e4b17023SJohn Marino if (optimize_loop_nest_for_size_p (loop)) 1790*e4b17023SJohn Marino { 1791*e4b17023SJohn Marino if (dump_file && (dump_flags & TDF_DETAILS)) 1792*e4b17023SJohn Marino fprintf (dump_file, " ignored (cold area)\n"); 1793*e4b17023SJohn Marino return false; 1794*e4b17023SJohn Marino } 1795*e4b17023SJohn Marino 1796*e4b17023SJohn Marino /* FIXME: the time should be weighted by the probabilities of the blocks in 1797*e4b17023SJohn Marino the loop body. */ 1798*e4b17023SJohn Marino time = tree_num_loop_insns (loop, &eni_time_weights); 1799*e4b17023SJohn Marino if (time == 0) 1800*e4b17023SJohn Marino return false; 1801*e4b17023SJohn Marino 1802*e4b17023SJohn Marino ahead = (PREFETCH_LATENCY + time - 1) / time; 1803*e4b17023SJohn Marino est_niter = max_stmt_executions_int (loop, false); 1804*e4b17023SJohn Marino 1805*e4b17023SJohn Marino /* Prefetching is not likely to be profitable if the trip count to ahead 1806*e4b17023SJohn Marino ratio is too small. */ 1807*e4b17023SJohn Marino if (trip_count_to_ahead_ratio_too_small_p (ahead, est_niter)) 1808*e4b17023SJohn Marino return false; 1809*e4b17023SJohn Marino 1810*e4b17023SJohn Marino ninsns = tree_num_loop_insns (loop, &eni_size_weights); 1811*e4b17023SJohn Marino 1812*e4b17023SJohn Marino /* Step 1: gather the memory references. */ 1813*e4b17023SJohn Marino refs = gather_memory_references (loop, &no_other_refs, &mem_ref_count); 1814*e4b17023SJohn Marino 1815*e4b17023SJohn Marino /* Give up prefetching if the number of memory references in the 1816*e4b17023SJohn Marino loop is not reasonable based on profitablity and compilation time 1817*e4b17023SJohn Marino considerations. */ 1818*e4b17023SJohn Marino if (!mem_ref_count_reasonable_p (ninsns, mem_ref_count)) 1819*e4b17023SJohn Marino goto fail; 1820*e4b17023SJohn Marino 1821*e4b17023SJohn Marino /* Step 2: estimate the reuse effects. */ 1822*e4b17023SJohn Marino prune_by_reuse (refs); 1823*e4b17023SJohn Marino 1824*e4b17023SJohn Marino if (nothing_to_prefetch_p (refs)) 1825*e4b17023SJohn Marino goto fail; 1826*e4b17023SJohn Marino 1827*e4b17023SJohn Marino determine_loop_nest_reuse (loop, refs, no_other_refs); 1828*e4b17023SJohn Marino 1829*e4b17023SJohn Marino /* Step 3: determine unroll factor. */ 1830*e4b17023SJohn Marino unroll_factor = determine_unroll_factor (loop, refs, ninsns, &desc, 1831*e4b17023SJohn Marino est_niter); 1832*e4b17023SJohn Marino 1833*e4b17023SJohn Marino /* Estimate prefetch count for the unrolled loop. 
*/ 1834*e4b17023SJohn Marino prefetch_count = estimate_prefetch_count (refs, unroll_factor); 1835*e4b17023SJohn Marino if (prefetch_count == 0) 1836*e4b17023SJohn Marino goto fail; 1837*e4b17023SJohn Marino 1838*e4b17023SJohn Marino if (dump_file && (dump_flags & TDF_DETAILS)) 1839*e4b17023SJohn Marino fprintf (dump_file, "Ahead %d, unroll factor %d, trip count " 1840*e4b17023SJohn Marino HOST_WIDE_INT_PRINT_DEC "\n" 1841*e4b17023SJohn Marino "insn count %d, mem ref count %d, prefetch count %d\n", 1842*e4b17023SJohn Marino ahead, unroll_factor, est_niter, 1843*e4b17023SJohn Marino ninsns, mem_ref_count, prefetch_count); 1844*e4b17023SJohn Marino 1845*e4b17023SJohn Marino /* Prefetching is not likely to be profitable if the instruction to prefetch 1846*e4b17023SJohn Marino ratio is too small. */ 1847*e4b17023SJohn Marino if (insn_to_prefetch_ratio_too_small_p (ninsns, prefetch_count, 1848*e4b17023SJohn Marino unroll_factor)) 1849*e4b17023SJohn Marino goto fail; 1850*e4b17023SJohn Marino 1851*e4b17023SJohn Marino mark_nontemporal_stores (loop, refs); 1852*e4b17023SJohn Marino 1853*e4b17023SJohn Marino /* Step 4: what to prefetch? */ 1854*e4b17023SJohn Marino if (!schedule_prefetches (refs, unroll_factor, ahead)) 1855*e4b17023SJohn Marino goto fail; 1856*e4b17023SJohn Marino 1857*e4b17023SJohn Marino /* Step 5: unroll the loop. TODO -- peeling of first and last few 1858*e4b17023SJohn Marino iterations so that we do not issue superfluous prefetches. */ 1859*e4b17023SJohn Marino if (unroll_factor != 1) 1860*e4b17023SJohn Marino { 1861*e4b17023SJohn Marino tree_unroll_loop (loop, unroll_factor, 1862*e4b17023SJohn Marino single_dom_exit (loop), &desc); 1863*e4b17023SJohn Marino unrolled = true; 1864*e4b17023SJohn Marino } 1865*e4b17023SJohn Marino 1866*e4b17023SJohn Marino /* Step 6: issue the prefetches. */ 1867*e4b17023SJohn Marino issue_prefetches (refs, unroll_factor, ahead); 1868*e4b17023SJohn Marino 1869*e4b17023SJohn Marino fail: 1870*e4b17023SJohn Marino release_mem_refs (refs); 1871*e4b17023SJohn Marino return unrolled; 1872*e4b17023SJohn Marino } 1873*e4b17023SJohn Marino 1874*e4b17023SJohn Marino /* Issue prefetch instructions for array references in loops. */ 1875*e4b17023SJohn Marino 1876*e4b17023SJohn Marino unsigned int 1877*e4b17023SJohn Marino tree_ssa_prefetch_arrays (void) 1878*e4b17023SJohn Marino { 1879*e4b17023SJohn Marino loop_iterator li; 1880*e4b17023SJohn Marino struct loop *loop; 1881*e4b17023SJohn Marino bool unrolled = false; 1882*e4b17023SJohn Marino int todo_flags = 0; 1883*e4b17023SJohn Marino 1884*e4b17023SJohn Marino if (!HAVE_prefetch 1885*e4b17023SJohn Marino /* It is possible to ask compiler for say -mtune=i486 -march=pentium4. 1886*e4b17023SJohn Marino -mtune=i486 causes us having PREFETCH_BLOCK 0, since this is part 1887*e4b17023SJohn Marino of processor costs and i486 does not have prefetch, but 1888*e4b17023SJohn Marino -march=pentium4 causes HAVE_prefetch to be true. Ugh. 
*/ 1889*e4b17023SJohn Marino || PREFETCH_BLOCK == 0) 1890*e4b17023SJohn Marino return 0; 1891*e4b17023SJohn Marino 1892*e4b17023SJohn Marino if (dump_file && (dump_flags & TDF_DETAILS)) 1893*e4b17023SJohn Marino { 1894*e4b17023SJohn Marino fprintf (dump_file, "Prefetching parameters:\n"); 1895*e4b17023SJohn Marino fprintf (dump_file, " simultaneous prefetches: %d\n", 1896*e4b17023SJohn Marino SIMULTANEOUS_PREFETCHES); 1897*e4b17023SJohn Marino fprintf (dump_file, " prefetch latency: %d\n", PREFETCH_LATENCY); 1898*e4b17023SJohn Marino fprintf (dump_file, " prefetch block size: %d\n", PREFETCH_BLOCK); 1899*e4b17023SJohn Marino fprintf (dump_file, " L1 cache size: %d lines, %d kB\n", 1900*e4b17023SJohn Marino L1_CACHE_SIZE_BYTES / L1_CACHE_LINE_SIZE, L1_CACHE_SIZE); 1901*e4b17023SJohn Marino fprintf (dump_file, " L1 cache line size: %d\n", L1_CACHE_LINE_SIZE); 1902*e4b17023SJohn Marino fprintf (dump_file, " L2 cache size: %d kB\n", L2_CACHE_SIZE); 1903*e4b17023SJohn Marino fprintf (dump_file, " min insn-to-prefetch ratio: %d \n", 1904*e4b17023SJohn Marino MIN_INSN_TO_PREFETCH_RATIO); 1905*e4b17023SJohn Marino fprintf (dump_file, " min insn-to-mem ratio: %d \n", 1906*e4b17023SJohn Marino PREFETCH_MIN_INSN_TO_MEM_RATIO); 1907*e4b17023SJohn Marino fprintf (dump_file, "\n"); 1908*e4b17023SJohn Marino } 1909*e4b17023SJohn Marino 1910*e4b17023SJohn Marino initialize_original_copy_tables (); 1911*e4b17023SJohn Marino 1912*e4b17023SJohn Marino if (!builtin_decl_explicit_p (BUILT_IN_PREFETCH)) 1913*e4b17023SJohn Marino { 1914*e4b17023SJohn Marino tree type = build_function_type_list (void_type_node, 1915*e4b17023SJohn Marino const_ptr_type_node, NULL_TREE); 1916*e4b17023SJohn Marino tree decl = add_builtin_function ("__builtin_prefetch", type, 1917*e4b17023SJohn Marino BUILT_IN_PREFETCH, BUILT_IN_NORMAL, 1918*e4b17023SJohn Marino NULL, NULL_TREE); 1919*e4b17023SJohn Marino DECL_IS_NOVOPS (decl) = true; 1920*e4b17023SJohn Marino set_builtin_decl (BUILT_IN_PREFETCH, decl, false); 1921*e4b17023SJohn Marino } 1922*e4b17023SJohn Marino 1923*e4b17023SJohn Marino /* We assume that size of cache line is a power of two, so verify this 1924*e4b17023SJohn Marino here. */ 1925*e4b17023SJohn Marino gcc_assert ((PREFETCH_BLOCK & (PREFETCH_BLOCK - 1)) == 0); 1926*e4b17023SJohn Marino 1927*e4b17023SJohn Marino FOR_EACH_LOOP (li, loop, LI_FROM_INNERMOST) 1928*e4b17023SJohn Marino { 1929*e4b17023SJohn Marino if (dump_file && (dump_flags & TDF_DETAILS)) 1930*e4b17023SJohn Marino fprintf (dump_file, "Processing loop %d:\n", loop->num); 1931*e4b17023SJohn Marino 1932*e4b17023SJohn Marino unrolled |= loop_prefetch_arrays (loop); 1933*e4b17023SJohn Marino 1934*e4b17023SJohn Marino if (dump_file && (dump_flags & TDF_DETAILS)) 1935*e4b17023SJohn Marino fprintf (dump_file, "\n\n"); 1936*e4b17023SJohn Marino } 1937*e4b17023SJohn Marino 1938*e4b17023SJohn Marino if (unrolled) 1939*e4b17023SJohn Marino { 1940*e4b17023SJohn Marino scev_reset (); 1941*e4b17023SJohn Marino todo_flags |= TODO_cleanup_cfg; 1942*e4b17023SJohn Marino } 1943*e4b17023SJohn Marino 1944*e4b17023SJohn Marino free_original_copy_tables (); 1945*e4b17023SJohn Marino return todo_flags; 1946*e4b17023SJohn Marino } 1947