xref: /dflybsd-src/contrib/gcc-8.0/gcc/tree-ssa-loop-prefetch.c (revision 38fd149817dfbff97799f62fcb70be98c4e32523)
1*38fd1498Szrj /* Array prefetching.
2*38fd1498Szrj    Copyright (C) 2005-2018 Free Software Foundation, Inc.
3*38fd1498Szrj 
4*38fd1498Szrj This file is part of GCC.
5*38fd1498Szrj 
6*38fd1498Szrj GCC is free software; you can redistribute it and/or modify it
7*38fd1498Szrj under the terms of the GNU General Public License as published by the
8*38fd1498Szrj Free Software Foundation; either version 3, or (at your option) any
9*38fd1498Szrj later version.
10*38fd1498Szrj 
11*38fd1498Szrj GCC is distributed in the hope that it will be useful, but WITHOUT
12*38fd1498Szrj ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13*38fd1498Szrj FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
14*38fd1498Szrj for more details.
15*38fd1498Szrj 
16*38fd1498Szrj You should have received a copy of the GNU General Public License
17*38fd1498Szrj along with GCC; see the file COPYING3.  If not see
18*38fd1498Szrj <http://www.gnu.org/licenses/>.  */
19*38fd1498Szrj 
20*38fd1498Szrj #include "config.h"
21*38fd1498Szrj #include "system.h"
22*38fd1498Szrj #include "coretypes.h"
23*38fd1498Szrj #include "backend.h"
24*38fd1498Szrj #include "target.h"
25*38fd1498Szrj #include "rtl.h"
26*38fd1498Szrj #include "tree.h"
27*38fd1498Szrj #include "gimple.h"
28*38fd1498Szrj #include "predict.h"
29*38fd1498Szrj #include "tree-pass.h"
30*38fd1498Szrj #include "gimple-ssa.h"
31*38fd1498Szrj #include "optabs-query.h"
32*38fd1498Szrj #include "tree-pretty-print.h"
33*38fd1498Szrj #include "fold-const.h"
34*38fd1498Szrj #include "stor-layout.h"
35*38fd1498Szrj #include "gimplify.h"
36*38fd1498Szrj #include "gimple-iterator.h"
37*38fd1498Szrj #include "gimplify-me.h"
38*38fd1498Szrj #include "tree-ssa-loop-ivopts.h"
39*38fd1498Szrj #include "tree-ssa-loop-manip.h"
40*38fd1498Szrj #include "tree-ssa-loop-niter.h"
41*38fd1498Szrj #include "tree-ssa-loop.h"
42*38fd1498Szrj #include "ssa.h"
43*38fd1498Szrj #include "tree-into-ssa.h"
44*38fd1498Szrj #include "cfgloop.h"
45*38fd1498Szrj #include "tree-scalar-evolution.h"
46*38fd1498Szrj #include "params.h"
47*38fd1498Szrj #include "langhooks.h"
48*38fd1498Szrj #include "tree-inline.h"
49*38fd1498Szrj #include "tree-data-ref.h"
50*38fd1498Szrj #include "diagnostic-core.h"
51*38fd1498Szrj #include "dbgcnt.h"
52*38fd1498Szrj 
53*38fd1498Szrj /* This pass inserts prefetch instructions to optimize cache usage during
54*38fd1498Szrj    accesses to arrays in loops.  It processes loops sequentially and:
55*38fd1498Szrj 
56*38fd1498Szrj    1) Gathers all memory references in the single loop.
   2) For each of the references it decides when it is profitable to prefetch
      it.  To do it, we evaluate the reuse among the accesses, and determine
      two values: PREFETCH_BEFORE (meaning that it only makes sense to do
      prefetching in the first PREFETCH_BEFORE iterations of the loop) and
      PREFETCH_MOD (meaning that it only makes sense to prefetch in the
      iterations of the loop that are zero modulo PREFETCH_MOD).  For example
      (assuming cache line size is 64 bytes, char has size 1 byte and there
      is no hardware sequential prefetch):
65*38fd1498Szrj 
66*38fd1498Szrj       char *a;
67*38fd1498Szrj       for (i = 0; i < max; i++)
68*38fd1498Szrj 	{
69*38fd1498Szrj 	  a[255] = ...;		(0)
70*38fd1498Szrj 	  a[i] = ...;		(1)
71*38fd1498Szrj 	  a[i + 64] = ...;	(2)
72*38fd1498Szrj 	  a[16*i] = ...;	(3)
73*38fd1498Szrj 	  a[187*i] = ...;	(4)
74*38fd1498Szrj 	  a[187*i + 50] = ...;	(5)
75*38fd1498Szrj 	}
76*38fd1498Szrj 
77*38fd1498Szrj        (0) obviously has PREFETCH_BEFORE 1
78*38fd1498Szrj        (1) has PREFETCH_BEFORE 64, since (2) accesses the same memory
79*38fd1498Szrj            location 64 iterations before it, and PREFETCH_MOD 64 (since
80*38fd1498Szrj 	   it hits the same cache line otherwise).
81*38fd1498Szrj        (2) has PREFETCH_MOD 64
82*38fd1498Szrj        (3) has PREFETCH_MOD 4
83*38fd1498Szrj        (4) has PREFETCH_MOD 1.  We do not set PREFETCH_BEFORE here, since
84*38fd1498Szrj            the cache line accessed by (5) is the same with probability only
85*38fd1498Szrj 	   7/32.
86*38fd1498Szrj        (5) has PREFETCH_MOD 1 as well.
87*38fd1498Szrj 
88*38fd1498Szrj       Additionally, we use data dependence analysis to determine for each
89*38fd1498Szrj       reference the distance till the first reuse; this information is used
90*38fd1498Szrj       to determine the temporality of the issued prefetch instruction.
91*38fd1498Szrj 
92*38fd1498Szrj    3) We determine how much ahead we need to prefetch.  The number of
93*38fd1498Szrj       iterations needed is time to fetch / time spent in one iteration of
94*38fd1498Szrj       the loop.  The problem is that we do not know either of these values,
95*38fd1498Szrj       so we just make a heuristic guess based on a magic (possibly)
96*38fd1498Szrj       target-specific constant and size of the loop.
97*38fd1498Szrj 
98*38fd1498Szrj    4) Determine which of the references we prefetch.  We take into account
99*38fd1498Szrj       that there is a maximum number of simultaneous prefetches (provided
100*38fd1498Szrj       by machine description).  We prefetch as many prefetches as possible
101*38fd1498Szrj       while still within this bound (starting with those with lowest
102*38fd1498Szrj       prefetch_mod, since they are responsible for most of the cache
103*38fd1498Szrj       misses).
104*38fd1498Szrj 
105*38fd1498Szrj    5) We unroll and peel loops so that we are able to satisfy PREFETCH_MOD
106*38fd1498Szrj       and PREFETCH_BEFORE requirements (within some bounds), and to avoid
107*38fd1498Szrj       prefetching nonaccessed memory.
108*38fd1498Szrj       TODO -- actually implement peeling.
109*38fd1498Szrj 
110*38fd1498Szrj    6) We actually emit the prefetch instructions.  ??? Perhaps emit the
111*38fd1498Szrj       prefetch instructions with guards in cases where 5) was not sufficient
112*38fd1498Szrj       to satisfy the constraints?
113*38fd1498Szrj 
114*38fd1498Szrj    A cost model is implemented to determine whether or not prefetching is
115*38fd1498Szrj    profitable for a given loop.  The cost model has three heuristics:
116*38fd1498Szrj 
117*38fd1498Szrj    1. Function trip_count_to_ahead_ratio_too_small_p implements a
118*38fd1498Szrj       heuristic that determines whether or not the loop has too few
119*38fd1498Szrj       iterations (compared to ahead).  Prefetching is not likely to be
120*38fd1498Szrj       beneficial if the trip count to ahead ratio is below a certain
121*38fd1498Szrj       minimum.
122*38fd1498Szrj 
123*38fd1498Szrj    2. Function mem_ref_count_reasonable_p implements a heuristic that
124*38fd1498Szrj       determines whether the given loop has enough CPU ops that can be
125*38fd1498Szrj       overlapped with cache missing memory ops.  If not, the loop
126*38fd1498Szrj       won't benefit from prefetching.  In the implementation,
127*38fd1498Szrj       prefetching is not considered beneficial if the ratio between
128*38fd1498Szrj       the instruction count and the mem ref count is below a certain
129*38fd1498Szrj       minimum.
130*38fd1498Szrj 
   3. Function insn_to_prefetch_ratio_too_small_p implements a
      heuristic that disables prefetching in a loop if the prefetching
      cost is above a certain limit.  The relative prefetching cost is
      estimated by taking the ratio between the prefetch count and the
      total instruction count (this models the I-cache cost).
136*38fd1498Szrj 
137*38fd1498Szrj    The limits used in these heuristics are defined as parameters with
138*38fd1498Szrj    reasonable default values. Machine-specific default values will be
139*38fd1498Szrj    added later.
140*38fd1498Szrj 
141*38fd1498Szrj    Some other TODO:
142*38fd1498Szrj       -- write and use more general reuse analysis (that could be also used
143*38fd1498Szrj 	 in other cache aimed loop optimizations)
144*38fd1498Szrj       -- make it behave sanely together with the prefetches given by user
145*38fd1498Szrj 	 (now we just ignore them; at the very least we should avoid
146*38fd1498Szrj 	 optimizing loops in that user put his own prefetches)
147*38fd1498Szrj       -- we assume cache line size alignment of arrays; this could be
148*38fd1498Szrj 	 improved.  */
149*38fd1498Szrj 
/* Magic constants follow.  These should be replaced by machine specific
   numbers.  Each one guarded by #ifndef may be overridden by defining the
   macro before this point.  */

/* True if write can be prefetched by a read prefetch.  */

#ifndef WRITE_CAN_USE_READ_PREFETCH
#define WRITE_CAN_USE_READ_PREFETCH 1
#endif

/* True if read can be prefetched by a write prefetch.  */

#ifndef READ_CAN_USE_WRITE_PREFETCH
#define READ_CAN_USE_WRITE_PREFETCH 0
#endif

/* The size of the block loaded by a single prefetch.  Usually, this is
   the same as cache line size (at the moment, we only consider one level
   of cache hierarchy).  */

#ifndef PREFETCH_BLOCK
#define PREFETCH_BLOCK L1_CACHE_LINE_SIZE
#endif

/* Do we have a forward hardware sequential prefetching?  */

#ifndef HAVE_FORWARD_PREFETCH
#define HAVE_FORWARD_PREFETCH 0
#endif

/* Do we have a backward hardware sequential prefetching?  */

#ifndef HAVE_BACKWARD_PREFETCH
#define HAVE_BACKWARD_PREFETCH 0
#endif

/* In some cases we are only able to determine that there is a certain
   probability that the two accesses hit the same cache line.  In this
   case, we issue the prefetches for both of them if this probability
   is less than (1000 - ACCEPTABLE_MISS_RATE) per thousand.  */

#ifndef ACCEPTABLE_MISS_RATE
#define ACCEPTABLE_MISS_RATE 50
#endif

/* L1_CACHE_SIZE and L2_CACHE_SIZE scaled by 1024 and converted to
   unsigned.  */

#define L1_CACHE_SIZE_BYTES ((unsigned) (L1_CACHE_SIZE * 1024))
#define L2_CACHE_SIZE_BYTES ((unsigned) (L2_CACHE_SIZE * 1024))

/* We consider a memory access nontemporal if it is not reused sooner than
   after L2_CACHE_SIZE_BYTES of memory are accessed.  However, we ignore
   accesses closer than L1_CACHE_SIZE_BYTES / NONTEMPORAL_FRACTION,
   so that we use nontemporal prefetches e.g. if single memory location
   is accessed several times in a single iteration of the loop.  */
#define NONTEMPORAL_FRACTION 16

/* In case we have to emit a memory fence instruction after the loop that
   uses nontemporal stores, this defines the builtin to use.  */

#ifndef FENCE_FOLLOWING_MOVNT
#define FENCE_FOLLOWING_MOVNT NULL_TREE
#endif

/* It is not profitable to prefetch when the trip count is not at
   least TRIP_COUNT_TO_AHEAD_RATIO times the prefetch ahead distance.
   For example, in a loop with a prefetch ahead distance of 10,
   supposing that TRIP_COUNT_TO_AHEAD_RATIO is equal to 4, it is
   profitable to prefetch when the trip count is greater or equal to
   40.  In that case, 30 out of the 40 iterations will benefit from
   prefetching.  */

#ifndef TRIP_COUNT_TO_AHEAD_RATIO
#define TRIP_COUNT_TO_AHEAD_RATIO 4
#endif
222*38fd1498Szrj 
223*38fd1498Szrj /* The group of references between that reuse may occur.  */
224*38fd1498Szrj 
225*38fd1498Szrj struct mem_ref_group
226*38fd1498Szrj {
227*38fd1498Szrj   tree base;			/* Base of the reference.  */
228*38fd1498Szrj   tree step;			/* Step of the reference.  */
229*38fd1498Szrj   struct mem_ref *refs;		/* References in the group.  */
230*38fd1498Szrj   struct mem_ref_group *next;	/* Next group of references.  */
231*38fd1498Szrj   unsigned int uid;		/* Group UID, used only for debugging.  */
232*38fd1498Szrj };
233*38fd1498Szrj 
/* Assigned to PREFETCH_BEFORE when all iterations are to be prefetched.  */

#define PREFETCH_ALL		HOST_WIDE_INT_M1U

/* Do not generate a prefetch if the unroll factor is significantly less
   than what is required by the prefetch.  This is to avoid redundant
   prefetches.  For example, when prefetch_mod is 16 and unroll_factor is
   2, prefetching requires unrolling the loop 16 times, but
   the loop is actually unrolled twice.  In this case (ratio = 8),
   prefetching is not likely to be beneficial.  */

#ifndef PREFETCH_MOD_TO_UNROLL_FACTOR_RATIO
#define PREFETCH_MOD_TO_UNROLL_FACTOR_RATIO 4
#endif

/* Some of the prefetch computations have quadratic complexity.  We want to
   avoid huge compile times and, therefore, want to limit the amount of
   memory references per loop where we consider prefetching.  */

#ifndef PREFETCH_MAX_MEM_REFS_PER_LOOP
#define PREFETCH_MAX_MEM_REFS_PER_LOOP 200
#endif
256*38fd1498Szrj 
257*38fd1498Szrj /* The memory reference.  */
258*38fd1498Szrj 
259*38fd1498Szrj struct mem_ref
260*38fd1498Szrj {
261*38fd1498Szrj   gimple *stmt;			/* Statement in that the reference appears.  */
262*38fd1498Szrj   tree mem;			/* The reference.  */
263*38fd1498Szrj   HOST_WIDE_INT delta;		/* Constant offset of the reference.  */
264*38fd1498Szrj   struct mem_ref_group *group;	/* The group of references it belongs to.  */
265*38fd1498Szrj   unsigned HOST_WIDE_INT prefetch_mod;
266*38fd1498Szrj 				/* Prefetch only each PREFETCH_MOD-th
267*38fd1498Szrj 				   iteration.  */
268*38fd1498Szrj   unsigned HOST_WIDE_INT prefetch_before;
269*38fd1498Szrj 				/* Prefetch only first PREFETCH_BEFORE
270*38fd1498Szrj 				   iterations.  */
271*38fd1498Szrj   unsigned reuse_distance;	/* The amount of data accessed before the first
272*38fd1498Szrj 				   reuse of this value.  */
273*38fd1498Szrj   struct mem_ref *next;		/* The next reference in the group.  */
274*38fd1498Szrj   unsigned int uid;		/* Ref UID, used only for debugging.  */
275*38fd1498Szrj   unsigned write_p : 1;		/* Is it a write?  */
276*38fd1498Szrj   unsigned independent_p : 1;	/* True if the reference is independent on
277*38fd1498Szrj 				   all other references inside the loop.  */
278*38fd1498Szrj   unsigned issue_prefetch_p : 1;	/* Should we really issue the prefetch?  */
279*38fd1498Szrj   unsigned storent_p : 1;	/* True if we changed the store to a
280*38fd1498Szrj 				   nontemporal one.  */
281*38fd1498Szrj };
282*38fd1498Szrj 
283*38fd1498Szrj /* Dumps information about memory reference */
284*38fd1498Szrj static void
dump_mem_details(FILE * file,tree base,tree step,HOST_WIDE_INT delta,bool write_p)285*38fd1498Szrj dump_mem_details (FILE *file, tree base, tree step,
286*38fd1498Szrj 	    HOST_WIDE_INT delta, bool write_p)
287*38fd1498Szrj {
288*38fd1498Szrj   fprintf (file, "(base ");
289*38fd1498Szrj   print_generic_expr (file, base, TDF_SLIM);
290*38fd1498Szrj   fprintf (file, ", step ");
291*38fd1498Szrj   if (cst_and_fits_in_hwi (step))
292*38fd1498Szrj     fprintf (file, HOST_WIDE_INT_PRINT_DEC, int_cst_value (step));
293*38fd1498Szrj   else
294*38fd1498Szrj     print_generic_expr (file, step, TDF_SLIM);
295*38fd1498Szrj   fprintf (file, ")\n");
296*38fd1498Szrj   fprintf (file, "  delta " HOST_WIDE_INT_PRINT_DEC "\n", delta);
297*38fd1498Szrj   fprintf (file, "  %s\n\n", write_p ? "write" : "read");
298*38fd1498Szrj }
299*38fd1498Szrj 
300*38fd1498Szrj /* Dumps information about reference REF to FILE.  */
301*38fd1498Szrj 
302*38fd1498Szrj static void
dump_mem_ref(FILE * file,struct mem_ref * ref)303*38fd1498Szrj dump_mem_ref (FILE *file, struct mem_ref *ref)
304*38fd1498Szrj {
305*38fd1498Szrj   fprintf (file, "reference %u:%u (", ref->group->uid, ref->uid);
306*38fd1498Szrj   print_generic_expr (file, ref->mem, TDF_SLIM);
307*38fd1498Szrj   fprintf (file, ")\n");
308*38fd1498Szrj }
309*38fd1498Szrj 
310*38fd1498Szrj /* Finds a group with BASE and STEP in GROUPS, or creates one if it does not
311*38fd1498Szrj    exist.  */
312*38fd1498Szrj 
313*38fd1498Szrj static struct mem_ref_group *
find_or_create_group(struct mem_ref_group ** groups,tree base,tree step)314*38fd1498Szrj find_or_create_group (struct mem_ref_group **groups, tree base, tree step)
315*38fd1498Szrj {
316*38fd1498Szrj   /* Global count for setting struct mem_ref_group->uid.  */
317*38fd1498Szrj   static unsigned int last_mem_ref_group_uid = 0;
318*38fd1498Szrj 
319*38fd1498Szrj   struct mem_ref_group *group;
320*38fd1498Szrj 
321*38fd1498Szrj   for (; *groups; groups = &(*groups)->next)
322*38fd1498Szrj     {
323*38fd1498Szrj       if (operand_equal_p ((*groups)->step, step, 0)
324*38fd1498Szrj 	  && operand_equal_p ((*groups)->base, base, 0))
325*38fd1498Szrj 	return *groups;
326*38fd1498Szrj 
327*38fd1498Szrj       /* If step is an integer constant, keep the list of groups sorted
328*38fd1498Szrj          by decreasing step.  */
329*38fd1498Szrj       if (cst_and_fits_in_hwi ((*groups)->step) && cst_and_fits_in_hwi (step)
330*38fd1498Szrj 	  && int_cst_value ((*groups)->step) < int_cst_value (step))
331*38fd1498Szrj 	break;
332*38fd1498Szrj     }
333*38fd1498Szrj 
334*38fd1498Szrj   group = XNEW (struct mem_ref_group);
335*38fd1498Szrj   group->base = base;
336*38fd1498Szrj   group->step = step;
337*38fd1498Szrj   group->refs = NULL;
338*38fd1498Szrj   group->uid = ++last_mem_ref_group_uid;
339*38fd1498Szrj   group->next = *groups;
340*38fd1498Szrj   *groups = group;
341*38fd1498Szrj 
342*38fd1498Szrj   return group;
343*38fd1498Szrj }
344*38fd1498Szrj 
345*38fd1498Szrj /* Records a memory reference MEM in GROUP with offset DELTA and write status
346*38fd1498Szrj    WRITE_P.  The reference occurs in statement STMT.  */
347*38fd1498Szrj 
348*38fd1498Szrj static void
record_ref(struct mem_ref_group * group,gimple * stmt,tree mem,HOST_WIDE_INT delta,bool write_p)349*38fd1498Szrj record_ref (struct mem_ref_group *group, gimple *stmt, tree mem,
350*38fd1498Szrj 	    HOST_WIDE_INT delta, bool write_p)
351*38fd1498Szrj {
352*38fd1498Szrj   unsigned int last_mem_ref_uid = 0;
353*38fd1498Szrj   struct mem_ref **aref;
354*38fd1498Szrj 
355*38fd1498Szrj   /* Do not record the same address twice.  */
356*38fd1498Szrj   for (aref = &group->refs; *aref; aref = &(*aref)->next)
357*38fd1498Szrj     {
358*38fd1498Szrj       last_mem_ref_uid = (*aref)->uid;
359*38fd1498Szrj 
360*38fd1498Szrj       /* It does not have to be possible for write reference to reuse the read
361*38fd1498Szrj 	 prefetch, or vice versa.  */
362*38fd1498Szrj       if (!WRITE_CAN_USE_READ_PREFETCH
363*38fd1498Szrj 	  && write_p
364*38fd1498Szrj 	  && !(*aref)->write_p)
365*38fd1498Szrj 	continue;
366*38fd1498Szrj       if (!READ_CAN_USE_WRITE_PREFETCH
367*38fd1498Szrj 	  && !write_p
368*38fd1498Szrj 	  && (*aref)->write_p)
369*38fd1498Szrj 	continue;
370*38fd1498Szrj 
371*38fd1498Szrj       if ((*aref)->delta == delta)
372*38fd1498Szrj 	return;
373*38fd1498Szrj     }
374*38fd1498Szrj 
375*38fd1498Szrj   (*aref) = XNEW (struct mem_ref);
376*38fd1498Szrj   (*aref)->stmt = stmt;
377*38fd1498Szrj   (*aref)->mem = mem;
378*38fd1498Szrj   (*aref)->delta = delta;
379*38fd1498Szrj   (*aref)->write_p = write_p;
380*38fd1498Szrj   (*aref)->prefetch_before = PREFETCH_ALL;
381*38fd1498Szrj   (*aref)->prefetch_mod = 1;
382*38fd1498Szrj   (*aref)->reuse_distance = 0;
383*38fd1498Szrj   (*aref)->issue_prefetch_p = false;
384*38fd1498Szrj   (*aref)->group = group;
385*38fd1498Szrj   (*aref)->next = NULL;
386*38fd1498Szrj   (*aref)->independent_p = false;
387*38fd1498Szrj   (*aref)->storent_p = false;
388*38fd1498Szrj   (*aref)->uid = last_mem_ref_uid + 1;
389*38fd1498Szrj 
390*38fd1498Szrj   if (dump_file && (dump_flags & TDF_DETAILS))
391*38fd1498Szrj     {
392*38fd1498Szrj       dump_mem_ref (dump_file, *aref);
393*38fd1498Szrj 
394*38fd1498Szrj       fprintf (dump_file, "  group %u ", group->uid);
395*38fd1498Szrj       dump_mem_details (dump_file, group->base, group->step, delta,
396*38fd1498Szrj 			write_p);
397*38fd1498Szrj     }
398*38fd1498Szrj }
399*38fd1498Szrj 
400*38fd1498Szrj /* Release memory references in GROUPS.  */
401*38fd1498Szrj 
402*38fd1498Szrj static void
release_mem_refs(struct mem_ref_group * groups)403*38fd1498Szrj release_mem_refs (struct mem_ref_group *groups)
404*38fd1498Szrj {
405*38fd1498Szrj   struct mem_ref_group *next_g;
406*38fd1498Szrj   struct mem_ref *ref, *next_r;
407*38fd1498Szrj 
408*38fd1498Szrj   for (; groups; groups = next_g)
409*38fd1498Szrj     {
410*38fd1498Szrj       next_g = groups->next;
411*38fd1498Szrj       for (ref = groups->refs; ref; ref = next_r)
412*38fd1498Szrj 	{
413*38fd1498Szrj 	  next_r = ref->next;
414*38fd1498Szrj 	  free (ref);
415*38fd1498Szrj 	}
416*38fd1498Szrj       free (groups);
417*38fd1498Szrj     }
418*38fd1498Szrj }
419*38fd1498Szrj 
420*38fd1498Szrj /* A structure used to pass arguments to idx_analyze_ref.  */
421*38fd1498Szrj 
422*38fd1498Szrj struct ar_data
423*38fd1498Szrj {
424*38fd1498Szrj   struct loop *loop;			/* Loop of the reference.  */
425*38fd1498Szrj   gimple *stmt;				/* Statement of the reference.  */
426*38fd1498Szrj   tree *step;				/* Step of the memory reference.  */
427*38fd1498Szrj   HOST_WIDE_INT *delta;			/* Offset of the memory reference.  */
428*38fd1498Szrj };
429*38fd1498Szrj 
430*38fd1498Szrj /* Analyzes a single INDEX of a memory reference to obtain information
431*38fd1498Szrj    described at analyze_ref.  Callback for for_each_index.  */
432*38fd1498Szrj 
433*38fd1498Szrj static bool
idx_analyze_ref(tree base,tree * index,void * data)434*38fd1498Szrj idx_analyze_ref (tree base, tree *index, void *data)
435*38fd1498Szrj {
436*38fd1498Szrj   struct ar_data *ar_data = (struct ar_data *) data;
437*38fd1498Szrj   tree ibase, step, stepsize;
438*38fd1498Szrj   HOST_WIDE_INT idelta = 0, imult = 1;
439*38fd1498Szrj   affine_iv iv;
440*38fd1498Szrj 
441*38fd1498Szrj   if (!simple_iv (ar_data->loop, loop_containing_stmt (ar_data->stmt),
442*38fd1498Szrj 		  *index, &iv, true))
443*38fd1498Szrj     return false;
444*38fd1498Szrj   ibase = iv.base;
445*38fd1498Szrj   step = iv.step;
446*38fd1498Szrj 
447*38fd1498Szrj   if (TREE_CODE (ibase) == POINTER_PLUS_EXPR
448*38fd1498Szrj       && cst_and_fits_in_hwi (TREE_OPERAND (ibase, 1)))
449*38fd1498Szrj     {
450*38fd1498Szrj       idelta = int_cst_value (TREE_OPERAND (ibase, 1));
451*38fd1498Szrj       ibase = TREE_OPERAND (ibase, 0);
452*38fd1498Szrj     }
453*38fd1498Szrj   if (cst_and_fits_in_hwi (ibase))
454*38fd1498Szrj     {
455*38fd1498Szrj       idelta += int_cst_value (ibase);
456*38fd1498Szrj       ibase = build_int_cst (TREE_TYPE (ibase), 0);
457*38fd1498Szrj     }
458*38fd1498Szrj 
459*38fd1498Szrj   if (TREE_CODE (base) == ARRAY_REF)
460*38fd1498Szrj     {
461*38fd1498Szrj       stepsize = array_ref_element_size (base);
462*38fd1498Szrj       if (!cst_and_fits_in_hwi (stepsize))
463*38fd1498Szrj 	return false;
464*38fd1498Szrj       imult = int_cst_value (stepsize);
465*38fd1498Szrj       step = fold_build2 (MULT_EXPR, sizetype,
466*38fd1498Szrj 			  fold_convert (sizetype, step),
467*38fd1498Szrj 			  fold_convert (sizetype, stepsize));
468*38fd1498Szrj       idelta *= imult;
469*38fd1498Szrj     }
470*38fd1498Szrj 
471*38fd1498Szrj   if (*ar_data->step == NULL_TREE)
472*38fd1498Szrj     *ar_data->step = step;
473*38fd1498Szrj   else
474*38fd1498Szrj     *ar_data->step = fold_build2 (PLUS_EXPR, sizetype,
475*38fd1498Szrj 				  fold_convert (sizetype, *ar_data->step),
476*38fd1498Szrj 				  fold_convert (sizetype, step));
477*38fd1498Szrj   *ar_data->delta += idelta;
478*38fd1498Szrj   *index = ibase;
479*38fd1498Szrj 
480*38fd1498Szrj   return true;
481*38fd1498Szrj }
482*38fd1498Szrj 
483*38fd1498Szrj /* Tries to express REF_P in shape &BASE + STEP * iter + DELTA, where DELTA and
484*38fd1498Szrj    STEP are integer constants and iter is number of iterations of LOOP.  The
485*38fd1498Szrj    reference occurs in statement STMT.  Strips nonaddressable component
486*38fd1498Szrj    references from REF_P.  */
487*38fd1498Szrj 
488*38fd1498Szrj static bool
analyze_ref(struct loop * loop,tree * ref_p,tree * base,tree * step,HOST_WIDE_INT * delta,gimple * stmt)489*38fd1498Szrj analyze_ref (struct loop *loop, tree *ref_p, tree *base,
490*38fd1498Szrj 	     tree *step, HOST_WIDE_INT *delta,
491*38fd1498Szrj 	     gimple *stmt)
492*38fd1498Szrj {
493*38fd1498Szrj   struct ar_data ar_data;
494*38fd1498Szrj   tree off;
495*38fd1498Szrj   HOST_WIDE_INT bit_offset;
496*38fd1498Szrj   tree ref = *ref_p;
497*38fd1498Szrj 
498*38fd1498Szrj   *step = NULL_TREE;
499*38fd1498Szrj   *delta = 0;
500*38fd1498Szrj 
501*38fd1498Szrj   /* First strip off the component references.  Ignore bitfields.
502*38fd1498Szrj      Also strip off the real and imagine parts of a complex, so that
503*38fd1498Szrj      they can have the same base.  */
504*38fd1498Szrj   if (TREE_CODE (ref) == REALPART_EXPR
505*38fd1498Szrj       || TREE_CODE (ref) == IMAGPART_EXPR
506*38fd1498Szrj       || (TREE_CODE (ref) == COMPONENT_REF
507*38fd1498Szrj           && DECL_NONADDRESSABLE_P (TREE_OPERAND (ref, 1))))
508*38fd1498Szrj     {
509*38fd1498Szrj       if (TREE_CODE (ref) == IMAGPART_EXPR)
510*38fd1498Szrj         *delta += int_size_in_bytes (TREE_TYPE (ref));
511*38fd1498Szrj       ref = TREE_OPERAND (ref, 0);
512*38fd1498Szrj     }
513*38fd1498Szrj 
514*38fd1498Szrj   *ref_p = ref;
515*38fd1498Szrj 
516*38fd1498Szrj   for (; TREE_CODE (ref) == COMPONENT_REF; ref = TREE_OPERAND (ref, 0))
517*38fd1498Szrj     {
518*38fd1498Szrj       off = DECL_FIELD_BIT_OFFSET (TREE_OPERAND (ref, 1));
519*38fd1498Szrj       bit_offset = TREE_INT_CST_LOW (off);
520*38fd1498Szrj       gcc_assert (bit_offset % BITS_PER_UNIT == 0);
521*38fd1498Szrj 
522*38fd1498Szrj       *delta += bit_offset / BITS_PER_UNIT;
523*38fd1498Szrj     }
524*38fd1498Szrj 
525*38fd1498Szrj   *base = unshare_expr (ref);
526*38fd1498Szrj   ar_data.loop = loop;
527*38fd1498Szrj   ar_data.stmt = stmt;
528*38fd1498Szrj   ar_data.step = step;
529*38fd1498Szrj   ar_data.delta = delta;
530*38fd1498Szrj   return for_each_index (base, idx_analyze_ref, &ar_data);
531*38fd1498Szrj }
532*38fd1498Szrj 
533*38fd1498Szrj /* Record a memory reference REF to the list REFS.  The reference occurs in
534*38fd1498Szrj    LOOP in statement STMT and it is write if WRITE_P.  Returns true if the
535*38fd1498Szrj    reference was recorded, false otherwise.  */
536*38fd1498Szrj 
537*38fd1498Szrj static bool
gather_memory_references_ref(struct loop * loop,struct mem_ref_group ** refs,tree ref,bool write_p,gimple * stmt)538*38fd1498Szrj gather_memory_references_ref (struct loop *loop, struct mem_ref_group **refs,
539*38fd1498Szrj 			      tree ref, bool write_p, gimple *stmt)
540*38fd1498Szrj {
541*38fd1498Szrj   tree base, step;
542*38fd1498Szrj   HOST_WIDE_INT delta;
543*38fd1498Szrj   struct mem_ref_group *agrp;
544*38fd1498Szrj 
545*38fd1498Szrj   if (get_base_address (ref) == NULL)
546*38fd1498Szrj     return false;
547*38fd1498Szrj 
548*38fd1498Szrj   if (!analyze_ref (loop, &ref, &base, &step, &delta, stmt))
549*38fd1498Szrj     return false;
550*38fd1498Szrj   /* If analyze_ref fails the default is a NULL_TREE.  We can stop here.  */
551*38fd1498Szrj   if (step == NULL_TREE)
552*38fd1498Szrj     return false;
553*38fd1498Szrj 
554*38fd1498Szrj   /* Stop if the address of BASE could not be taken.  */
555*38fd1498Szrj   if (may_be_nonaddressable_p (base))
556*38fd1498Szrj     return false;
557*38fd1498Szrj 
558*38fd1498Szrj   /* Limit non-constant step prefetching only to the innermost loops and
559*38fd1498Szrj      only when the step is loop invariant in the entire loop nest. */
560*38fd1498Szrj   if (!cst_and_fits_in_hwi (step))
561*38fd1498Szrj     {
562*38fd1498Szrj       if (loop->inner != NULL)
563*38fd1498Szrj         {
564*38fd1498Szrj           if (dump_file && (dump_flags & TDF_DETAILS))
565*38fd1498Szrj             {
566*38fd1498Szrj               fprintf (dump_file, "Memory expression %p\n",(void *) ref );
567*38fd1498Szrj 	      print_generic_expr (dump_file, ref, TDF_SLIM);
568*38fd1498Szrj 	      fprintf (dump_file,":");
569*38fd1498Szrj               dump_mem_details (dump_file, base, step, delta, write_p);
570*38fd1498Szrj               fprintf (dump_file,
571*38fd1498Szrj                        "Ignoring %p, non-constant step prefetching is "
572*38fd1498Szrj                        "limited to inner most loops \n",
573*38fd1498Szrj                        (void *) ref);
574*38fd1498Szrj             }
575*38fd1498Szrj             return false;
576*38fd1498Szrj          }
577*38fd1498Szrj       else
578*38fd1498Szrj         {
579*38fd1498Szrj           if (!expr_invariant_in_loop_p (loop_outermost (loop), step))
580*38fd1498Szrj           {
581*38fd1498Szrj             if (dump_file && (dump_flags & TDF_DETAILS))
582*38fd1498Szrj               {
583*38fd1498Szrj                 fprintf (dump_file, "Memory expression %p\n",(void *) ref );
584*38fd1498Szrj 		print_generic_expr (dump_file, ref, TDF_SLIM);
585*38fd1498Szrj                 fprintf (dump_file,":");
586*38fd1498Szrj                 dump_mem_details (dump_file, base, step, delta, write_p);
587*38fd1498Szrj                 fprintf (dump_file,
588*38fd1498Szrj                          "Not prefetching, ignoring %p due to "
589*38fd1498Szrj                          "loop variant step\n",
590*38fd1498Szrj                          (void *) ref);
591*38fd1498Szrj               }
592*38fd1498Szrj               return false;
593*38fd1498Szrj             }
594*38fd1498Szrj         }
595*38fd1498Szrj     }
596*38fd1498Szrj 
597*38fd1498Szrj   /* Now we know that REF = &BASE + STEP * iter + DELTA, where DELTA and STEP
598*38fd1498Szrj      are integer constants.  */
599*38fd1498Szrj   agrp = find_or_create_group (refs, base, step);
600*38fd1498Szrj   record_ref (agrp, stmt, ref, delta, write_p);
601*38fd1498Szrj 
602*38fd1498Szrj   return true;
603*38fd1498Szrj }
604*38fd1498Szrj 
605*38fd1498Szrj /* Record the suitable memory references in LOOP.  NO_OTHER_REFS is set to
606*38fd1498Szrj    true if there are no other memory references inside the loop.  */
607*38fd1498Szrj 
608*38fd1498Szrj static struct mem_ref_group *
gather_memory_references(struct loop * loop,bool * no_other_refs,unsigned * ref_count)609*38fd1498Szrj gather_memory_references (struct loop *loop, bool *no_other_refs, unsigned *ref_count)
610*38fd1498Szrj {
611*38fd1498Szrj   basic_block *body = get_loop_body_in_dom_order (loop);
612*38fd1498Szrj   basic_block bb;
613*38fd1498Szrj   unsigned i;
614*38fd1498Szrj   gimple_stmt_iterator bsi;
615*38fd1498Szrj   gimple *stmt;
616*38fd1498Szrj   tree lhs, rhs;
617*38fd1498Szrj   struct mem_ref_group *refs = NULL;
618*38fd1498Szrj 
619*38fd1498Szrj   *no_other_refs = true;
620*38fd1498Szrj   *ref_count = 0;
621*38fd1498Szrj 
622*38fd1498Szrj   /* Scan the loop body in order, so that the former references precede the
623*38fd1498Szrj      later ones.  */
624*38fd1498Szrj   for (i = 0; i < loop->num_nodes; i++)
625*38fd1498Szrj     {
626*38fd1498Szrj       bb = body[i];
627*38fd1498Szrj       if (bb->loop_father != loop)
628*38fd1498Szrj 	continue;
629*38fd1498Szrj 
630*38fd1498Szrj       for (bsi = gsi_start_bb (bb); !gsi_end_p (bsi); gsi_next (&bsi))
631*38fd1498Szrj 	{
632*38fd1498Szrj 	  stmt = gsi_stmt (bsi);
633*38fd1498Szrj 
634*38fd1498Szrj 	  if (gimple_code (stmt) != GIMPLE_ASSIGN)
635*38fd1498Szrj 	    {
636*38fd1498Szrj 	      if (gimple_vuse (stmt)
637*38fd1498Szrj 		  || (is_gimple_call (stmt)
638*38fd1498Szrj 		      && !(gimple_call_flags (stmt) & ECF_CONST)))
639*38fd1498Szrj 		*no_other_refs = false;
640*38fd1498Szrj 	      continue;
641*38fd1498Szrj 	    }
642*38fd1498Szrj 
643*38fd1498Szrj 	  if (! gimple_vuse (stmt))
644*38fd1498Szrj 	    continue;
645*38fd1498Szrj 
646*38fd1498Szrj 	  lhs = gimple_assign_lhs (stmt);
647*38fd1498Szrj 	  rhs = gimple_assign_rhs1 (stmt);
648*38fd1498Szrj 
649*38fd1498Szrj 	  if (REFERENCE_CLASS_P (rhs))
650*38fd1498Szrj 	    {
651*38fd1498Szrj 	    *no_other_refs &= gather_memory_references_ref (loop, &refs,
652*38fd1498Szrj 							    rhs, false, stmt);
653*38fd1498Szrj 	    *ref_count += 1;
654*38fd1498Szrj 	    }
655*38fd1498Szrj 	  if (REFERENCE_CLASS_P (lhs))
656*38fd1498Szrj 	    {
657*38fd1498Szrj 	    *no_other_refs &= gather_memory_references_ref (loop, &refs,
658*38fd1498Szrj 							    lhs, true, stmt);
659*38fd1498Szrj 	    *ref_count += 1;
660*38fd1498Szrj 	    }
661*38fd1498Szrj 	}
662*38fd1498Szrj     }
663*38fd1498Szrj   free (body);
664*38fd1498Szrj 
665*38fd1498Szrj   return refs;
666*38fd1498Szrj }
667*38fd1498Szrj 
668*38fd1498Szrj /* Prune the prefetch candidate REF using the self-reuse.  */
669*38fd1498Szrj 
670*38fd1498Szrj static void
prune_ref_by_self_reuse(struct mem_ref * ref)671*38fd1498Szrj prune_ref_by_self_reuse (struct mem_ref *ref)
672*38fd1498Szrj {
673*38fd1498Szrj   HOST_WIDE_INT step;
674*38fd1498Szrj   bool backward;
675*38fd1498Szrj 
676*38fd1498Szrj   /* If the step size is non constant, we cannot calculate prefetch_mod.  */
677*38fd1498Szrj   if (!cst_and_fits_in_hwi (ref->group->step))
678*38fd1498Szrj     return;
679*38fd1498Szrj 
680*38fd1498Szrj   step = int_cst_value (ref->group->step);
681*38fd1498Szrj 
682*38fd1498Szrj   backward = step < 0;
683*38fd1498Szrj 
684*38fd1498Szrj   if (step == 0)
685*38fd1498Szrj     {
686*38fd1498Szrj       /* Prefetch references to invariant address just once.  */
687*38fd1498Szrj       ref->prefetch_before = 1;
688*38fd1498Szrj       return;
689*38fd1498Szrj     }
690*38fd1498Szrj 
691*38fd1498Szrj   if (backward)
692*38fd1498Szrj     step = -step;
693*38fd1498Szrj 
694*38fd1498Szrj   if (step > PREFETCH_BLOCK)
695*38fd1498Szrj     return;
696*38fd1498Szrj 
697*38fd1498Szrj   if ((backward && HAVE_BACKWARD_PREFETCH)
698*38fd1498Szrj       || (!backward && HAVE_FORWARD_PREFETCH))
699*38fd1498Szrj     {
700*38fd1498Szrj       ref->prefetch_before = 1;
701*38fd1498Szrj       return;
702*38fd1498Szrj     }
703*38fd1498Szrj 
704*38fd1498Szrj   ref->prefetch_mod = PREFETCH_BLOCK / step;
705*38fd1498Szrj }
706*38fd1498Szrj 
707*38fd1498Szrj /* Divides X by BY, rounding down.  */
708*38fd1498Szrj 
709*38fd1498Szrj static HOST_WIDE_INT
ddown(HOST_WIDE_INT x,unsigned HOST_WIDE_INT by)710*38fd1498Szrj ddown (HOST_WIDE_INT x, unsigned HOST_WIDE_INT by)
711*38fd1498Szrj {
712*38fd1498Szrj   gcc_assert (by > 0);
713*38fd1498Szrj 
714*38fd1498Szrj   if (x >= 0)
715*38fd1498Szrj     return x / (HOST_WIDE_INT) by;
716*38fd1498Szrj   else
717*38fd1498Szrj     return (x + (HOST_WIDE_INT) by - 1) / (HOST_WIDE_INT) by;
718*38fd1498Szrj }
719*38fd1498Szrj 
720*38fd1498Szrj /* Given a CACHE_LINE_SIZE and two inductive memory references
721*38fd1498Szrj    with a common STEP greater than CACHE_LINE_SIZE and an address
722*38fd1498Szrj    difference DELTA, compute the probability that they will fall
723*38fd1498Szrj    in different cache lines.  Return true if the computed miss rate
724*38fd1498Szrj    is not greater than the ACCEPTABLE_MISS_RATE.  DISTINCT_ITERS is the
725*38fd1498Szrj    number of distinct iterations after which the pattern repeats itself.
726*38fd1498Szrj    ALIGN_UNIT is the unit of alignment in bytes.  */
727*38fd1498Szrj 
728*38fd1498Szrj static bool
is_miss_rate_acceptable(unsigned HOST_WIDE_INT cache_line_size,HOST_WIDE_INT step,HOST_WIDE_INT delta,unsigned HOST_WIDE_INT distinct_iters,int align_unit)729*38fd1498Szrj is_miss_rate_acceptable (unsigned HOST_WIDE_INT cache_line_size,
730*38fd1498Szrj 		   HOST_WIDE_INT step, HOST_WIDE_INT delta,
731*38fd1498Szrj 		   unsigned HOST_WIDE_INT distinct_iters,
732*38fd1498Szrj 		   int align_unit)
733*38fd1498Szrj {
734*38fd1498Szrj   unsigned align, iter;
735*38fd1498Szrj   int total_positions, miss_positions, max_allowed_miss_positions;
736*38fd1498Szrj   int address1, address2, cache_line1, cache_line2;
737*38fd1498Szrj 
738*38fd1498Szrj   /* It always misses if delta is greater than or equal to the cache
739*38fd1498Szrj      line size.  */
740*38fd1498Szrj   if (delta >= (HOST_WIDE_INT) cache_line_size)
741*38fd1498Szrj     return false;
742*38fd1498Szrj 
743*38fd1498Szrj   miss_positions = 0;
744*38fd1498Szrj   total_positions = (cache_line_size / align_unit) * distinct_iters;
745*38fd1498Szrj   max_allowed_miss_positions = (ACCEPTABLE_MISS_RATE * total_positions) / 1000;
746*38fd1498Szrj 
747*38fd1498Szrj   /* Iterate through all possible alignments of the first
748*38fd1498Szrj      memory reference within its cache line.  */
749*38fd1498Szrj   for (align = 0; align < cache_line_size; align += align_unit)
750*38fd1498Szrj 
751*38fd1498Szrj     /* Iterate through all distinct iterations.  */
752*38fd1498Szrj     for (iter = 0; iter < distinct_iters; iter++)
753*38fd1498Szrj       {
754*38fd1498Szrj 	address1 = align + step * iter;
755*38fd1498Szrj 	address2 = address1 + delta;
756*38fd1498Szrj 	cache_line1 = address1 / cache_line_size;
757*38fd1498Szrj 	cache_line2 = address2 / cache_line_size;
758*38fd1498Szrj 	if (cache_line1 != cache_line2)
759*38fd1498Szrj 	  {
760*38fd1498Szrj 	    miss_positions += 1;
761*38fd1498Szrj             if (miss_positions > max_allowed_miss_positions)
762*38fd1498Szrj 	      return false;
763*38fd1498Szrj           }
764*38fd1498Szrj       }
765*38fd1498Szrj   return true;
766*38fd1498Szrj }
767*38fd1498Szrj 
768*38fd1498Szrj /* Prune the prefetch candidate REF using the reuse with BY.
769*38fd1498Szrj    If BY_IS_BEFORE is true, BY is before REF in the loop.  */
770*38fd1498Szrj 
771*38fd1498Szrj static void
prune_ref_by_group_reuse(struct mem_ref * ref,struct mem_ref * by,bool by_is_before)772*38fd1498Szrj prune_ref_by_group_reuse (struct mem_ref *ref, struct mem_ref *by,
773*38fd1498Szrj 			  bool by_is_before)
774*38fd1498Szrj {
775*38fd1498Szrj   HOST_WIDE_INT step;
776*38fd1498Szrj   bool backward;
777*38fd1498Szrj   HOST_WIDE_INT delta_r = ref->delta, delta_b = by->delta;
778*38fd1498Szrj   HOST_WIDE_INT delta = delta_b - delta_r;
779*38fd1498Szrj   HOST_WIDE_INT hit_from;
780*38fd1498Szrj   unsigned HOST_WIDE_INT prefetch_before, prefetch_block;
781*38fd1498Szrj   HOST_WIDE_INT reduced_step;
782*38fd1498Szrj   unsigned HOST_WIDE_INT reduced_prefetch_block;
783*38fd1498Szrj   tree ref_type;
784*38fd1498Szrj   int align_unit;
785*38fd1498Szrj 
786*38fd1498Szrj   /* If the step is non constant we cannot calculate prefetch_before.  */
787*38fd1498Szrj   if (!cst_and_fits_in_hwi (ref->group->step)) {
788*38fd1498Szrj     return;
789*38fd1498Szrj   }
790*38fd1498Szrj 
791*38fd1498Szrj   step = int_cst_value (ref->group->step);
792*38fd1498Szrj 
793*38fd1498Szrj   backward = step < 0;
794*38fd1498Szrj 
795*38fd1498Szrj 
796*38fd1498Szrj   if (delta == 0)
797*38fd1498Szrj     {
798*38fd1498Szrj       /* If the references has the same address, only prefetch the
799*38fd1498Szrj 	 former.  */
800*38fd1498Szrj       if (by_is_before)
801*38fd1498Szrj 	ref->prefetch_before = 0;
802*38fd1498Szrj 
803*38fd1498Szrj       return;
804*38fd1498Szrj     }
805*38fd1498Szrj 
806*38fd1498Szrj   if (!step)
807*38fd1498Szrj     {
808*38fd1498Szrj       /* If the reference addresses are invariant and fall into the
809*38fd1498Szrj 	 same cache line, prefetch just the first one.  */
810*38fd1498Szrj       if (!by_is_before)
811*38fd1498Szrj 	return;
812*38fd1498Szrj 
813*38fd1498Szrj       if (ddown (ref->delta, PREFETCH_BLOCK)
814*38fd1498Szrj 	  != ddown (by->delta, PREFETCH_BLOCK))
815*38fd1498Szrj 	return;
816*38fd1498Szrj 
817*38fd1498Szrj       ref->prefetch_before = 0;
818*38fd1498Szrj       return;
819*38fd1498Szrj     }
820*38fd1498Szrj 
821*38fd1498Szrj   /* Only prune the reference that is behind in the array.  */
822*38fd1498Szrj   if (backward)
823*38fd1498Szrj     {
824*38fd1498Szrj       if (delta > 0)
825*38fd1498Szrj 	return;
826*38fd1498Szrj 
827*38fd1498Szrj       /* Transform the data so that we may assume that the accesses
828*38fd1498Szrj 	 are forward.  */
829*38fd1498Szrj       delta = - delta;
830*38fd1498Szrj       step = -step;
831*38fd1498Szrj       delta_r = PREFETCH_BLOCK - 1 - delta_r;
832*38fd1498Szrj       delta_b = PREFETCH_BLOCK - 1 - delta_b;
833*38fd1498Szrj     }
834*38fd1498Szrj   else
835*38fd1498Szrj     {
836*38fd1498Szrj       if (delta < 0)
837*38fd1498Szrj 	return;
838*38fd1498Szrj     }
839*38fd1498Szrj 
840*38fd1498Szrj   /* Check whether the two references are likely to hit the same cache
841*38fd1498Szrj      line, and how distant the iterations in that it occurs are from
842*38fd1498Szrj      each other.  */
843*38fd1498Szrj 
844*38fd1498Szrj   if (step <= PREFETCH_BLOCK)
845*38fd1498Szrj     {
846*38fd1498Szrj       /* The accesses are sure to meet.  Let us check when.  */
847*38fd1498Szrj       hit_from = ddown (delta_b, PREFETCH_BLOCK) * PREFETCH_BLOCK;
848*38fd1498Szrj       prefetch_before = (hit_from - delta_r + step - 1) / step;
849*38fd1498Szrj 
850*38fd1498Szrj       /* Do not reduce prefetch_before if we meet beyond cache size.  */
851*38fd1498Szrj       if (prefetch_before > absu_hwi (L2_CACHE_SIZE_BYTES / step))
852*38fd1498Szrj         prefetch_before = PREFETCH_ALL;
853*38fd1498Szrj       if (prefetch_before < ref->prefetch_before)
854*38fd1498Szrj 	ref->prefetch_before = prefetch_before;
855*38fd1498Szrj 
856*38fd1498Szrj       return;
857*38fd1498Szrj     }
858*38fd1498Szrj 
859*38fd1498Szrj   /* A more complicated case with step > prefetch_block.  First reduce
860*38fd1498Szrj      the ratio between the step and the cache line size to its simplest
861*38fd1498Szrj      terms.  The resulting denominator will then represent the number of
862*38fd1498Szrj      distinct iterations after which each address will go back to its
863*38fd1498Szrj      initial location within the cache line.  This computation assumes
864*38fd1498Szrj      that PREFETCH_BLOCK is a power of two.  */
865*38fd1498Szrj   prefetch_block = PREFETCH_BLOCK;
866*38fd1498Szrj   reduced_prefetch_block = prefetch_block;
867*38fd1498Szrj   reduced_step = step;
868*38fd1498Szrj   while ((reduced_step & 1) == 0
869*38fd1498Szrj 	 && reduced_prefetch_block > 1)
870*38fd1498Szrj     {
871*38fd1498Szrj       reduced_step >>= 1;
872*38fd1498Szrj       reduced_prefetch_block >>= 1;
873*38fd1498Szrj     }
874*38fd1498Szrj 
875*38fd1498Szrj   prefetch_before = delta / step;
876*38fd1498Szrj   delta %= step;
877*38fd1498Szrj   ref_type = TREE_TYPE (ref->mem);
878*38fd1498Szrj   align_unit = TYPE_ALIGN (ref_type) / 8;
879*38fd1498Szrj   if (is_miss_rate_acceptable (prefetch_block, step, delta,
880*38fd1498Szrj 			       reduced_prefetch_block, align_unit))
881*38fd1498Szrj     {
882*38fd1498Szrj       /* Do not reduce prefetch_before if we meet beyond cache size.  */
883*38fd1498Szrj       if (prefetch_before > L2_CACHE_SIZE_BYTES / PREFETCH_BLOCK)
884*38fd1498Szrj         prefetch_before = PREFETCH_ALL;
885*38fd1498Szrj       if (prefetch_before < ref->prefetch_before)
886*38fd1498Szrj 	ref->prefetch_before = prefetch_before;
887*38fd1498Szrj 
888*38fd1498Szrj       return;
889*38fd1498Szrj     }
890*38fd1498Szrj 
891*38fd1498Szrj   /* Try also the following iteration.  */
892*38fd1498Szrj   prefetch_before++;
893*38fd1498Szrj   delta = step - delta;
894*38fd1498Szrj   if (is_miss_rate_acceptable (prefetch_block, step, delta,
895*38fd1498Szrj 			       reduced_prefetch_block, align_unit))
896*38fd1498Szrj     {
897*38fd1498Szrj       if (prefetch_before < ref->prefetch_before)
898*38fd1498Szrj 	ref->prefetch_before = prefetch_before;
899*38fd1498Szrj 
900*38fd1498Szrj       return;
901*38fd1498Szrj     }
902*38fd1498Szrj 
903*38fd1498Szrj   /* The ref probably does not reuse by.  */
904*38fd1498Szrj   return;
905*38fd1498Szrj }
906*38fd1498Szrj 
907*38fd1498Szrj /* Prune the prefetch candidate REF using the reuses with other references
908*38fd1498Szrj    in REFS.  */
909*38fd1498Szrj 
910*38fd1498Szrj static void
prune_ref_by_reuse(struct mem_ref * ref,struct mem_ref * refs)911*38fd1498Szrj prune_ref_by_reuse (struct mem_ref *ref, struct mem_ref *refs)
912*38fd1498Szrj {
913*38fd1498Szrj   struct mem_ref *prune_by;
914*38fd1498Szrj   bool before = true;
915*38fd1498Szrj 
916*38fd1498Szrj   prune_ref_by_self_reuse (ref);
917*38fd1498Szrj 
918*38fd1498Szrj   for (prune_by = refs; prune_by; prune_by = prune_by->next)
919*38fd1498Szrj     {
920*38fd1498Szrj       if (prune_by == ref)
921*38fd1498Szrj 	{
922*38fd1498Szrj 	  before = false;
923*38fd1498Szrj 	  continue;
924*38fd1498Szrj 	}
925*38fd1498Szrj 
926*38fd1498Szrj       if (!WRITE_CAN_USE_READ_PREFETCH
927*38fd1498Szrj 	  && ref->write_p
928*38fd1498Szrj 	  && !prune_by->write_p)
929*38fd1498Szrj 	continue;
930*38fd1498Szrj       if (!READ_CAN_USE_WRITE_PREFETCH
931*38fd1498Szrj 	  && !ref->write_p
932*38fd1498Szrj 	  && prune_by->write_p)
933*38fd1498Szrj 	continue;
934*38fd1498Szrj 
935*38fd1498Szrj       prune_ref_by_group_reuse (ref, prune_by, before);
936*38fd1498Szrj     }
937*38fd1498Szrj }
938*38fd1498Szrj 
939*38fd1498Szrj /* Prune the prefetch candidates in GROUP using the reuse analysis.  */
940*38fd1498Szrj 
941*38fd1498Szrj static void
prune_group_by_reuse(struct mem_ref_group * group)942*38fd1498Szrj prune_group_by_reuse (struct mem_ref_group *group)
943*38fd1498Szrj {
944*38fd1498Szrj   struct mem_ref *ref_pruned;
945*38fd1498Szrj 
946*38fd1498Szrj   for (ref_pruned = group->refs; ref_pruned; ref_pruned = ref_pruned->next)
947*38fd1498Szrj     {
948*38fd1498Szrj       prune_ref_by_reuse (ref_pruned, group->refs);
949*38fd1498Szrj 
950*38fd1498Szrj       if (dump_file && (dump_flags & TDF_DETAILS))
951*38fd1498Szrj 	{
952*38fd1498Szrj 	  dump_mem_ref (dump_file, ref_pruned);
953*38fd1498Szrj 
954*38fd1498Szrj 	  if (ref_pruned->prefetch_before == PREFETCH_ALL
955*38fd1498Szrj 	      && ref_pruned->prefetch_mod == 1)
956*38fd1498Szrj 	    fprintf (dump_file, " no restrictions");
957*38fd1498Szrj 	  else if (ref_pruned->prefetch_before == 0)
958*38fd1498Szrj 	    fprintf (dump_file, " do not prefetch");
959*38fd1498Szrj 	  else if (ref_pruned->prefetch_before <= ref_pruned->prefetch_mod)
960*38fd1498Szrj 	    fprintf (dump_file, " prefetch once");
961*38fd1498Szrj 	  else
962*38fd1498Szrj 	    {
963*38fd1498Szrj 	      if (ref_pruned->prefetch_before != PREFETCH_ALL)
964*38fd1498Szrj 		{
965*38fd1498Szrj 		  fprintf (dump_file, " prefetch before ");
966*38fd1498Szrj 		  fprintf (dump_file, HOST_WIDE_INT_PRINT_DEC,
967*38fd1498Szrj 			   ref_pruned->prefetch_before);
968*38fd1498Szrj 		}
969*38fd1498Szrj 	      if (ref_pruned->prefetch_mod != 1)
970*38fd1498Szrj 		{
971*38fd1498Szrj 		  fprintf (dump_file, " prefetch mod ");
972*38fd1498Szrj 		  fprintf (dump_file, HOST_WIDE_INT_PRINT_DEC,
973*38fd1498Szrj 			   ref_pruned->prefetch_mod);
974*38fd1498Szrj 		}
975*38fd1498Szrj 	    }
976*38fd1498Szrj 	  fprintf (dump_file, "\n");
977*38fd1498Szrj 	}
978*38fd1498Szrj     }
979*38fd1498Szrj }
980*38fd1498Szrj 
981*38fd1498Szrj /* Prune the list of prefetch candidates GROUPS using the reuse analysis.  */
982*38fd1498Szrj 
983*38fd1498Szrj static void
prune_by_reuse(struct mem_ref_group * groups)984*38fd1498Szrj prune_by_reuse (struct mem_ref_group *groups)
985*38fd1498Szrj {
986*38fd1498Szrj   for (; groups; groups = groups->next)
987*38fd1498Szrj     prune_group_by_reuse (groups);
988*38fd1498Szrj }
989*38fd1498Szrj 
990*38fd1498Szrj /* Returns true if we should issue prefetch for REF.  */
991*38fd1498Szrj 
992*38fd1498Szrj static bool
should_issue_prefetch_p(struct mem_ref * ref)993*38fd1498Szrj should_issue_prefetch_p (struct mem_ref *ref)
994*38fd1498Szrj {
995*38fd1498Szrj   /* For now do not issue prefetches for only first few of the
996*38fd1498Szrj      iterations.  */
997*38fd1498Szrj   if (ref->prefetch_before != PREFETCH_ALL)
998*38fd1498Szrj     {
999*38fd1498Szrj       if (dump_file && (dump_flags & TDF_DETAILS))
1000*38fd1498Szrj         fprintf (dump_file, "Ignoring reference %u:%u due to prefetch_before\n",
1001*38fd1498Szrj 		 ref->group->uid, ref->uid);
1002*38fd1498Szrj       return false;
1003*38fd1498Szrj     }
1004*38fd1498Szrj 
1005*38fd1498Szrj   /* Do not prefetch nontemporal stores.  */
1006*38fd1498Szrj   if (ref->storent_p)
1007*38fd1498Szrj     {
1008*38fd1498Szrj       if (dump_file && (dump_flags & TDF_DETAILS))
1009*38fd1498Szrj         fprintf (dump_file, "Ignoring nontemporal store reference %u:%u\n", ref->group->uid, ref->uid);
1010*38fd1498Szrj       return false;
1011*38fd1498Szrj     }
1012*38fd1498Szrj 
1013*38fd1498Szrj   return true;
1014*38fd1498Szrj }
1015*38fd1498Szrj 
1016*38fd1498Szrj /* Decide which of the prefetch candidates in GROUPS to prefetch.
1017*38fd1498Szrj    AHEAD is the number of iterations to prefetch ahead (which corresponds
1018*38fd1498Szrj    to the number of simultaneous instances of one prefetch running at a
1019*38fd1498Szrj    time).  UNROLL_FACTOR is the factor by that the loop is going to be
1020*38fd1498Szrj    unrolled.  Returns true if there is anything to prefetch.  */
1021*38fd1498Szrj 
1022*38fd1498Szrj static bool
schedule_prefetches(struct mem_ref_group * groups,unsigned unroll_factor,unsigned ahead)1023*38fd1498Szrj schedule_prefetches (struct mem_ref_group *groups, unsigned unroll_factor,
1024*38fd1498Szrj 		     unsigned ahead)
1025*38fd1498Szrj {
1026*38fd1498Szrj   unsigned remaining_prefetch_slots, n_prefetches, prefetch_slots;
1027*38fd1498Szrj   unsigned slots_per_prefetch;
1028*38fd1498Szrj   struct mem_ref *ref;
1029*38fd1498Szrj   bool any = false;
1030*38fd1498Szrj 
1031*38fd1498Szrj   /* At most SIMULTANEOUS_PREFETCHES should be running at the same time.  */
1032*38fd1498Szrj   remaining_prefetch_slots = SIMULTANEOUS_PREFETCHES;
1033*38fd1498Szrj 
1034*38fd1498Szrj   /* The prefetch will run for AHEAD iterations of the original loop, i.e.,
1035*38fd1498Szrj      AHEAD / UNROLL_FACTOR iterations of the unrolled loop.  In each iteration,
1036*38fd1498Szrj      it will need a prefetch slot.  */
1037*38fd1498Szrj   slots_per_prefetch = (ahead + unroll_factor / 2) / unroll_factor;
1038*38fd1498Szrj   if (dump_file && (dump_flags & TDF_DETAILS))
1039*38fd1498Szrj     fprintf (dump_file, "Each prefetch instruction takes %u prefetch slots.\n",
1040*38fd1498Szrj 	     slots_per_prefetch);
1041*38fd1498Szrj 
1042*38fd1498Szrj   /* For now we just take memory references one by one and issue
1043*38fd1498Szrj      prefetches for as many as possible.  The groups are sorted
1044*38fd1498Szrj      starting with the largest step, since the references with
1045*38fd1498Szrj      large step are more likely to cause many cache misses.  */
1046*38fd1498Szrj 
1047*38fd1498Szrj   for (; groups; groups = groups->next)
1048*38fd1498Szrj     for (ref = groups->refs; ref; ref = ref->next)
1049*38fd1498Szrj       {
1050*38fd1498Szrj 	if (!should_issue_prefetch_p (ref))
1051*38fd1498Szrj 	  continue;
1052*38fd1498Szrj 
1053*38fd1498Szrj         /* The loop is far from being sufficiently unrolled for this
1054*38fd1498Szrj            prefetch.  Do not generate prefetch to avoid many redudant
1055*38fd1498Szrj            prefetches.  */
1056*38fd1498Szrj         if (ref->prefetch_mod / unroll_factor > PREFETCH_MOD_TO_UNROLL_FACTOR_RATIO)
1057*38fd1498Szrj           continue;
1058*38fd1498Szrj 
1059*38fd1498Szrj 	/* If we need to prefetch the reference each PREFETCH_MOD iterations,
1060*38fd1498Szrj 	   and we unroll the loop UNROLL_FACTOR times, we need to insert
1061*38fd1498Szrj 	   ceil (UNROLL_FACTOR / PREFETCH_MOD) instructions in each
1062*38fd1498Szrj 	   iteration.  */
1063*38fd1498Szrj 	n_prefetches = ((unroll_factor + ref->prefetch_mod - 1)
1064*38fd1498Szrj 			/ ref->prefetch_mod);
1065*38fd1498Szrj 	prefetch_slots = n_prefetches * slots_per_prefetch;
1066*38fd1498Szrj 
1067*38fd1498Szrj 	/* If more than half of the prefetches would be lost anyway, do not
1068*38fd1498Szrj 	   issue the prefetch.  */
1069*38fd1498Szrj 	if (2 * remaining_prefetch_slots < prefetch_slots)
1070*38fd1498Szrj 	  continue;
1071*38fd1498Szrj 
1072*38fd1498Szrj 	/* Stop prefetching if debug counter is activated.  */
1073*38fd1498Szrj 	if (!dbg_cnt (prefetch))
1074*38fd1498Szrj 	  continue;
1075*38fd1498Szrj 
1076*38fd1498Szrj 	ref->issue_prefetch_p = true;
1077*38fd1498Szrj 	if (dump_file && (dump_flags & TDF_DETAILS))
1078*38fd1498Szrj 	  fprintf (dump_file, "Decided to issue prefetch for reference %u:%u\n",
1079*38fd1498Szrj 		   ref->group->uid, ref->uid);
1080*38fd1498Szrj 
1081*38fd1498Szrj 	if (remaining_prefetch_slots <= prefetch_slots)
1082*38fd1498Szrj 	  return true;
1083*38fd1498Szrj 	remaining_prefetch_slots -= prefetch_slots;
1084*38fd1498Szrj 	any = true;
1085*38fd1498Szrj       }
1086*38fd1498Szrj 
1087*38fd1498Szrj   return any;
1088*38fd1498Szrj }
1089*38fd1498Szrj 
1090*38fd1498Szrj /* Return TRUE if no prefetch is going to be generated in the given
1091*38fd1498Szrj    GROUPS.  */
1092*38fd1498Szrj 
1093*38fd1498Szrj static bool
nothing_to_prefetch_p(struct mem_ref_group * groups)1094*38fd1498Szrj nothing_to_prefetch_p (struct mem_ref_group *groups)
1095*38fd1498Szrj {
1096*38fd1498Szrj   struct mem_ref *ref;
1097*38fd1498Szrj 
1098*38fd1498Szrj   for (; groups; groups = groups->next)
1099*38fd1498Szrj     for (ref = groups->refs; ref; ref = ref->next)
1100*38fd1498Szrj       if (should_issue_prefetch_p (ref))
1101*38fd1498Szrj 	return false;
1102*38fd1498Szrj 
1103*38fd1498Szrj   return true;
1104*38fd1498Szrj }
1105*38fd1498Szrj 
1106*38fd1498Szrj /* Estimate the number of prefetches in the given GROUPS.
1107*38fd1498Szrj    UNROLL_FACTOR is the factor by which LOOP was unrolled.  */
1108*38fd1498Szrj 
1109*38fd1498Szrj static int
estimate_prefetch_count(struct mem_ref_group * groups,unsigned unroll_factor)1110*38fd1498Szrj estimate_prefetch_count (struct mem_ref_group *groups, unsigned unroll_factor)
1111*38fd1498Szrj {
1112*38fd1498Szrj   struct mem_ref *ref;
1113*38fd1498Szrj   unsigned n_prefetches;
1114*38fd1498Szrj   int prefetch_count = 0;
1115*38fd1498Szrj 
1116*38fd1498Szrj   for (; groups; groups = groups->next)
1117*38fd1498Szrj     for (ref = groups->refs; ref; ref = ref->next)
1118*38fd1498Szrj       if (should_issue_prefetch_p (ref))
1119*38fd1498Szrj 	{
1120*38fd1498Szrj 	  n_prefetches = ((unroll_factor + ref->prefetch_mod - 1)
1121*38fd1498Szrj 			  / ref->prefetch_mod);
1122*38fd1498Szrj 	  prefetch_count += n_prefetches;
1123*38fd1498Szrj 	}
1124*38fd1498Szrj 
1125*38fd1498Szrj   return prefetch_count;
1126*38fd1498Szrj }
1127*38fd1498Szrj 
1128*38fd1498Szrj /* Issue prefetches for the reference REF into loop as decided before.
1129*38fd1498Szrj    HEAD is the number of iterations to prefetch ahead.  UNROLL_FACTOR
1130*38fd1498Szrj    is the factor by which LOOP was unrolled.  */
1131*38fd1498Szrj 
1132*38fd1498Szrj static void
issue_prefetch_ref(struct mem_ref * ref,unsigned unroll_factor,unsigned ahead)1133*38fd1498Szrj issue_prefetch_ref (struct mem_ref *ref, unsigned unroll_factor, unsigned ahead)
1134*38fd1498Szrj {
1135*38fd1498Szrj   HOST_WIDE_INT delta;
1136*38fd1498Szrj   tree addr, addr_base, write_p, local, forward;
1137*38fd1498Szrj   gcall *prefetch;
1138*38fd1498Szrj   gimple_stmt_iterator bsi;
1139*38fd1498Szrj   unsigned n_prefetches, ap;
1140*38fd1498Szrj   bool nontemporal = ref->reuse_distance >= L2_CACHE_SIZE_BYTES;
1141*38fd1498Szrj 
1142*38fd1498Szrj   if (dump_file && (dump_flags & TDF_DETAILS))
1143*38fd1498Szrj     fprintf (dump_file, "Issued%s prefetch for reference %u:%u.\n",
1144*38fd1498Szrj 	     nontemporal ? " nontemporal" : "",
1145*38fd1498Szrj 	     ref->group->uid, ref->uid);
1146*38fd1498Szrj 
1147*38fd1498Szrj   bsi = gsi_for_stmt (ref->stmt);
1148*38fd1498Szrj 
1149*38fd1498Szrj   n_prefetches = ((unroll_factor + ref->prefetch_mod - 1)
1150*38fd1498Szrj 		  / ref->prefetch_mod);
1151*38fd1498Szrj   addr_base = build_fold_addr_expr_with_type (ref->mem, ptr_type_node);
1152*38fd1498Szrj   addr_base = force_gimple_operand_gsi (&bsi, unshare_expr (addr_base),
1153*38fd1498Szrj 					true, NULL, true, GSI_SAME_STMT);
1154*38fd1498Szrj   write_p = ref->write_p ? integer_one_node : integer_zero_node;
1155*38fd1498Szrj   local = nontemporal ? integer_zero_node : integer_three_node;
1156*38fd1498Szrj 
1157*38fd1498Szrj   for (ap = 0; ap < n_prefetches; ap++)
1158*38fd1498Szrj     {
1159*38fd1498Szrj       if (cst_and_fits_in_hwi (ref->group->step))
1160*38fd1498Szrj         {
1161*38fd1498Szrj           /* Determine the address to prefetch.  */
1162*38fd1498Szrj           delta = (ahead + ap * ref->prefetch_mod) *
1163*38fd1498Szrj 		   int_cst_value (ref->group->step);
1164*38fd1498Szrj           addr = fold_build_pointer_plus_hwi (addr_base, delta);
1165*38fd1498Szrj           addr = force_gimple_operand_gsi (&bsi, unshare_expr (addr), true,
1166*38fd1498Szrj 					   NULL, true, GSI_SAME_STMT);
1167*38fd1498Szrj         }
1168*38fd1498Szrj       else
1169*38fd1498Szrj         {
1170*38fd1498Szrj           /* The step size is non-constant but loop-invariant.  We use the
1171*38fd1498Szrj              heuristic to simply prefetch ahead iterations ahead.  */
1172*38fd1498Szrj           forward = fold_build2 (MULT_EXPR, sizetype,
1173*38fd1498Szrj                                  fold_convert (sizetype, ref->group->step),
1174*38fd1498Szrj                                  fold_convert (sizetype, size_int (ahead)));
1175*38fd1498Szrj           addr = fold_build_pointer_plus (addr_base, forward);
1176*38fd1498Szrj           addr = force_gimple_operand_gsi (&bsi, unshare_expr (addr), true,
1177*38fd1498Szrj 					   NULL, true, GSI_SAME_STMT);
1178*38fd1498Szrj       }
1179*38fd1498Szrj 
1180*38fd1498Szrj       if (addr_base != addr
1181*38fd1498Szrj 	  && TREE_CODE (addr_base) == SSA_NAME
1182*38fd1498Szrj 	  && TREE_CODE (addr) == SSA_NAME)
1183*38fd1498Szrj 	{
1184*38fd1498Szrj 	  duplicate_ssa_name_ptr_info (addr, SSA_NAME_PTR_INFO (addr_base));
1185*38fd1498Szrj 	  /* As this isn't a plain copy we have to reset alignment
1186*38fd1498Szrj 	     information.  */
1187*38fd1498Szrj 	  if (SSA_NAME_PTR_INFO (addr))
1188*38fd1498Szrj 	    mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr));
1189*38fd1498Szrj 	}
1190*38fd1498Szrj 
1191*38fd1498Szrj       /* Create the prefetch instruction.  */
1192*38fd1498Szrj       prefetch = gimple_build_call (builtin_decl_explicit (BUILT_IN_PREFETCH),
1193*38fd1498Szrj 				    3, addr, write_p, local);
1194*38fd1498Szrj       gsi_insert_before (&bsi, prefetch, GSI_SAME_STMT);
1195*38fd1498Szrj     }
1196*38fd1498Szrj }
1197*38fd1498Szrj 
1198*38fd1498Szrj /* Issue prefetches for the references in GROUPS into loop as decided before.
1199*38fd1498Szrj    HEAD is the number of iterations to prefetch ahead.  UNROLL_FACTOR is the
1200*38fd1498Szrj    factor by that LOOP was unrolled.  */
1201*38fd1498Szrj 
1202*38fd1498Szrj static void
issue_prefetches(struct mem_ref_group * groups,unsigned unroll_factor,unsigned ahead)1203*38fd1498Szrj issue_prefetches (struct mem_ref_group *groups,
1204*38fd1498Szrj 		  unsigned unroll_factor, unsigned ahead)
1205*38fd1498Szrj {
1206*38fd1498Szrj   struct mem_ref *ref;
1207*38fd1498Szrj 
1208*38fd1498Szrj   for (; groups; groups = groups->next)
1209*38fd1498Szrj     for (ref = groups->refs; ref; ref = ref->next)
1210*38fd1498Szrj       if (ref->issue_prefetch_p)
1211*38fd1498Szrj 	issue_prefetch_ref (ref, unroll_factor, ahead);
1212*38fd1498Szrj }
1213*38fd1498Szrj 
1214*38fd1498Szrj /* Returns true if REF is a memory write for that a nontemporal store insn
1215*38fd1498Szrj    can be used.  */
1216*38fd1498Szrj 
1217*38fd1498Szrj static bool
nontemporal_store_p(struct mem_ref * ref)1218*38fd1498Szrj nontemporal_store_p (struct mem_ref *ref)
1219*38fd1498Szrj {
1220*38fd1498Szrj   machine_mode mode;
1221*38fd1498Szrj   enum insn_code code;
1222*38fd1498Szrj 
1223*38fd1498Szrj   /* REF must be a write that is not reused.  We require it to be independent
1224*38fd1498Szrj      on all other memory references in the loop, as the nontemporal stores may
1225*38fd1498Szrj      be reordered with respect to other memory references.  */
1226*38fd1498Szrj   if (!ref->write_p
1227*38fd1498Szrj       || !ref->independent_p
1228*38fd1498Szrj       || ref->reuse_distance < L2_CACHE_SIZE_BYTES)
1229*38fd1498Szrj     return false;
1230*38fd1498Szrj 
1231*38fd1498Szrj   /* Check that we have the storent instruction for the mode.  */
1232*38fd1498Szrj   mode = TYPE_MODE (TREE_TYPE (ref->mem));
1233*38fd1498Szrj   if (mode == BLKmode)
1234*38fd1498Szrj     return false;
1235*38fd1498Szrj 
1236*38fd1498Szrj   code = optab_handler (storent_optab, mode);
1237*38fd1498Szrj   return code != CODE_FOR_nothing;
1238*38fd1498Szrj }
1239*38fd1498Szrj 
1240*38fd1498Szrj /* If REF is a nontemporal store, we mark the corresponding modify statement
1241*38fd1498Szrj    and return true.  Otherwise, we return false.  */
1242*38fd1498Szrj 
1243*38fd1498Szrj static bool
mark_nontemporal_store(struct mem_ref * ref)1244*38fd1498Szrj mark_nontemporal_store (struct mem_ref *ref)
1245*38fd1498Szrj {
1246*38fd1498Szrj   if (!nontemporal_store_p (ref))
1247*38fd1498Szrj     return false;
1248*38fd1498Szrj 
1249*38fd1498Szrj   if (dump_file && (dump_flags & TDF_DETAILS))
1250*38fd1498Szrj     fprintf (dump_file, "Marked reference %u:%u as a nontemporal store.\n",
1251*38fd1498Szrj 	     ref->group->uid, ref->uid);
1252*38fd1498Szrj 
1253*38fd1498Szrj   gimple_assign_set_nontemporal_move (ref->stmt, true);
1254*38fd1498Szrj   ref->storent_p = true;
1255*38fd1498Szrj 
1256*38fd1498Szrj   return true;
1257*38fd1498Szrj }
1258*38fd1498Szrj 
1259*38fd1498Szrj /* Issue a memory fence instruction after LOOP.  */
1260*38fd1498Szrj 
1261*38fd1498Szrj static void
emit_mfence_after_loop(struct loop * loop)1262*38fd1498Szrj emit_mfence_after_loop (struct loop *loop)
1263*38fd1498Szrj {
1264*38fd1498Szrj   vec<edge> exits = get_loop_exit_edges (loop);
1265*38fd1498Szrj   edge exit;
1266*38fd1498Szrj   gcall *call;
1267*38fd1498Szrj   gimple_stmt_iterator bsi;
1268*38fd1498Szrj   unsigned i;
1269*38fd1498Szrj 
1270*38fd1498Szrj   FOR_EACH_VEC_ELT (exits, i, exit)
1271*38fd1498Szrj     {
1272*38fd1498Szrj       call = gimple_build_call (FENCE_FOLLOWING_MOVNT, 0);
1273*38fd1498Szrj 
1274*38fd1498Szrj       if (!single_pred_p (exit->dest)
1275*38fd1498Szrj 	  /* If possible, we prefer not to insert the fence on other paths
1276*38fd1498Szrj 	     in cfg.  */
1277*38fd1498Szrj 	  && !(exit->flags & EDGE_ABNORMAL))
1278*38fd1498Szrj 	split_loop_exit_edge (exit);
1279*38fd1498Szrj       bsi = gsi_after_labels (exit->dest);
1280*38fd1498Szrj 
1281*38fd1498Szrj       gsi_insert_before (&bsi, call, GSI_NEW_STMT);
1282*38fd1498Szrj     }
1283*38fd1498Szrj 
1284*38fd1498Szrj   exits.release ();
1285*38fd1498Szrj   update_ssa (TODO_update_ssa_only_virtuals);
1286*38fd1498Szrj }
1287*38fd1498Szrj 
1288*38fd1498Szrj /* Returns true if we can use storent in loop, false otherwise.  */
1289*38fd1498Szrj 
1290*38fd1498Szrj static bool
may_use_storent_in_loop_p(struct loop * loop)1291*38fd1498Szrj may_use_storent_in_loop_p (struct loop *loop)
1292*38fd1498Szrj {
1293*38fd1498Szrj   bool ret = true;
1294*38fd1498Szrj 
1295*38fd1498Szrj   if (loop->inner != NULL)
1296*38fd1498Szrj     return false;
1297*38fd1498Szrj 
1298*38fd1498Szrj   /* If we must issue a mfence insn after using storent, check that there
1299*38fd1498Szrj      is a suitable place for it at each of the loop exits.  */
1300*38fd1498Szrj   if (FENCE_FOLLOWING_MOVNT != NULL_TREE)
1301*38fd1498Szrj     {
1302*38fd1498Szrj       vec<edge> exits = get_loop_exit_edges (loop);
1303*38fd1498Szrj       unsigned i;
1304*38fd1498Szrj       edge exit;
1305*38fd1498Szrj 
1306*38fd1498Szrj       FOR_EACH_VEC_ELT (exits, i, exit)
1307*38fd1498Szrj 	if ((exit->flags & EDGE_ABNORMAL)
1308*38fd1498Szrj 	    && exit->dest == EXIT_BLOCK_PTR_FOR_FN (cfun))
1309*38fd1498Szrj 	  ret = false;
1310*38fd1498Szrj 
1311*38fd1498Szrj       exits.release ();
1312*38fd1498Szrj     }
1313*38fd1498Szrj 
1314*38fd1498Szrj   return ret;
1315*38fd1498Szrj }
1316*38fd1498Szrj 
1317*38fd1498Szrj /* Marks nontemporal stores in LOOP.  GROUPS contains the description of memory
1318*38fd1498Szrj    references in the loop.  */
1319*38fd1498Szrj 
1320*38fd1498Szrj static void
mark_nontemporal_stores(struct loop * loop,struct mem_ref_group * groups)1321*38fd1498Szrj mark_nontemporal_stores (struct loop *loop, struct mem_ref_group *groups)
1322*38fd1498Szrj {
1323*38fd1498Szrj   struct mem_ref *ref;
1324*38fd1498Szrj   bool any = false;
1325*38fd1498Szrj 
1326*38fd1498Szrj   if (!may_use_storent_in_loop_p (loop))
1327*38fd1498Szrj     return;
1328*38fd1498Szrj 
1329*38fd1498Szrj   for (; groups; groups = groups->next)
1330*38fd1498Szrj     for (ref = groups->refs; ref; ref = ref->next)
1331*38fd1498Szrj       any |= mark_nontemporal_store (ref);
1332*38fd1498Szrj 
1333*38fd1498Szrj   if (any && FENCE_FOLLOWING_MOVNT != NULL_TREE)
1334*38fd1498Szrj     emit_mfence_after_loop (loop);
1335*38fd1498Szrj }
1336*38fd1498Szrj 
1337*38fd1498Szrj /* Determines whether we can profitably unroll LOOP FACTOR times, and if
1338*38fd1498Szrj    this is the case, fill in DESC by the description of number of
1339*38fd1498Szrj    iterations.  */
1340*38fd1498Szrj 
1341*38fd1498Szrj static bool
should_unroll_loop_p(struct loop * loop,struct tree_niter_desc * desc,unsigned factor)1342*38fd1498Szrj should_unroll_loop_p (struct loop *loop, struct tree_niter_desc *desc,
1343*38fd1498Szrj 		      unsigned factor)
1344*38fd1498Szrj {
1345*38fd1498Szrj   if (!can_unroll_loop_p (loop, factor, desc))
1346*38fd1498Szrj     return false;
1347*38fd1498Szrj 
1348*38fd1498Szrj   /* We only consider loops without control flow for unrolling.  This is not
1349*38fd1498Szrj      a hard restriction -- tree_unroll_loop works with arbitrary loops
1350*38fd1498Szrj      as well; but the unrolling/prefetching is usually more profitable for
1351*38fd1498Szrj      loops consisting of a single basic block, and we want to limit the
1352*38fd1498Szrj      code growth.  */
1353*38fd1498Szrj   if (loop->num_nodes > 2)
1354*38fd1498Szrj     return false;
1355*38fd1498Szrj 
1356*38fd1498Szrj   return true;
1357*38fd1498Szrj }
1358*38fd1498Szrj 
1359*38fd1498Szrj /* Determine the coefficient by that unroll LOOP, from the information
1360*38fd1498Szrj    contained in the list of memory references REFS.  Description of
1361*38fd1498Szrj    number of iterations of LOOP is stored to DESC.  NINSNS is the number of
1362*38fd1498Szrj    insns of the LOOP.  EST_NITER is the estimated number of iterations of
1363*38fd1498Szrj    the loop, or -1 if no estimate is available.  */
1364*38fd1498Szrj 
1365*38fd1498Szrj static unsigned
determine_unroll_factor(struct loop * loop,struct mem_ref_group * refs,unsigned ninsns,struct tree_niter_desc * desc,HOST_WIDE_INT est_niter)1366*38fd1498Szrj determine_unroll_factor (struct loop *loop, struct mem_ref_group *refs,
1367*38fd1498Szrj 			 unsigned ninsns, struct tree_niter_desc *desc,
1368*38fd1498Szrj 			 HOST_WIDE_INT est_niter)
1369*38fd1498Szrj {
1370*38fd1498Szrj   unsigned upper_bound;
1371*38fd1498Szrj   unsigned nfactor, factor, mod_constraint;
1372*38fd1498Szrj   struct mem_ref_group *agp;
1373*38fd1498Szrj   struct mem_ref *ref;
1374*38fd1498Szrj 
1375*38fd1498Szrj   /* First check whether the loop is not too large to unroll.  We ignore
1376*38fd1498Szrj      PARAM_MAX_UNROLL_TIMES, because for small loops, it prevented us
1377*38fd1498Szrj      from unrolling them enough to make exactly one cache line covered by each
1378*38fd1498Szrj      iteration.  Also, the goal of PARAM_MAX_UNROLL_TIMES is to prevent
1379*38fd1498Szrj      us from unrolling the loops too many times in cases where we only expect
1380*38fd1498Szrj      gains from better scheduling and decreasing loop overhead, which is not
1381*38fd1498Szrj      the case here.  */
1382*38fd1498Szrj   upper_bound = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / ninsns;
1383*38fd1498Szrj 
1384*38fd1498Szrj   /* If we unrolled the loop more times than it iterates, the unrolled version
1385*38fd1498Szrj      of the loop would be never entered.  */
1386*38fd1498Szrj   if (est_niter >= 0 && est_niter < (HOST_WIDE_INT) upper_bound)
1387*38fd1498Szrj     upper_bound = est_niter;
1388*38fd1498Szrj 
1389*38fd1498Szrj   if (upper_bound <= 1)
1390*38fd1498Szrj     return 1;
1391*38fd1498Szrj 
1392*38fd1498Szrj   /* Choose the factor so that we may prefetch each cache just once,
1393*38fd1498Szrj      but bound the unrolling by UPPER_BOUND.  */
1394*38fd1498Szrj   factor = 1;
1395*38fd1498Szrj   for (agp = refs; agp; agp = agp->next)
1396*38fd1498Szrj     for (ref = agp->refs; ref; ref = ref->next)
1397*38fd1498Szrj       if (should_issue_prefetch_p (ref))
1398*38fd1498Szrj 	{
1399*38fd1498Szrj 	  mod_constraint = ref->prefetch_mod;
1400*38fd1498Szrj 	  nfactor = least_common_multiple (mod_constraint, factor);
1401*38fd1498Szrj 	  if (nfactor <= upper_bound)
1402*38fd1498Szrj 	    factor = nfactor;
1403*38fd1498Szrj 	}
1404*38fd1498Szrj 
1405*38fd1498Szrj   if (!should_unroll_loop_p (loop, desc, factor))
1406*38fd1498Szrj     return 1;
1407*38fd1498Szrj 
1408*38fd1498Szrj   return factor;
1409*38fd1498Szrj }
1410*38fd1498Szrj 
1411*38fd1498Szrj /* Returns the total volume of the memory references REFS, taking into account
1412*38fd1498Szrj    reuses in the innermost loop and cache line size.  TODO -- we should also
1413*38fd1498Szrj    take into account reuses across the iterations of the loops in the loop
1414*38fd1498Szrj    nest.  */
1415*38fd1498Szrj 
1416*38fd1498Szrj static unsigned
volume_of_references(struct mem_ref_group * refs)1417*38fd1498Szrj volume_of_references (struct mem_ref_group *refs)
1418*38fd1498Szrj {
1419*38fd1498Szrj   unsigned volume = 0;
1420*38fd1498Szrj   struct mem_ref_group *gr;
1421*38fd1498Szrj   struct mem_ref *ref;
1422*38fd1498Szrj 
1423*38fd1498Szrj   for (gr = refs; gr; gr = gr->next)
1424*38fd1498Szrj     for (ref = gr->refs; ref; ref = ref->next)
1425*38fd1498Szrj       {
1426*38fd1498Szrj 	/* Almost always reuses another value?  */
1427*38fd1498Szrj 	if (ref->prefetch_before != PREFETCH_ALL)
1428*38fd1498Szrj 	  continue;
1429*38fd1498Szrj 
1430*38fd1498Szrj 	/* If several iterations access the same cache line, use the size of
1431*38fd1498Szrj 	   the line divided by this number.  Otherwise, a cache line is
1432*38fd1498Szrj 	   accessed in each iteration.  TODO -- in the latter case, we should
1433*38fd1498Szrj 	   take the size of the reference into account, rounding it up on cache
1434*38fd1498Szrj 	   line size multiple.  */
1435*38fd1498Szrj 	volume += L1_CACHE_LINE_SIZE / ref->prefetch_mod;
1436*38fd1498Szrj       }
1437*38fd1498Szrj   return volume;
1438*38fd1498Szrj }
1439*38fd1498Szrj 
1440*38fd1498Szrj /* Returns the volume of memory references accessed across VEC iterations of
1441*38fd1498Szrj    loops, whose sizes are described in the LOOP_SIZES array.  N is the number
1442*38fd1498Szrj    of the loops in the nest (length of VEC and LOOP_SIZES vectors).  */
1443*38fd1498Szrj 
1444*38fd1498Szrj static unsigned
volume_of_dist_vector(lambda_vector vec,unsigned * loop_sizes,unsigned n)1445*38fd1498Szrj volume_of_dist_vector (lambda_vector vec, unsigned *loop_sizes, unsigned n)
1446*38fd1498Szrj {
1447*38fd1498Szrj   unsigned i;
1448*38fd1498Szrj 
1449*38fd1498Szrj   for (i = 0; i < n; i++)
1450*38fd1498Szrj     if (vec[i] != 0)
1451*38fd1498Szrj       break;
1452*38fd1498Szrj 
1453*38fd1498Szrj   if (i == n)
1454*38fd1498Szrj     return 0;
1455*38fd1498Szrj 
1456*38fd1498Szrj   gcc_assert (vec[i] > 0);
1457*38fd1498Szrj 
1458*38fd1498Szrj   /* We ignore the parts of the distance vector in subloops, since usually
1459*38fd1498Szrj      the numbers of iterations are much smaller.  */
1460*38fd1498Szrj   return loop_sizes[i] * vec[i];
1461*38fd1498Szrj }
1462*38fd1498Szrj 
1463*38fd1498Szrj /* Add the steps of ACCESS_FN multiplied by STRIDE to the array STRIDE
1464*38fd1498Szrj    at the position corresponding to the loop of the step.  N is the depth
1465*38fd1498Szrj    of the considered loop nest, and, LOOP is its innermost loop.  */
1466*38fd1498Szrj 
1467*38fd1498Szrj static void
add_subscript_strides(tree access_fn,unsigned stride,HOST_WIDE_INT * strides,unsigned n,struct loop * loop)1468*38fd1498Szrj add_subscript_strides (tree access_fn, unsigned stride,
1469*38fd1498Szrj 		       HOST_WIDE_INT *strides, unsigned n, struct loop *loop)
1470*38fd1498Szrj {
1471*38fd1498Szrj   struct loop *aloop;
1472*38fd1498Szrj   tree step;
1473*38fd1498Szrj   HOST_WIDE_INT astep;
1474*38fd1498Szrj   unsigned min_depth = loop_depth (loop) - n;
1475*38fd1498Szrj 
1476*38fd1498Szrj   while (TREE_CODE (access_fn) == POLYNOMIAL_CHREC)
1477*38fd1498Szrj     {
1478*38fd1498Szrj       aloop = get_chrec_loop (access_fn);
1479*38fd1498Szrj       step = CHREC_RIGHT (access_fn);
1480*38fd1498Szrj       access_fn = CHREC_LEFT (access_fn);
1481*38fd1498Szrj 
1482*38fd1498Szrj       if ((unsigned) loop_depth (aloop) <= min_depth)
1483*38fd1498Szrj 	continue;
1484*38fd1498Szrj 
1485*38fd1498Szrj       if (tree_fits_shwi_p (step))
1486*38fd1498Szrj 	astep = tree_to_shwi (step);
1487*38fd1498Szrj       else
1488*38fd1498Szrj 	astep = L1_CACHE_LINE_SIZE;
1489*38fd1498Szrj 
1490*38fd1498Szrj       strides[n - 1 - loop_depth (loop) + loop_depth (aloop)] += astep * stride;
1491*38fd1498Szrj 
1492*38fd1498Szrj     }
1493*38fd1498Szrj }
1494*38fd1498Szrj 
1495*38fd1498Szrj /* Returns the volume of memory references accessed between two consecutive
1496*38fd1498Szrj    self-reuses of the reference DR.  We consider the subscripts of DR in N
1497*38fd1498Szrj    loops, and LOOP_SIZES contains the volumes of accesses in each of the
1498*38fd1498Szrj    loops.  LOOP is the innermost loop of the current loop nest.  */
1499*38fd1498Szrj 
1500*38fd1498Szrj static unsigned
self_reuse_distance(data_reference_p dr,unsigned * loop_sizes,unsigned n,struct loop * loop)1501*38fd1498Szrj self_reuse_distance (data_reference_p dr, unsigned *loop_sizes, unsigned n,
1502*38fd1498Szrj 		     struct loop *loop)
1503*38fd1498Szrj {
1504*38fd1498Szrj   tree stride, access_fn;
1505*38fd1498Szrj   HOST_WIDE_INT *strides, astride;
1506*38fd1498Szrj   vec<tree> access_fns;
1507*38fd1498Szrj   tree ref = DR_REF (dr);
1508*38fd1498Szrj   unsigned i, ret = ~0u;
1509*38fd1498Szrj 
1510*38fd1498Szrj   /* In the following example:
1511*38fd1498Szrj 
1512*38fd1498Szrj      for (i = 0; i < N; i++)
1513*38fd1498Szrj        for (j = 0; j < N; j++)
1514*38fd1498Szrj          use (a[j][i]);
1515*38fd1498Szrj      the same cache line is accessed each N steps (except if the change from
1516*38fd1498Szrj      i to i + 1 crosses the boundary of the cache line).  Thus, for self-reuse,
1517*38fd1498Szrj      we cannot rely purely on the results of the data dependence analysis.
1518*38fd1498Szrj 
1519*38fd1498Szrj      Instead, we compute the stride of the reference in each loop, and consider
1520*38fd1498Szrj      the innermost loop in that the stride is less than cache size.  */
1521*38fd1498Szrj 
1522*38fd1498Szrj   strides = XCNEWVEC (HOST_WIDE_INT, n);
1523*38fd1498Szrj   access_fns = DR_ACCESS_FNS (dr);
1524*38fd1498Szrj 
1525*38fd1498Szrj   FOR_EACH_VEC_ELT (access_fns, i, access_fn)
1526*38fd1498Szrj     {
1527*38fd1498Szrj       /* Keep track of the reference corresponding to the subscript, so that we
1528*38fd1498Szrj 	 know its stride.  */
1529*38fd1498Szrj       while (handled_component_p (ref) && TREE_CODE (ref) != ARRAY_REF)
1530*38fd1498Szrj 	ref = TREE_OPERAND (ref, 0);
1531*38fd1498Szrj 
1532*38fd1498Szrj       if (TREE_CODE (ref) == ARRAY_REF)
1533*38fd1498Szrj 	{
1534*38fd1498Szrj 	  stride = TYPE_SIZE_UNIT (TREE_TYPE (ref));
1535*38fd1498Szrj 	  if (tree_fits_uhwi_p (stride))
1536*38fd1498Szrj 	    astride = tree_to_uhwi (stride);
1537*38fd1498Szrj 	  else
1538*38fd1498Szrj 	    astride = L1_CACHE_LINE_SIZE;
1539*38fd1498Szrj 
1540*38fd1498Szrj 	  ref = TREE_OPERAND (ref, 0);
1541*38fd1498Szrj 	}
1542*38fd1498Szrj       else
1543*38fd1498Szrj 	astride = 1;
1544*38fd1498Szrj 
1545*38fd1498Szrj       add_subscript_strides (access_fn, astride, strides, n, loop);
1546*38fd1498Szrj     }
1547*38fd1498Szrj 
1548*38fd1498Szrj   for (i = n; i-- > 0; )
1549*38fd1498Szrj     {
1550*38fd1498Szrj       unsigned HOST_WIDE_INT s;
1551*38fd1498Szrj 
1552*38fd1498Szrj       s = strides[i] < 0 ?  -strides[i] : strides[i];
1553*38fd1498Szrj 
1554*38fd1498Szrj       if (s < (unsigned) L1_CACHE_LINE_SIZE
1555*38fd1498Szrj 	  && (loop_sizes[i]
1556*38fd1498Szrj 	      > (unsigned) (L1_CACHE_SIZE_BYTES / NONTEMPORAL_FRACTION)))
1557*38fd1498Szrj 	{
1558*38fd1498Szrj 	  ret = loop_sizes[i];
1559*38fd1498Szrj 	  break;
1560*38fd1498Szrj 	}
1561*38fd1498Szrj     }
1562*38fd1498Szrj 
1563*38fd1498Szrj   free (strides);
1564*38fd1498Szrj   return ret;
1565*38fd1498Szrj }
1566*38fd1498Szrj 
1567*38fd1498Szrj /* Determines the distance till the first reuse of each reference in REFS
1568*38fd1498Szrj    in the loop nest of LOOP.  NO_OTHER_REFS is true if there are no other
1569*38fd1498Szrj    memory references in the loop.  Return false if the analysis fails.  */
1570*38fd1498Szrj 
1571*38fd1498Szrj static bool
determine_loop_nest_reuse(struct loop * loop,struct mem_ref_group * refs,bool no_other_refs)1572*38fd1498Szrj determine_loop_nest_reuse (struct loop *loop, struct mem_ref_group *refs,
1573*38fd1498Szrj 			   bool no_other_refs)
1574*38fd1498Szrj {
1575*38fd1498Szrj   struct loop *nest, *aloop;
1576*38fd1498Szrj   vec<data_reference_p> datarefs = vNULL;
1577*38fd1498Szrj   vec<ddr_p> dependences = vNULL;
1578*38fd1498Szrj   struct mem_ref_group *gr;
1579*38fd1498Szrj   struct mem_ref *ref, *refb;
1580*38fd1498Szrj   auto_vec<loop_p> vloops;
1581*38fd1498Szrj   unsigned *loop_data_size;
1582*38fd1498Szrj   unsigned i, j, n;
1583*38fd1498Szrj   unsigned volume, dist, adist;
1584*38fd1498Szrj   HOST_WIDE_INT vol;
1585*38fd1498Szrj   data_reference_p dr;
1586*38fd1498Szrj   ddr_p dep;
1587*38fd1498Szrj 
1588*38fd1498Szrj   if (loop->inner)
1589*38fd1498Szrj     return true;
1590*38fd1498Szrj 
1591*38fd1498Szrj   /* Find the outermost loop of the loop nest of loop (we require that
1592*38fd1498Szrj      there are no sibling loops inside the nest).  */
1593*38fd1498Szrj   nest = loop;
1594*38fd1498Szrj   while (1)
1595*38fd1498Szrj     {
1596*38fd1498Szrj       aloop = loop_outer (nest);
1597*38fd1498Szrj 
1598*38fd1498Szrj       if (aloop == current_loops->tree_root
1599*38fd1498Szrj 	  || aloop->inner->next)
1600*38fd1498Szrj 	break;
1601*38fd1498Szrj 
1602*38fd1498Szrj       nest = aloop;
1603*38fd1498Szrj     }
1604*38fd1498Szrj 
1605*38fd1498Szrj   /* For each loop, determine the amount of data accessed in each iteration.
1606*38fd1498Szrj      We use this to estimate whether the reference is evicted from the
1607*38fd1498Szrj      cache before its reuse.  */
1608*38fd1498Szrj   find_loop_nest (nest, &vloops);
1609*38fd1498Szrj   n = vloops.length ();
1610*38fd1498Szrj   loop_data_size = XNEWVEC (unsigned, n);
1611*38fd1498Szrj   volume = volume_of_references (refs);
1612*38fd1498Szrj   i = n;
1613*38fd1498Szrj   while (i-- != 0)
1614*38fd1498Szrj     {
1615*38fd1498Szrj       loop_data_size[i] = volume;
1616*38fd1498Szrj       /* Bound the volume by the L2 cache size, since above this bound,
1617*38fd1498Szrj 	 all dependence distances are equivalent.  */
1618*38fd1498Szrj       if (volume > L2_CACHE_SIZE_BYTES)
1619*38fd1498Szrj 	continue;
1620*38fd1498Szrj 
1621*38fd1498Szrj       aloop = vloops[i];
1622*38fd1498Szrj       vol = estimated_stmt_executions_int (aloop);
1623*38fd1498Szrj       if (vol == -1)
1624*38fd1498Szrj 	vol = expected_loop_iterations (aloop);
1625*38fd1498Szrj       volume *= vol;
1626*38fd1498Szrj     }
1627*38fd1498Szrj 
1628*38fd1498Szrj   /* Prepare the references in the form suitable for data dependence
1629*38fd1498Szrj      analysis.  We ignore unanalyzable data references (the results
1630*38fd1498Szrj      are used just as a heuristics to estimate temporality of the
1631*38fd1498Szrj      references, hence we do not need to worry about correctness).  */
1632*38fd1498Szrj   for (gr = refs; gr; gr = gr->next)
1633*38fd1498Szrj     for (ref = gr->refs; ref; ref = ref->next)
1634*38fd1498Szrj       {
1635*38fd1498Szrj 	dr = create_data_ref (loop_preheader_edge (nest),
1636*38fd1498Szrj 			      loop_containing_stmt (ref->stmt),
1637*38fd1498Szrj 			      ref->mem, ref->stmt, !ref->write_p, false);
1638*38fd1498Szrj 
1639*38fd1498Szrj 	if (dr)
1640*38fd1498Szrj 	  {
1641*38fd1498Szrj 	    ref->reuse_distance = volume;
1642*38fd1498Szrj 	    dr->aux = ref;
1643*38fd1498Szrj 	    datarefs.safe_push (dr);
1644*38fd1498Szrj 	  }
1645*38fd1498Szrj 	else
1646*38fd1498Szrj 	  no_other_refs = false;
1647*38fd1498Szrj       }
1648*38fd1498Szrj 
1649*38fd1498Szrj   FOR_EACH_VEC_ELT (datarefs, i, dr)
1650*38fd1498Szrj     {
1651*38fd1498Szrj       dist = self_reuse_distance (dr, loop_data_size, n, loop);
1652*38fd1498Szrj       ref = (struct mem_ref *) dr->aux;
1653*38fd1498Szrj       if (ref->reuse_distance > dist)
1654*38fd1498Szrj 	ref->reuse_distance = dist;
1655*38fd1498Szrj 
1656*38fd1498Szrj       if (no_other_refs)
1657*38fd1498Szrj 	ref->independent_p = true;
1658*38fd1498Szrj     }
1659*38fd1498Szrj 
1660*38fd1498Szrj   if (!compute_all_dependences (datarefs, &dependences, vloops, true))
1661*38fd1498Szrj     return false;
1662*38fd1498Szrj 
1663*38fd1498Szrj   FOR_EACH_VEC_ELT (dependences, i, dep)
1664*38fd1498Szrj     {
1665*38fd1498Szrj       if (DDR_ARE_DEPENDENT (dep) == chrec_known)
1666*38fd1498Szrj 	continue;
1667*38fd1498Szrj 
1668*38fd1498Szrj       ref = (struct mem_ref *) DDR_A (dep)->aux;
1669*38fd1498Szrj       refb = (struct mem_ref *) DDR_B (dep)->aux;
1670*38fd1498Szrj 
1671*38fd1498Szrj       if (DDR_ARE_DEPENDENT (dep) == chrec_dont_know
1672*38fd1498Szrj 	  || DDR_COULD_BE_INDEPENDENT_P (dep)
1673*38fd1498Szrj 	  || DDR_NUM_DIST_VECTS (dep) == 0)
1674*38fd1498Szrj 	{
1675*38fd1498Szrj 	  /* If the dependence cannot be analyzed, assume that there might be
1676*38fd1498Szrj 	     a reuse.  */
1677*38fd1498Szrj 	  dist = 0;
1678*38fd1498Szrj 
1679*38fd1498Szrj 	  ref->independent_p = false;
1680*38fd1498Szrj 	  refb->independent_p = false;
1681*38fd1498Szrj 	}
1682*38fd1498Szrj       else
1683*38fd1498Szrj 	{
1684*38fd1498Szrj 	  /* The distance vectors are normalized to be always lexicographically
1685*38fd1498Szrj 	     positive, hence we cannot tell just from them whether DDR_A comes
1686*38fd1498Szrj 	     before DDR_B or vice versa.  However, it is not important,
1687*38fd1498Szrj 	     anyway -- if DDR_A is close to DDR_B, then it is either reused in
1688*38fd1498Szrj 	     DDR_B (and it is not nontemporal), or it reuses the value of DDR_B
1689*38fd1498Szrj 	     in cache (and marking it as nontemporal would not affect
1690*38fd1498Szrj 	     anything).  */
1691*38fd1498Szrj 
1692*38fd1498Szrj 	  dist = volume;
1693*38fd1498Szrj 	  for (j = 0; j < DDR_NUM_DIST_VECTS (dep); j++)
1694*38fd1498Szrj 	    {
1695*38fd1498Szrj 	      adist = volume_of_dist_vector (DDR_DIST_VECT (dep, j),
1696*38fd1498Szrj 					     loop_data_size, n);
1697*38fd1498Szrj 
1698*38fd1498Szrj 	      /* If this is a dependence in the innermost loop (i.e., the
1699*38fd1498Szrj 		 distances in all superloops are zero) and it is not
1700*38fd1498Szrj 		 the trivial self-dependence with distance zero, record that
1701*38fd1498Szrj 		 the references are not completely independent.  */
1702*38fd1498Szrj 	      if (lambda_vector_zerop (DDR_DIST_VECT (dep, j), n - 1)
1703*38fd1498Szrj 		  && (ref != refb
1704*38fd1498Szrj 		      || DDR_DIST_VECT (dep, j)[n-1] != 0))
1705*38fd1498Szrj 		{
1706*38fd1498Szrj 		  ref->independent_p = false;
1707*38fd1498Szrj 		  refb->independent_p = false;
1708*38fd1498Szrj 		}
1709*38fd1498Szrj 
1710*38fd1498Szrj 	      /* Ignore accesses closer than
1711*38fd1498Szrj 		 L1_CACHE_SIZE_BYTES / NONTEMPORAL_FRACTION,
1712*38fd1498Szrj 	      	 so that we use nontemporal prefetches e.g. if single memory
1713*38fd1498Szrj 		 location is accessed several times in a single iteration of
1714*38fd1498Szrj 		 the loop.  */
1715*38fd1498Szrj 	      if (adist < L1_CACHE_SIZE_BYTES / NONTEMPORAL_FRACTION)
1716*38fd1498Szrj 		continue;
1717*38fd1498Szrj 
1718*38fd1498Szrj 	      if (adist < dist)
1719*38fd1498Szrj 		dist = adist;
1720*38fd1498Szrj 	    }
1721*38fd1498Szrj 	}
1722*38fd1498Szrj 
1723*38fd1498Szrj       if (ref->reuse_distance > dist)
1724*38fd1498Szrj 	ref->reuse_distance = dist;
1725*38fd1498Szrj       if (refb->reuse_distance > dist)
1726*38fd1498Szrj 	refb->reuse_distance = dist;
1727*38fd1498Szrj     }
1728*38fd1498Szrj 
1729*38fd1498Szrj   free_dependence_relations (dependences);
1730*38fd1498Szrj   free_data_refs (datarefs);
1731*38fd1498Szrj   free (loop_data_size);
1732*38fd1498Szrj 
1733*38fd1498Szrj   if (dump_file && (dump_flags & TDF_DETAILS))
1734*38fd1498Szrj     {
1735*38fd1498Szrj       fprintf (dump_file, "Reuse distances:\n");
1736*38fd1498Szrj       for (gr = refs; gr; gr = gr->next)
1737*38fd1498Szrj 	for (ref = gr->refs; ref; ref = ref->next)
1738*38fd1498Szrj 	  fprintf (dump_file, " reference %u:%u distance %u\n",
1739*38fd1498Szrj 		   ref->group->uid, ref->uid, ref->reuse_distance);
1740*38fd1498Szrj     }
1741*38fd1498Szrj 
1742*38fd1498Szrj   return true;
1743*38fd1498Szrj }
1744*38fd1498Szrj 
1745*38fd1498Szrj /* Determine whether or not the trip count to ahead ratio is too small based
1746*38fd1498Szrj    on prefitablility consideration.
1747*38fd1498Szrj    AHEAD: the iteration ahead distance,
1748*38fd1498Szrj    EST_NITER: the estimated trip count.  */
1749*38fd1498Szrj 
1750*38fd1498Szrj static bool
trip_count_to_ahead_ratio_too_small_p(unsigned ahead,HOST_WIDE_INT est_niter)1751*38fd1498Szrj trip_count_to_ahead_ratio_too_small_p (unsigned ahead, HOST_WIDE_INT est_niter)
1752*38fd1498Szrj {
1753*38fd1498Szrj   /* Assume trip count to ahead ratio is big enough if the trip count could not
1754*38fd1498Szrj      be estimated at compile time.  */
1755*38fd1498Szrj   if (est_niter < 0)
1756*38fd1498Szrj     return false;
1757*38fd1498Szrj 
1758*38fd1498Szrj   if (est_niter < (HOST_WIDE_INT) (TRIP_COUNT_TO_AHEAD_RATIO * ahead))
1759*38fd1498Szrj     {
1760*38fd1498Szrj       if (dump_file && (dump_flags & TDF_DETAILS))
1761*38fd1498Szrj 	fprintf (dump_file,
1762*38fd1498Szrj 		 "Not prefetching -- loop estimated to roll only %d times\n",
1763*38fd1498Szrj 		 (int) est_niter);
1764*38fd1498Szrj       return true;
1765*38fd1498Szrj     }
1766*38fd1498Szrj 
1767*38fd1498Szrj   return false;
1768*38fd1498Szrj }
1769*38fd1498Szrj 
1770*38fd1498Szrj /* Determine whether or not the number of memory references in the loop is
1771*38fd1498Szrj    reasonable based on the profitablity and compilation time considerations.
1772*38fd1498Szrj    NINSNS: estimated number of instructions in the loop,
1773*38fd1498Szrj    MEM_REF_COUNT: total number of memory references in the loop.  */
1774*38fd1498Szrj 
1775*38fd1498Szrj static bool
mem_ref_count_reasonable_p(unsigned ninsns,unsigned mem_ref_count)1776*38fd1498Szrj mem_ref_count_reasonable_p (unsigned ninsns, unsigned mem_ref_count)
1777*38fd1498Szrj {
1778*38fd1498Szrj   int insn_to_mem_ratio;
1779*38fd1498Szrj 
1780*38fd1498Szrj   if (mem_ref_count == 0)
1781*38fd1498Szrj     return false;
1782*38fd1498Szrj 
1783*38fd1498Szrj   /* Miss rate computation (is_miss_rate_acceptable) and dependence analysis
1784*38fd1498Szrj      (compute_all_dependences) have high costs based on quadratic complexity.
1785*38fd1498Szrj      To avoid huge compilation time, we give up prefetching if mem_ref_count
1786*38fd1498Szrj      is too large.  */
1787*38fd1498Szrj   if (mem_ref_count > PREFETCH_MAX_MEM_REFS_PER_LOOP)
1788*38fd1498Szrj     return false;
1789*38fd1498Szrj 
1790*38fd1498Szrj   /* Prefetching improves performance by overlapping cache missing
1791*38fd1498Szrj      memory accesses with CPU operations.  If the loop does not have
1792*38fd1498Szrj      enough CPU operations to overlap with memory operations, prefetching
1793*38fd1498Szrj      won't give a significant benefit.  One approximate way of checking
1794*38fd1498Szrj      this is to require the ratio of instructions to memory references to
1795*38fd1498Szrj      be above a certain limit.  This approximation works well in practice.
1796*38fd1498Szrj      TODO: Implement a more precise computation by estimating the time
1797*38fd1498Szrj      for each CPU or memory op in the loop. Time estimates for memory ops
1798*38fd1498Szrj      should account for cache misses.  */
1799*38fd1498Szrj   insn_to_mem_ratio = ninsns / mem_ref_count;
1800*38fd1498Szrj 
1801*38fd1498Szrj   if (insn_to_mem_ratio < PREFETCH_MIN_INSN_TO_MEM_RATIO)
1802*38fd1498Szrj     {
1803*38fd1498Szrj       if (dump_file && (dump_flags & TDF_DETAILS))
1804*38fd1498Szrj         fprintf (dump_file,
1805*38fd1498Szrj 		 "Not prefetching -- instruction to memory reference ratio (%d) too small\n",
1806*38fd1498Szrj 		 insn_to_mem_ratio);
1807*38fd1498Szrj       return false;
1808*38fd1498Szrj     }
1809*38fd1498Szrj 
1810*38fd1498Szrj   return true;
1811*38fd1498Szrj }
1812*38fd1498Szrj 
1813*38fd1498Szrj /* Determine whether or not the instruction to prefetch ratio in the loop is
1814*38fd1498Szrj    too small based on the profitablity consideration.
1815*38fd1498Szrj    NINSNS: estimated number of instructions in the loop,
1816*38fd1498Szrj    PREFETCH_COUNT: an estimate of the number of prefetches,
1817*38fd1498Szrj    UNROLL_FACTOR:  the factor to unroll the loop if prefetching.  */
1818*38fd1498Szrj 
1819*38fd1498Szrj static bool
insn_to_prefetch_ratio_too_small_p(unsigned ninsns,unsigned prefetch_count,unsigned unroll_factor)1820*38fd1498Szrj insn_to_prefetch_ratio_too_small_p (unsigned ninsns, unsigned prefetch_count,
1821*38fd1498Szrj                                      unsigned unroll_factor)
1822*38fd1498Szrj {
1823*38fd1498Szrj   int insn_to_prefetch_ratio;
1824*38fd1498Szrj 
1825*38fd1498Szrj   /* Prefetching most likely causes performance degradation when the instruction
1826*38fd1498Szrj      to prefetch ratio is too small.  Too many prefetch instructions in a loop
1827*38fd1498Szrj      may reduce the I-cache performance.
1828*38fd1498Szrj      (unroll_factor * ninsns) is used to estimate the number of instructions in
1829*38fd1498Szrj      the unrolled loop.  This implementation is a bit simplistic -- the number
1830*38fd1498Szrj      of issued prefetch instructions is also affected by unrolling.  So,
1831*38fd1498Szrj      prefetch_mod and the unroll factor should be taken into account when
1832*38fd1498Szrj      determining prefetch_count.  Also, the number of insns of the unrolled
1833*38fd1498Szrj      loop will usually be significantly smaller than the number of insns of the
1834*38fd1498Szrj      original loop * unroll_factor (at least the induction variable increases
1835*38fd1498Szrj      and the exit branches will get eliminated), so it might be better to use
1836*38fd1498Szrj      tree_estimate_loop_size + estimated_unrolled_size.  */
1837*38fd1498Szrj   insn_to_prefetch_ratio = (unroll_factor * ninsns) / prefetch_count;
1838*38fd1498Szrj   if (insn_to_prefetch_ratio < MIN_INSN_TO_PREFETCH_RATIO)
1839*38fd1498Szrj     {
1840*38fd1498Szrj       if (dump_file && (dump_flags & TDF_DETAILS))
1841*38fd1498Szrj         fprintf (dump_file,
1842*38fd1498Szrj 		 "Not prefetching -- instruction to prefetch ratio (%d) too small\n",
1843*38fd1498Szrj 		 insn_to_prefetch_ratio);
1844*38fd1498Szrj       return true;
1845*38fd1498Szrj     }
1846*38fd1498Szrj 
1847*38fd1498Szrj   return false;
1848*38fd1498Szrj }
1849*38fd1498Szrj 
1850*38fd1498Szrj 
1851*38fd1498Szrj /* Issue prefetch instructions for array references in LOOP.  Returns
1852*38fd1498Szrj    true if the LOOP was unrolled.  */
1853*38fd1498Szrj 
1854*38fd1498Szrj static bool
loop_prefetch_arrays(struct loop * loop)1855*38fd1498Szrj loop_prefetch_arrays (struct loop *loop)
1856*38fd1498Szrj {
1857*38fd1498Szrj   struct mem_ref_group *refs;
1858*38fd1498Szrj   unsigned ahead, ninsns, time, unroll_factor;
1859*38fd1498Szrj   HOST_WIDE_INT est_niter;
1860*38fd1498Szrj   struct tree_niter_desc desc;
1861*38fd1498Szrj   bool unrolled = false, no_other_refs;
1862*38fd1498Szrj   unsigned prefetch_count;
1863*38fd1498Szrj   unsigned mem_ref_count;
1864*38fd1498Szrj 
1865*38fd1498Szrj   if (optimize_loop_nest_for_size_p (loop))
1866*38fd1498Szrj     {
1867*38fd1498Szrj       if (dump_file && (dump_flags & TDF_DETAILS))
1868*38fd1498Szrj 	fprintf (dump_file, "  ignored (cold area)\n");
1869*38fd1498Szrj       return false;
1870*38fd1498Szrj     }
1871*38fd1498Szrj 
1872*38fd1498Szrj   /* FIXME: the time should be weighted by the probabilities of the blocks in
1873*38fd1498Szrj      the loop body.  */
1874*38fd1498Szrj   time = tree_num_loop_insns (loop, &eni_time_weights);
1875*38fd1498Szrj   if (time == 0)
1876*38fd1498Szrj     return false;
1877*38fd1498Szrj 
1878*38fd1498Szrj   ahead = (PREFETCH_LATENCY + time - 1) / time;
1879*38fd1498Szrj   est_niter = estimated_stmt_executions_int (loop);
1880*38fd1498Szrj   if (est_niter == -1)
1881*38fd1498Szrj     est_niter = likely_max_stmt_executions_int (loop);
1882*38fd1498Szrj 
1883*38fd1498Szrj   /* Prefetching is not likely to be profitable if the trip count to ahead
1884*38fd1498Szrj      ratio is too small.  */
1885*38fd1498Szrj   if (trip_count_to_ahead_ratio_too_small_p (ahead, est_niter))
1886*38fd1498Szrj     return false;
1887*38fd1498Szrj 
1888*38fd1498Szrj   ninsns = tree_num_loop_insns (loop, &eni_size_weights);
1889*38fd1498Szrj 
1890*38fd1498Szrj   /* Step 1: gather the memory references.  */
1891*38fd1498Szrj   refs = gather_memory_references (loop, &no_other_refs, &mem_ref_count);
1892*38fd1498Szrj 
1893*38fd1498Szrj   /* Give up prefetching if the number of memory references in the
1894*38fd1498Szrj      loop is not reasonable based on profitablity and compilation time
1895*38fd1498Szrj      considerations.  */
1896*38fd1498Szrj   if (!mem_ref_count_reasonable_p (ninsns, mem_ref_count))
1897*38fd1498Szrj     goto fail;
1898*38fd1498Szrj 
1899*38fd1498Szrj   /* Step 2: estimate the reuse effects.  */
1900*38fd1498Szrj   prune_by_reuse (refs);
1901*38fd1498Szrj 
1902*38fd1498Szrj   if (nothing_to_prefetch_p (refs))
1903*38fd1498Szrj     goto fail;
1904*38fd1498Szrj 
1905*38fd1498Szrj   if (!determine_loop_nest_reuse (loop, refs, no_other_refs))
1906*38fd1498Szrj     goto fail;
1907*38fd1498Szrj 
1908*38fd1498Szrj   /* Step 3: determine unroll factor.  */
1909*38fd1498Szrj   unroll_factor = determine_unroll_factor (loop, refs, ninsns, &desc,
1910*38fd1498Szrj 					   est_niter);
1911*38fd1498Szrj 
1912*38fd1498Szrj   /* Estimate prefetch count for the unrolled loop.  */
1913*38fd1498Szrj   prefetch_count = estimate_prefetch_count (refs, unroll_factor);
1914*38fd1498Szrj   if (prefetch_count == 0)
1915*38fd1498Szrj     goto fail;
1916*38fd1498Szrj 
1917*38fd1498Szrj   if (dump_file && (dump_flags & TDF_DETAILS))
1918*38fd1498Szrj     fprintf (dump_file, "Ahead %d, unroll factor %d, trip count "
1919*38fd1498Szrj 	     HOST_WIDE_INT_PRINT_DEC "\n"
1920*38fd1498Szrj 	     "insn count %d, mem ref count %d, prefetch count %d\n",
1921*38fd1498Szrj 	     ahead, unroll_factor, est_niter,
1922*38fd1498Szrj 	     ninsns, mem_ref_count, prefetch_count);
1923*38fd1498Szrj 
1924*38fd1498Szrj   /* Prefetching is not likely to be profitable if the instruction to prefetch
1925*38fd1498Szrj      ratio is too small.  */
1926*38fd1498Szrj   if (insn_to_prefetch_ratio_too_small_p (ninsns, prefetch_count,
1927*38fd1498Szrj 					  unroll_factor))
1928*38fd1498Szrj     goto fail;
1929*38fd1498Szrj 
1930*38fd1498Szrj   mark_nontemporal_stores (loop, refs);
1931*38fd1498Szrj 
1932*38fd1498Szrj   /* Step 4: what to prefetch?  */
1933*38fd1498Szrj   if (!schedule_prefetches (refs, unroll_factor, ahead))
1934*38fd1498Szrj     goto fail;
1935*38fd1498Szrj 
1936*38fd1498Szrj   /* Step 5: unroll the loop.  TODO -- peeling of first and last few
1937*38fd1498Szrj      iterations so that we do not issue superfluous prefetches.  */
1938*38fd1498Szrj   if (unroll_factor != 1)
1939*38fd1498Szrj     {
1940*38fd1498Szrj       tree_unroll_loop (loop, unroll_factor,
1941*38fd1498Szrj 			single_dom_exit (loop), &desc);
1942*38fd1498Szrj       unrolled = true;
1943*38fd1498Szrj     }
1944*38fd1498Szrj 
1945*38fd1498Szrj   /* Step 6: issue the prefetches.  */
1946*38fd1498Szrj   issue_prefetches (refs, unroll_factor, ahead);
1947*38fd1498Szrj 
1948*38fd1498Szrj fail:
1949*38fd1498Szrj   release_mem_refs (refs);
1950*38fd1498Szrj   return unrolled;
1951*38fd1498Szrj }
1952*38fd1498Szrj 
1953*38fd1498Szrj /* Issue prefetch instructions for array references in loops.  */
1954*38fd1498Szrj 
1955*38fd1498Szrj unsigned int
tree_ssa_prefetch_arrays(void)1956*38fd1498Szrj tree_ssa_prefetch_arrays (void)
1957*38fd1498Szrj {
1958*38fd1498Szrj   struct loop *loop;
1959*38fd1498Szrj   bool unrolled = false;
1960*38fd1498Szrj   int todo_flags = 0;
1961*38fd1498Szrj 
1962*38fd1498Szrj   if (!targetm.have_prefetch ()
1963*38fd1498Szrj       /* It is possible to ask compiler for say -mtune=i486 -march=pentium4.
1964*38fd1498Szrj 	 -mtune=i486 causes us having PREFETCH_BLOCK 0, since this is part
1965*38fd1498Szrj 	 of processor costs and i486 does not have prefetch, but
1966*38fd1498Szrj 	 -march=pentium4 causes targetm.have_prefetch to be true.  Ugh.  */
1967*38fd1498Szrj       || PREFETCH_BLOCK == 0)
1968*38fd1498Szrj     return 0;
1969*38fd1498Szrj 
1970*38fd1498Szrj   if (dump_file && (dump_flags & TDF_DETAILS))
1971*38fd1498Szrj     {
1972*38fd1498Szrj       fprintf (dump_file, "Prefetching parameters:\n");
1973*38fd1498Szrj       fprintf (dump_file, "    simultaneous prefetches: %d\n",
1974*38fd1498Szrj 	       SIMULTANEOUS_PREFETCHES);
1975*38fd1498Szrj       fprintf (dump_file, "    prefetch latency: %d\n", PREFETCH_LATENCY);
1976*38fd1498Szrj       fprintf (dump_file, "    prefetch block size: %d\n", PREFETCH_BLOCK);
1977*38fd1498Szrj       fprintf (dump_file, "    L1 cache size: %d lines, %d kB\n",
1978*38fd1498Szrj 	       L1_CACHE_SIZE_BYTES / L1_CACHE_LINE_SIZE, L1_CACHE_SIZE);
1979*38fd1498Szrj       fprintf (dump_file, "    L1 cache line size: %d\n", L1_CACHE_LINE_SIZE);
1980*38fd1498Szrj       fprintf (dump_file, "    L2 cache size: %d kB\n", L2_CACHE_SIZE);
1981*38fd1498Szrj       fprintf (dump_file, "    min insn-to-prefetch ratio: %d \n",
1982*38fd1498Szrj 	       MIN_INSN_TO_PREFETCH_RATIO);
1983*38fd1498Szrj       fprintf (dump_file, "    min insn-to-mem ratio: %d \n",
1984*38fd1498Szrj 	       PREFETCH_MIN_INSN_TO_MEM_RATIO);
1985*38fd1498Szrj       fprintf (dump_file, "\n");
1986*38fd1498Szrj     }
1987*38fd1498Szrj 
1988*38fd1498Szrj   initialize_original_copy_tables ();
1989*38fd1498Szrj 
1990*38fd1498Szrj   if (!builtin_decl_explicit_p (BUILT_IN_PREFETCH))
1991*38fd1498Szrj     {
1992*38fd1498Szrj       tree type = build_function_type_list (void_type_node,
1993*38fd1498Szrj 					    const_ptr_type_node, NULL_TREE);
1994*38fd1498Szrj       tree decl = add_builtin_function ("__builtin_prefetch", type,
1995*38fd1498Szrj 					BUILT_IN_PREFETCH, BUILT_IN_NORMAL,
1996*38fd1498Szrj 					NULL, NULL_TREE);
1997*38fd1498Szrj       DECL_IS_NOVOPS (decl) = true;
1998*38fd1498Szrj       set_builtin_decl (BUILT_IN_PREFETCH, decl, false);
1999*38fd1498Szrj     }
2000*38fd1498Szrj 
2001*38fd1498Szrj   FOR_EACH_LOOP (loop, LI_FROM_INNERMOST)
2002*38fd1498Szrj     {
2003*38fd1498Szrj       if (dump_file && (dump_flags & TDF_DETAILS))
2004*38fd1498Szrj 	fprintf (dump_file, "Processing loop %d:\n", loop->num);
2005*38fd1498Szrj 
2006*38fd1498Szrj       unrolled |= loop_prefetch_arrays (loop);
2007*38fd1498Szrj 
2008*38fd1498Szrj       if (dump_file && (dump_flags & TDF_DETAILS))
2009*38fd1498Szrj 	fprintf (dump_file, "\n\n");
2010*38fd1498Szrj     }
2011*38fd1498Szrj 
2012*38fd1498Szrj   if (unrolled)
2013*38fd1498Szrj     {
2014*38fd1498Szrj       scev_reset ();
2015*38fd1498Szrj       todo_flags |= TODO_cleanup_cfg;
2016*38fd1498Szrj     }
2017*38fd1498Szrj 
2018*38fd1498Szrj   free_original_copy_tables ();
2019*38fd1498Szrj   return todo_flags;
2020*38fd1498Szrj }
2021*38fd1498Szrj 
2022*38fd1498Szrj /* Prefetching.  */
2023*38fd1498Szrj 
2024*38fd1498Szrj namespace {
2025*38fd1498Szrj 
2026*38fd1498Szrj const pass_data pass_data_loop_prefetch =
2027*38fd1498Szrj {
2028*38fd1498Szrj   GIMPLE_PASS, /* type */
2029*38fd1498Szrj   "aprefetch", /* name */
2030*38fd1498Szrj   OPTGROUP_LOOP, /* optinfo_flags */
2031*38fd1498Szrj   TV_TREE_PREFETCH, /* tv_id */
2032*38fd1498Szrj   ( PROP_cfg | PROP_ssa ), /* properties_required */
2033*38fd1498Szrj   0, /* properties_provided */
2034*38fd1498Szrj   0, /* properties_destroyed */
2035*38fd1498Szrj   0, /* todo_flags_start */
2036*38fd1498Szrj   0, /* todo_flags_finish */
2037*38fd1498Szrj };
2038*38fd1498Szrj 
2039*38fd1498Szrj class pass_loop_prefetch : public gimple_opt_pass
2040*38fd1498Szrj {
2041*38fd1498Szrj public:
pass_loop_prefetch(gcc::context * ctxt)2042*38fd1498Szrj   pass_loop_prefetch (gcc::context *ctxt)
2043*38fd1498Szrj     : gimple_opt_pass (pass_data_loop_prefetch, ctxt)
2044*38fd1498Szrj   {}
2045*38fd1498Szrj 
2046*38fd1498Szrj   /* opt_pass methods: */
gate(function *)2047*38fd1498Szrj   virtual bool gate (function *) { return flag_prefetch_loop_arrays > 0; }
2048*38fd1498Szrj   virtual unsigned int execute (function *);
2049*38fd1498Szrj 
2050*38fd1498Szrj }; // class pass_loop_prefetch
2051*38fd1498Szrj 
2052*38fd1498Szrj unsigned int
execute(function * fun)2053*38fd1498Szrj pass_loop_prefetch::execute (function *fun)
2054*38fd1498Szrj {
2055*38fd1498Szrj   if (number_of_loops (fun) <= 1)
2056*38fd1498Szrj     return 0;
2057*38fd1498Szrj 
2058*38fd1498Szrj   if ((PREFETCH_BLOCK & (PREFETCH_BLOCK - 1)) != 0)
2059*38fd1498Szrj     {
2060*38fd1498Szrj       static bool warned = false;
2061*38fd1498Szrj 
2062*38fd1498Szrj       if (!warned)
2063*38fd1498Szrj 	{
2064*38fd1498Szrj 	  warning (OPT_Wdisabled_optimization,
2065*38fd1498Szrj 		   "%<l1-cache-size%> parameter is not a power of two %d",
2066*38fd1498Szrj 		   PREFETCH_BLOCK);
2067*38fd1498Szrj 	  warned = true;
2068*38fd1498Szrj 	}
2069*38fd1498Szrj       return 0;
2070*38fd1498Szrj     }
2071*38fd1498Szrj 
2072*38fd1498Szrj   return tree_ssa_prefetch_arrays ();
2073*38fd1498Szrj }
2074*38fd1498Szrj 
2075*38fd1498Szrj } // anon namespace
2076*38fd1498Szrj 
2077*38fd1498Szrj gimple_opt_pass *
make_pass_loop_prefetch(gcc::context * ctxt)2078*38fd1498Szrj make_pass_loop_prefetch (gcc::context *ctxt)
2079*38fd1498Szrj {
2080*38fd1498Szrj   return new pass_loop_prefetch (ctxt);
2081*38fd1498Szrj }
2082*38fd1498Szrj 
2083*38fd1498Szrj 
2084