1*38fd1498Szrj /* Array prefetching.
2*38fd1498Szrj Copyright (C) 2005-2018 Free Software Foundation, Inc.
3*38fd1498Szrj
4*38fd1498Szrj This file is part of GCC.
5*38fd1498Szrj
6*38fd1498Szrj GCC is free software; you can redistribute it and/or modify it
7*38fd1498Szrj under the terms of the GNU General Public License as published by the
8*38fd1498Szrj Free Software Foundation; either version 3, or (at your option) any
9*38fd1498Szrj later version.
10*38fd1498Szrj
11*38fd1498Szrj GCC is distributed in the hope that it will be useful, but WITHOUT
12*38fd1498Szrj ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13*38fd1498Szrj FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
14*38fd1498Szrj for more details.
15*38fd1498Szrj
16*38fd1498Szrj You should have received a copy of the GNU General Public License
17*38fd1498Szrj along with GCC; see the file COPYING3. If not see
18*38fd1498Szrj <http://www.gnu.org/licenses/>. */
19*38fd1498Szrj
20*38fd1498Szrj #include "config.h"
21*38fd1498Szrj #include "system.h"
22*38fd1498Szrj #include "coretypes.h"
23*38fd1498Szrj #include "backend.h"
24*38fd1498Szrj #include "target.h"
25*38fd1498Szrj #include "rtl.h"
26*38fd1498Szrj #include "tree.h"
27*38fd1498Szrj #include "gimple.h"
28*38fd1498Szrj #include "predict.h"
29*38fd1498Szrj #include "tree-pass.h"
30*38fd1498Szrj #include "gimple-ssa.h"
31*38fd1498Szrj #include "optabs-query.h"
32*38fd1498Szrj #include "tree-pretty-print.h"
33*38fd1498Szrj #include "fold-const.h"
34*38fd1498Szrj #include "stor-layout.h"
35*38fd1498Szrj #include "gimplify.h"
36*38fd1498Szrj #include "gimple-iterator.h"
37*38fd1498Szrj #include "gimplify-me.h"
38*38fd1498Szrj #include "tree-ssa-loop-ivopts.h"
39*38fd1498Szrj #include "tree-ssa-loop-manip.h"
40*38fd1498Szrj #include "tree-ssa-loop-niter.h"
41*38fd1498Szrj #include "tree-ssa-loop.h"
42*38fd1498Szrj #include "ssa.h"
43*38fd1498Szrj #include "tree-into-ssa.h"
44*38fd1498Szrj #include "cfgloop.h"
45*38fd1498Szrj #include "tree-scalar-evolution.h"
46*38fd1498Szrj #include "params.h"
47*38fd1498Szrj #include "langhooks.h"
48*38fd1498Szrj #include "tree-inline.h"
49*38fd1498Szrj #include "tree-data-ref.h"
50*38fd1498Szrj #include "diagnostic-core.h"
51*38fd1498Szrj #include "dbgcnt.h"
52*38fd1498Szrj
53*38fd1498Szrj /* This pass inserts prefetch instructions to optimize cache usage during
54*38fd1498Szrj accesses to arrays in loops. It processes loops sequentially and:
55*38fd1498Szrj
56*38fd1498Szrj 1) Gathers all memory references in the single loop.
   2) For each of the references it decides when it is profitable to prefetch
      it.  To do so, we evaluate the reuse among the accesses, and determine
59*38fd1498Szrj two values: PREFETCH_BEFORE (meaning that it only makes sense to do
60*38fd1498Szrj prefetching in the first PREFETCH_BEFORE iterations of the loop) and
61*38fd1498Szrj PREFETCH_MOD (meaning that it only makes sense to prefetch in the
62*38fd1498Szrj iterations of the loop that are zero modulo PREFETCH_MOD). For example
63*38fd1498Szrj (assuming cache line size is 64 bytes, char has size 1 byte and there
64*38fd1498Szrj is no hardware sequential prefetch):
65*38fd1498Szrj
66*38fd1498Szrj char *a;
67*38fd1498Szrj for (i = 0; i < max; i++)
68*38fd1498Szrj {
69*38fd1498Szrj a[255] = ...; (0)
70*38fd1498Szrj a[i] = ...; (1)
71*38fd1498Szrj a[i + 64] = ...; (2)
72*38fd1498Szrj a[16*i] = ...; (3)
73*38fd1498Szrj a[187*i] = ...; (4)
74*38fd1498Szrj a[187*i + 50] = ...; (5)
75*38fd1498Szrj }
76*38fd1498Szrj
77*38fd1498Szrj (0) obviously has PREFETCH_BEFORE 1
78*38fd1498Szrj (1) has PREFETCH_BEFORE 64, since (2) accesses the same memory
79*38fd1498Szrj location 64 iterations before it, and PREFETCH_MOD 64 (since
80*38fd1498Szrj it hits the same cache line otherwise).
81*38fd1498Szrj (2) has PREFETCH_MOD 64
82*38fd1498Szrj (3) has PREFETCH_MOD 4
83*38fd1498Szrj (4) has PREFETCH_MOD 1. We do not set PREFETCH_BEFORE here, since
84*38fd1498Szrj the cache line accessed by (5) is the same with probability only
85*38fd1498Szrj 7/32.
86*38fd1498Szrj (5) has PREFETCH_MOD 1 as well.
87*38fd1498Szrj
88*38fd1498Szrj Additionally, we use data dependence analysis to determine for each
89*38fd1498Szrj reference the distance till the first reuse; this information is used
90*38fd1498Szrj to determine the temporality of the issued prefetch instruction.
91*38fd1498Szrj
92*38fd1498Szrj 3) We determine how much ahead we need to prefetch. The number of
93*38fd1498Szrj iterations needed is time to fetch / time spent in one iteration of
94*38fd1498Szrj the loop. The problem is that we do not know either of these values,
95*38fd1498Szrj so we just make a heuristic guess based on a magic (possibly)
96*38fd1498Szrj target-specific constant and size of the loop.
97*38fd1498Szrj
98*38fd1498Szrj 4) Determine which of the references we prefetch. We take into account
99*38fd1498Szrj that there is a maximum number of simultaneous prefetches (provided
100*38fd1498Szrj by machine description). We prefetch as many prefetches as possible
101*38fd1498Szrj while still within this bound (starting with those with lowest
102*38fd1498Szrj prefetch_mod, since they are responsible for most of the cache
103*38fd1498Szrj misses).
104*38fd1498Szrj
105*38fd1498Szrj 5) We unroll and peel loops so that we are able to satisfy PREFETCH_MOD
106*38fd1498Szrj and PREFETCH_BEFORE requirements (within some bounds), and to avoid
107*38fd1498Szrj prefetching nonaccessed memory.
108*38fd1498Szrj TODO -- actually implement peeling.
109*38fd1498Szrj
110*38fd1498Szrj 6) We actually emit the prefetch instructions. ??? Perhaps emit the
111*38fd1498Szrj prefetch instructions with guards in cases where 5) was not sufficient
112*38fd1498Szrj to satisfy the constraints?
113*38fd1498Szrj
114*38fd1498Szrj A cost model is implemented to determine whether or not prefetching is
115*38fd1498Szrj profitable for a given loop. The cost model has three heuristics:
116*38fd1498Szrj
117*38fd1498Szrj 1. Function trip_count_to_ahead_ratio_too_small_p implements a
118*38fd1498Szrj heuristic that determines whether or not the loop has too few
119*38fd1498Szrj iterations (compared to ahead). Prefetching is not likely to be
120*38fd1498Szrj beneficial if the trip count to ahead ratio is below a certain
121*38fd1498Szrj minimum.
122*38fd1498Szrj
123*38fd1498Szrj 2. Function mem_ref_count_reasonable_p implements a heuristic that
124*38fd1498Szrj determines whether the given loop has enough CPU ops that can be
125*38fd1498Szrj overlapped with cache missing memory ops. If not, the loop
126*38fd1498Szrj won't benefit from prefetching. In the implementation,
127*38fd1498Szrj prefetching is not considered beneficial if the ratio between
128*38fd1498Szrj the instruction count and the mem ref count is below a certain
129*38fd1498Szrj minimum.
130*38fd1498Szrj
131*38fd1498Szrj 3. Function insn_to_prefetch_ratio_too_small_p implements a
132*38fd1498Szrj heuristic that disables prefetching in a loop if the prefetching
133*38fd1498Szrj cost is above a certain limit. The relative prefetching cost is
134*38fd1498Szrj estimated by taking the ratio between the prefetch count and the
      total instruction count (this models the I-cache cost).
136*38fd1498Szrj
137*38fd1498Szrj The limits used in these heuristics are defined as parameters with
138*38fd1498Szrj reasonable default values. Machine-specific default values will be
139*38fd1498Szrj added later.
140*38fd1498Szrj
141*38fd1498Szrj Some other TODO:
142*38fd1498Szrj -- write and use more general reuse analysis (that could be also used
143*38fd1498Szrj in other cache aimed loop optimizations)
      -- make it behave sanely together with the prefetches given by the user
	 (now we just ignore them; at the very least we should avoid
	 optimizing loops in which the user has placed their own prefetches)
147*38fd1498Szrj -- we assume cache line size alignment of arrays; this could be
148*38fd1498Szrj improved. */
149*38fd1498Szrj
150*38fd1498Szrj /* Magic constants follow. These should be replaced by machine specific
151*38fd1498Szrj numbers. */
152*38fd1498Szrj
/* Nonzero if a write access can be served by a read prefetch.  The default
   below applies only when the macro has not been defined already.  */

#if !defined (WRITE_CAN_USE_READ_PREFETCH)
# define WRITE_CAN_USE_READ_PREFETCH 1
#endif
158*38fd1498Szrj
/* Nonzero if a read access can be served by a write prefetch.  The default
   below applies only when the macro has not been defined already.  */

#if !defined (READ_CAN_USE_WRITE_PREFETCH)
# define READ_CAN_USE_WRITE_PREFETCH 0
#endif
164*38fd1498Szrj
/* Size of the block brought in by one prefetch.  This is usually the cache
   line size; only a single level of the cache hierarchy is considered at
   the moment.  */

#if !defined (PREFETCH_BLOCK)
# define PREFETCH_BLOCK L1_CACHE_LINE_SIZE
#endif
172*38fd1498Szrj
/* Nonzero if the hardware performs forward sequential prefetching.  */

#if !defined (HAVE_FORWARD_PREFETCH)
# define HAVE_FORWARD_PREFETCH 0
#endif
178*38fd1498Szrj
/* Nonzero if the hardware performs backward sequential prefetching.  */

#if !defined (HAVE_BACKWARD_PREFETCH)
# define HAVE_BACKWARD_PREFETCH 0
#endif
184*38fd1498Szrj
/* Sometimes we can only establish that two accesses hit the same cache
   line with a certain probability.  In that case prefetches are issued
   for both of them if this probability is less than
   (1000 - ACCEPTABLE_MISS_RATE) per thousand.  */

#if !defined (ACCEPTABLE_MISS_RATE)
# define ACCEPTABLE_MISS_RATE 50
#endif
193*38fd1498Szrj
/* L1_CACHE_SIZE and L2_CACHE_SIZE multiplied by 1024, as unsigned
   values.  */
#define L1_CACHE_SIZE_BYTES ((unsigned) (L1_CACHE_SIZE * 1024))
#define L2_CACHE_SIZE_BYTES ((unsigned) (L2_CACHE_SIZE * 1024))
196*38fd1498Szrj
/* A memory access counts as nontemporal when it is not reused before
   L2_CACHE_SIZE_BYTES of memory have been accessed.  Accesses closer than
   L1_CACHE_SIZE_BYTES / NONTEMPORAL_FRACTION are ignored, however, so that
   nontemporal prefetches are still used e.g. when a single memory location
   is accessed several times within one iteration of the loop.  */
#define NONTEMPORAL_FRACTION 16
203*38fd1498Szrj
/* The builtin to use when a memory fence instruction has to be emitted
   after a loop that uses nontemporal stores.  Defaults to NULL_TREE.  */

#if !defined (FENCE_FOLLOWING_MOVNT)
# define FENCE_FOLLOWING_MOVNT NULL_TREE
#endif
210*38fd1498Szrj
/* Prefetching is not profitable unless the trip count is at least
   TRIP_COUNT_TO_AHEAD_RATIO times the prefetch ahead distance.  For
   instance, with a prefetch ahead distance of 10 and
   TRIP_COUNT_TO_AHEAD_RATIO equal to 4, prefetching pays off once the trip
   count is greater than or equal to 40; then 30 out of the 40 iterations
   benefit from prefetching.  */

#if !defined (TRIP_COUNT_TO_AHEAD_RATIO)
# define TRIP_COUNT_TO_AHEAD_RATIO 4
#endif
222*38fd1498Szrj
223*38fd1498Szrj /* The group of references between that reuse may occur. */
224*38fd1498Szrj
225*38fd1498Szrj struct mem_ref_group
226*38fd1498Szrj {
227*38fd1498Szrj tree base; /* Base of the reference. */
228*38fd1498Szrj tree step; /* Step of the reference. */
229*38fd1498Szrj struct mem_ref *refs; /* References in the group. */
230*38fd1498Szrj struct mem_ref_group *next; /* Next group of references. */
231*38fd1498Szrj unsigned int uid; /* Group UID, used only for debugging. */
232*38fd1498Szrj };
233*38fd1498Szrj
/* Value stored in PREFETCH_BEFORE when every iteration is to be
   prefetched.  */

#define PREFETCH_ALL HOST_WIDE_INT_M1U
237*38fd1498Szrj
/* To avoid redundant prefetches, no prefetch is generated when the unroll
   factor is significantly smaller than what the prefetch requires.  For
   example, with prefetch_mod 16 and unroll_factor 2, prefetching would
   need the loop to be unrolled 16 times while it is really unrolled only
   twice.  With such a ratio (8), prefetching is unlikely to help.  */

#if !defined (PREFETCH_MOD_TO_UNROLL_FACTOR_RATIO)
# define PREFETCH_MOD_TO_UNROLL_FACTOR_RATIO 4
#endif
248*38fd1498Szrj
/* Several of the prefetch computations are quadratic in the number of
   memory references.  To keep compile time in check, prefetching is only
   considered for loops with a limited number of memory references.  */

#if !defined (PREFETCH_MAX_MEM_REFS_PER_LOOP)
# define PREFETCH_MAX_MEM_REFS_PER_LOOP 200
#endif
256*38fd1498Szrj
257*38fd1498Szrj /* The memory reference. */
258*38fd1498Szrj
259*38fd1498Szrj struct mem_ref
260*38fd1498Szrj {
261*38fd1498Szrj gimple *stmt; /* Statement in that the reference appears. */
262*38fd1498Szrj tree mem; /* The reference. */
263*38fd1498Szrj HOST_WIDE_INT delta; /* Constant offset of the reference. */
264*38fd1498Szrj struct mem_ref_group *group; /* The group of references it belongs to. */
265*38fd1498Szrj unsigned HOST_WIDE_INT prefetch_mod;
266*38fd1498Szrj /* Prefetch only each PREFETCH_MOD-th
267*38fd1498Szrj iteration. */
268*38fd1498Szrj unsigned HOST_WIDE_INT prefetch_before;
269*38fd1498Szrj /* Prefetch only first PREFETCH_BEFORE
270*38fd1498Szrj iterations. */
271*38fd1498Szrj unsigned reuse_distance; /* The amount of data accessed before the first
272*38fd1498Szrj reuse of this value. */
273*38fd1498Szrj struct mem_ref *next; /* The next reference in the group. */
274*38fd1498Szrj unsigned int uid; /* Ref UID, used only for debugging. */
275*38fd1498Szrj unsigned write_p : 1; /* Is it a write? */
276*38fd1498Szrj unsigned independent_p : 1; /* True if the reference is independent on
277*38fd1498Szrj all other references inside the loop. */
278*38fd1498Szrj unsigned issue_prefetch_p : 1; /* Should we really issue the prefetch? */
279*38fd1498Szrj unsigned storent_p : 1; /* True if we changed the store to a
280*38fd1498Szrj nontemporal one. */
281*38fd1498Szrj };
282*38fd1498Szrj
283*38fd1498Szrj /* Dumps information about memory reference */
284*38fd1498Szrj static void
dump_mem_details(FILE * file,tree base,tree step,HOST_WIDE_INT delta,bool write_p)285*38fd1498Szrj dump_mem_details (FILE *file, tree base, tree step,
286*38fd1498Szrj HOST_WIDE_INT delta, bool write_p)
287*38fd1498Szrj {
288*38fd1498Szrj fprintf (file, "(base ");
289*38fd1498Szrj print_generic_expr (file, base, TDF_SLIM);
290*38fd1498Szrj fprintf (file, ", step ");
291*38fd1498Szrj if (cst_and_fits_in_hwi (step))
292*38fd1498Szrj fprintf (file, HOST_WIDE_INT_PRINT_DEC, int_cst_value (step));
293*38fd1498Szrj else
294*38fd1498Szrj print_generic_expr (file, step, TDF_SLIM);
295*38fd1498Szrj fprintf (file, ")\n");
296*38fd1498Szrj fprintf (file, " delta " HOST_WIDE_INT_PRINT_DEC "\n", delta);
297*38fd1498Szrj fprintf (file, " %s\n\n", write_p ? "write" : "read");
298*38fd1498Szrj }
299*38fd1498Szrj
300*38fd1498Szrj /* Dumps information about reference REF to FILE. */
301*38fd1498Szrj
302*38fd1498Szrj static void
dump_mem_ref(FILE * file,struct mem_ref * ref)303*38fd1498Szrj dump_mem_ref (FILE *file, struct mem_ref *ref)
304*38fd1498Szrj {
305*38fd1498Szrj fprintf (file, "reference %u:%u (", ref->group->uid, ref->uid);
306*38fd1498Szrj print_generic_expr (file, ref->mem, TDF_SLIM);
307*38fd1498Szrj fprintf (file, ")\n");
308*38fd1498Szrj }
309*38fd1498Szrj
310*38fd1498Szrj /* Finds a group with BASE and STEP in GROUPS, or creates one if it does not
311*38fd1498Szrj exist. */
312*38fd1498Szrj
313*38fd1498Szrj static struct mem_ref_group *
find_or_create_group(struct mem_ref_group ** groups,tree base,tree step)314*38fd1498Szrj find_or_create_group (struct mem_ref_group **groups, tree base, tree step)
315*38fd1498Szrj {
316*38fd1498Szrj /* Global count for setting struct mem_ref_group->uid. */
317*38fd1498Szrj static unsigned int last_mem_ref_group_uid = 0;
318*38fd1498Szrj
319*38fd1498Szrj struct mem_ref_group *group;
320*38fd1498Szrj
321*38fd1498Szrj for (; *groups; groups = &(*groups)->next)
322*38fd1498Szrj {
323*38fd1498Szrj if (operand_equal_p ((*groups)->step, step, 0)
324*38fd1498Szrj && operand_equal_p ((*groups)->base, base, 0))
325*38fd1498Szrj return *groups;
326*38fd1498Szrj
327*38fd1498Szrj /* If step is an integer constant, keep the list of groups sorted
328*38fd1498Szrj by decreasing step. */
329*38fd1498Szrj if (cst_and_fits_in_hwi ((*groups)->step) && cst_and_fits_in_hwi (step)
330*38fd1498Szrj && int_cst_value ((*groups)->step) < int_cst_value (step))
331*38fd1498Szrj break;
332*38fd1498Szrj }
333*38fd1498Szrj
334*38fd1498Szrj group = XNEW (struct mem_ref_group);
335*38fd1498Szrj group->base = base;
336*38fd1498Szrj group->step = step;
337*38fd1498Szrj group->refs = NULL;
338*38fd1498Szrj group->uid = ++last_mem_ref_group_uid;
339*38fd1498Szrj group->next = *groups;
340*38fd1498Szrj *groups = group;
341*38fd1498Szrj
342*38fd1498Szrj return group;
343*38fd1498Szrj }
344*38fd1498Szrj
345*38fd1498Szrj /* Records a memory reference MEM in GROUP with offset DELTA and write status
346*38fd1498Szrj WRITE_P. The reference occurs in statement STMT. */
347*38fd1498Szrj
348*38fd1498Szrj static void
record_ref(struct mem_ref_group * group,gimple * stmt,tree mem,HOST_WIDE_INT delta,bool write_p)349*38fd1498Szrj record_ref (struct mem_ref_group *group, gimple *stmt, tree mem,
350*38fd1498Szrj HOST_WIDE_INT delta, bool write_p)
351*38fd1498Szrj {
352*38fd1498Szrj unsigned int last_mem_ref_uid = 0;
353*38fd1498Szrj struct mem_ref **aref;
354*38fd1498Szrj
355*38fd1498Szrj /* Do not record the same address twice. */
356*38fd1498Szrj for (aref = &group->refs; *aref; aref = &(*aref)->next)
357*38fd1498Szrj {
358*38fd1498Szrj last_mem_ref_uid = (*aref)->uid;
359*38fd1498Szrj
360*38fd1498Szrj /* It does not have to be possible for write reference to reuse the read
361*38fd1498Szrj prefetch, or vice versa. */
362*38fd1498Szrj if (!WRITE_CAN_USE_READ_PREFETCH
363*38fd1498Szrj && write_p
364*38fd1498Szrj && !(*aref)->write_p)
365*38fd1498Szrj continue;
366*38fd1498Szrj if (!READ_CAN_USE_WRITE_PREFETCH
367*38fd1498Szrj && !write_p
368*38fd1498Szrj && (*aref)->write_p)
369*38fd1498Szrj continue;
370*38fd1498Szrj
371*38fd1498Szrj if ((*aref)->delta == delta)
372*38fd1498Szrj return;
373*38fd1498Szrj }
374*38fd1498Szrj
375*38fd1498Szrj (*aref) = XNEW (struct mem_ref);
376*38fd1498Szrj (*aref)->stmt = stmt;
377*38fd1498Szrj (*aref)->mem = mem;
378*38fd1498Szrj (*aref)->delta = delta;
379*38fd1498Szrj (*aref)->write_p = write_p;
380*38fd1498Szrj (*aref)->prefetch_before = PREFETCH_ALL;
381*38fd1498Szrj (*aref)->prefetch_mod = 1;
382*38fd1498Szrj (*aref)->reuse_distance = 0;
383*38fd1498Szrj (*aref)->issue_prefetch_p = false;
384*38fd1498Szrj (*aref)->group = group;
385*38fd1498Szrj (*aref)->next = NULL;
386*38fd1498Szrj (*aref)->independent_p = false;
387*38fd1498Szrj (*aref)->storent_p = false;
388*38fd1498Szrj (*aref)->uid = last_mem_ref_uid + 1;
389*38fd1498Szrj
390*38fd1498Szrj if (dump_file && (dump_flags & TDF_DETAILS))
391*38fd1498Szrj {
392*38fd1498Szrj dump_mem_ref (dump_file, *aref);
393*38fd1498Szrj
394*38fd1498Szrj fprintf (dump_file, " group %u ", group->uid);
395*38fd1498Szrj dump_mem_details (dump_file, group->base, group->step, delta,
396*38fd1498Szrj write_p);
397*38fd1498Szrj }
398*38fd1498Szrj }
399*38fd1498Szrj
400*38fd1498Szrj /* Release memory references in GROUPS. */
401*38fd1498Szrj
402*38fd1498Szrj static void
release_mem_refs(struct mem_ref_group * groups)403*38fd1498Szrj release_mem_refs (struct mem_ref_group *groups)
404*38fd1498Szrj {
405*38fd1498Szrj struct mem_ref_group *next_g;
406*38fd1498Szrj struct mem_ref *ref, *next_r;
407*38fd1498Szrj
408*38fd1498Szrj for (; groups; groups = next_g)
409*38fd1498Szrj {
410*38fd1498Szrj next_g = groups->next;
411*38fd1498Szrj for (ref = groups->refs; ref; ref = next_r)
412*38fd1498Szrj {
413*38fd1498Szrj next_r = ref->next;
414*38fd1498Szrj free (ref);
415*38fd1498Szrj }
416*38fd1498Szrj free (groups);
417*38fd1498Szrj }
418*38fd1498Szrj }
419*38fd1498Szrj
420*38fd1498Szrj /* A structure used to pass arguments to idx_analyze_ref. */
421*38fd1498Szrj
422*38fd1498Szrj struct ar_data
423*38fd1498Szrj {
424*38fd1498Szrj struct loop *loop; /* Loop of the reference. */
425*38fd1498Szrj gimple *stmt; /* Statement of the reference. */
426*38fd1498Szrj tree *step; /* Step of the memory reference. */
427*38fd1498Szrj HOST_WIDE_INT *delta; /* Offset of the memory reference. */
428*38fd1498Szrj };
429*38fd1498Szrj
430*38fd1498Szrj /* Analyzes a single INDEX of a memory reference to obtain information
431*38fd1498Szrj described at analyze_ref. Callback for for_each_index. */
432*38fd1498Szrj
433*38fd1498Szrj static bool
idx_analyze_ref(tree base,tree * index,void * data)434*38fd1498Szrj idx_analyze_ref (tree base, tree *index, void *data)
435*38fd1498Szrj {
436*38fd1498Szrj struct ar_data *ar_data = (struct ar_data *) data;
437*38fd1498Szrj tree ibase, step, stepsize;
438*38fd1498Szrj HOST_WIDE_INT idelta = 0, imult = 1;
439*38fd1498Szrj affine_iv iv;
440*38fd1498Szrj
441*38fd1498Szrj if (!simple_iv (ar_data->loop, loop_containing_stmt (ar_data->stmt),
442*38fd1498Szrj *index, &iv, true))
443*38fd1498Szrj return false;
444*38fd1498Szrj ibase = iv.base;
445*38fd1498Szrj step = iv.step;
446*38fd1498Szrj
447*38fd1498Szrj if (TREE_CODE (ibase) == POINTER_PLUS_EXPR
448*38fd1498Szrj && cst_and_fits_in_hwi (TREE_OPERAND (ibase, 1)))
449*38fd1498Szrj {
450*38fd1498Szrj idelta = int_cst_value (TREE_OPERAND (ibase, 1));
451*38fd1498Szrj ibase = TREE_OPERAND (ibase, 0);
452*38fd1498Szrj }
453*38fd1498Szrj if (cst_and_fits_in_hwi (ibase))
454*38fd1498Szrj {
455*38fd1498Szrj idelta += int_cst_value (ibase);
456*38fd1498Szrj ibase = build_int_cst (TREE_TYPE (ibase), 0);
457*38fd1498Szrj }
458*38fd1498Szrj
459*38fd1498Szrj if (TREE_CODE (base) == ARRAY_REF)
460*38fd1498Szrj {
461*38fd1498Szrj stepsize = array_ref_element_size (base);
462*38fd1498Szrj if (!cst_and_fits_in_hwi (stepsize))
463*38fd1498Szrj return false;
464*38fd1498Szrj imult = int_cst_value (stepsize);
465*38fd1498Szrj step = fold_build2 (MULT_EXPR, sizetype,
466*38fd1498Szrj fold_convert (sizetype, step),
467*38fd1498Szrj fold_convert (sizetype, stepsize));
468*38fd1498Szrj idelta *= imult;
469*38fd1498Szrj }
470*38fd1498Szrj
471*38fd1498Szrj if (*ar_data->step == NULL_TREE)
472*38fd1498Szrj *ar_data->step = step;
473*38fd1498Szrj else
474*38fd1498Szrj *ar_data->step = fold_build2 (PLUS_EXPR, sizetype,
475*38fd1498Szrj fold_convert (sizetype, *ar_data->step),
476*38fd1498Szrj fold_convert (sizetype, step));
477*38fd1498Szrj *ar_data->delta += idelta;
478*38fd1498Szrj *index = ibase;
479*38fd1498Szrj
480*38fd1498Szrj return true;
481*38fd1498Szrj }
482*38fd1498Szrj
483*38fd1498Szrj /* Tries to express REF_P in shape &BASE + STEP * iter + DELTA, where DELTA and
484*38fd1498Szrj STEP are integer constants and iter is number of iterations of LOOP. The
485*38fd1498Szrj reference occurs in statement STMT. Strips nonaddressable component
486*38fd1498Szrj references from REF_P. */
487*38fd1498Szrj
488*38fd1498Szrj static bool
analyze_ref(struct loop * loop,tree * ref_p,tree * base,tree * step,HOST_WIDE_INT * delta,gimple * stmt)489*38fd1498Szrj analyze_ref (struct loop *loop, tree *ref_p, tree *base,
490*38fd1498Szrj tree *step, HOST_WIDE_INT *delta,
491*38fd1498Szrj gimple *stmt)
492*38fd1498Szrj {
493*38fd1498Szrj struct ar_data ar_data;
494*38fd1498Szrj tree off;
495*38fd1498Szrj HOST_WIDE_INT bit_offset;
496*38fd1498Szrj tree ref = *ref_p;
497*38fd1498Szrj
498*38fd1498Szrj *step = NULL_TREE;
499*38fd1498Szrj *delta = 0;
500*38fd1498Szrj
501*38fd1498Szrj /* First strip off the component references. Ignore bitfields.
502*38fd1498Szrj Also strip off the real and imagine parts of a complex, so that
503*38fd1498Szrj they can have the same base. */
504*38fd1498Szrj if (TREE_CODE (ref) == REALPART_EXPR
505*38fd1498Szrj || TREE_CODE (ref) == IMAGPART_EXPR
506*38fd1498Szrj || (TREE_CODE (ref) == COMPONENT_REF
507*38fd1498Szrj && DECL_NONADDRESSABLE_P (TREE_OPERAND (ref, 1))))
508*38fd1498Szrj {
509*38fd1498Szrj if (TREE_CODE (ref) == IMAGPART_EXPR)
510*38fd1498Szrj *delta += int_size_in_bytes (TREE_TYPE (ref));
511*38fd1498Szrj ref = TREE_OPERAND (ref, 0);
512*38fd1498Szrj }
513*38fd1498Szrj
514*38fd1498Szrj *ref_p = ref;
515*38fd1498Szrj
516*38fd1498Szrj for (; TREE_CODE (ref) == COMPONENT_REF; ref = TREE_OPERAND (ref, 0))
517*38fd1498Szrj {
518*38fd1498Szrj off = DECL_FIELD_BIT_OFFSET (TREE_OPERAND (ref, 1));
519*38fd1498Szrj bit_offset = TREE_INT_CST_LOW (off);
520*38fd1498Szrj gcc_assert (bit_offset % BITS_PER_UNIT == 0);
521*38fd1498Szrj
522*38fd1498Szrj *delta += bit_offset / BITS_PER_UNIT;
523*38fd1498Szrj }
524*38fd1498Szrj
525*38fd1498Szrj *base = unshare_expr (ref);
526*38fd1498Szrj ar_data.loop = loop;
527*38fd1498Szrj ar_data.stmt = stmt;
528*38fd1498Szrj ar_data.step = step;
529*38fd1498Szrj ar_data.delta = delta;
530*38fd1498Szrj return for_each_index (base, idx_analyze_ref, &ar_data);
531*38fd1498Szrj }
532*38fd1498Szrj
533*38fd1498Szrj /* Record a memory reference REF to the list REFS. The reference occurs in
534*38fd1498Szrj LOOP in statement STMT and it is write if WRITE_P. Returns true if the
535*38fd1498Szrj reference was recorded, false otherwise. */
536*38fd1498Szrj
537*38fd1498Szrj static bool
gather_memory_references_ref(struct loop * loop,struct mem_ref_group ** refs,tree ref,bool write_p,gimple * stmt)538*38fd1498Szrj gather_memory_references_ref (struct loop *loop, struct mem_ref_group **refs,
539*38fd1498Szrj tree ref, bool write_p, gimple *stmt)
540*38fd1498Szrj {
541*38fd1498Szrj tree base, step;
542*38fd1498Szrj HOST_WIDE_INT delta;
543*38fd1498Szrj struct mem_ref_group *agrp;
544*38fd1498Szrj
545*38fd1498Szrj if (get_base_address (ref) == NULL)
546*38fd1498Szrj return false;
547*38fd1498Szrj
548*38fd1498Szrj if (!analyze_ref (loop, &ref, &base, &step, &delta, stmt))
549*38fd1498Szrj return false;
550*38fd1498Szrj /* If analyze_ref fails the default is a NULL_TREE. We can stop here. */
551*38fd1498Szrj if (step == NULL_TREE)
552*38fd1498Szrj return false;
553*38fd1498Szrj
554*38fd1498Szrj /* Stop if the address of BASE could not be taken. */
555*38fd1498Szrj if (may_be_nonaddressable_p (base))
556*38fd1498Szrj return false;
557*38fd1498Szrj
558*38fd1498Szrj /* Limit non-constant step prefetching only to the innermost loops and
559*38fd1498Szrj only when the step is loop invariant in the entire loop nest. */
560*38fd1498Szrj if (!cst_and_fits_in_hwi (step))
561*38fd1498Szrj {
562*38fd1498Szrj if (loop->inner != NULL)
563*38fd1498Szrj {
564*38fd1498Szrj if (dump_file && (dump_flags & TDF_DETAILS))
565*38fd1498Szrj {
566*38fd1498Szrj fprintf (dump_file, "Memory expression %p\n",(void *) ref );
567*38fd1498Szrj print_generic_expr (dump_file, ref, TDF_SLIM);
568*38fd1498Szrj fprintf (dump_file,":");
569*38fd1498Szrj dump_mem_details (dump_file, base, step, delta, write_p);
570*38fd1498Szrj fprintf (dump_file,
571*38fd1498Szrj "Ignoring %p, non-constant step prefetching is "
572*38fd1498Szrj "limited to inner most loops \n",
573*38fd1498Szrj (void *) ref);
574*38fd1498Szrj }
575*38fd1498Szrj return false;
576*38fd1498Szrj }
577*38fd1498Szrj else
578*38fd1498Szrj {
579*38fd1498Szrj if (!expr_invariant_in_loop_p (loop_outermost (loop), step))
580*38fd1498Szrj {
581*38fd1498Szrj if (dump_file && (dump_flags & TDF_DETAILS))
582*38fd1498Szrj {
583*38fd1498Szrj fprintf (dump_file, "Memory expression %p\n",(void *) ref );
584*38fd1498Szrj print_generic_expr (dump_file, ref, TDF_SLIM);
585*38fd1498Szrj fprintf (dump_file,":");
586*38fd1498Szrj dump_mem_details (dump_file, base, step, delta, write_p);
587*38fd1498Szrj fprintf (dump_file,
588*38fd1498Szrj "Not prefetching, ignoring %p due to "
589*38fd1498Szrj "loop variant step\n",
590*38fd1498Szrj (void *) ref);
591*38fd1498Szrj }
592*38fd1498Szrj return false;
593*38fd1498Szrj }
594*38fd1498Szrj }
595*38fd1498Szrj }
596*38fd1498Szrj
597*38fd1498Szrj /* Now we know that REF = &BASE + STEP * iter + DELTA, where DELTA and STEP
598*38fd1498Szrj are integer constants. */
599*38fd1498Szrj agrp = find_or_create_group (refs, base, step);
600*38fd1498Szrj record_ref (agrp, stmt, ref, delta, write_p);
601*38fd1498Szrj
602*38fd1498Szrj return true;
603*38fd1498Szrj }
604*38fd1498Szrj
605*38fd1498Szrj /* Record the suitable memory references in LOOP. NO_OTHER_REFS is set to
606*38fd1498Szrj true if there are no other memory references inside the loop. */
607*38fd1498Szrj
608*38fd1498Szrj static struct mem_ref_group *
gather_memory_references(struct loop * loop,bool * no_other_refs,unsigned * ref_count)609*38fd1498Szrj gather_memory_references (struct loop *loop, bool *no_other_refs, unsigned *ref_count)
610*38fd1498Szrj {
611*38fd1498Szrj basic_block *body = get_loop_body_in_dom_order (loop);
612*38fd1498Szrj basic_block bb;
613*38fd1498Szrj unsigned i;
614*38fd1498Szrj gimple_stmt_iterator bsi;
615*38fd1498Szrj gimple *stmt;
616*38fd1498Szrj tree lhs, rhs;
617*38fd1498Szrj struct mem_ref_group *refs = NULL;
618*38fd1498Szrj
619*38fd1498Szrj *no_other_refs = true;
620*38fd1498Szrj *ref_count = 0;
621*38fd1498Szrj
622*38fd1498Szrj /* Scan the loop body in order, so that the former references precede the
623*38fd1498Szrj later ones. */
624*38fd1498Szrj for (i = 0; i < loop->num_nodes; i++)
625*38fd1498Szrj {
626*38fd1498Szrj bb = body[i];
627*38fd1498Szrj if (bb->loop_father != loop)
628*38fd1498Szrj continue;
629*38fd1498Szrj
630*38fd1498Szrj for (bsi = gsi_start_bb (bb); !gsi_end_p (bsi); gsi_next (&bsi))
631*38fd1498Szrj {
632*38fd1498Szrj stmt = gsi_stmt (bsi);
633*38fd1498Szrj
634*38fd1498Szrj if (gimple_code (stmt) != GIMPLE_ASSIGN)
635*38fd1498Szrj {
636*38fd1498Szrj if (gimple_vuse (stmt)
637*38fd1498Szrj || (is_gimple_call (stmt)
638*38fd1498Szrj && !(gimple_call_flags (stmt) & ECF_CONST)))
639*38fd1498Szrj *no_other_refs = false;
640*38fd1498Szrj continue;
641*38fd1498Szrj }
642*38fd1498Szrj
643*38fd1498Szrj if (! gimple_vuse (stmt))
644*38fd1498Szrj continue;
645*38fd1498Szrj
646*38fd1498Szrj lhs = gimple_assign_lhs (stmt);
647*38fd1498Szrj rhs = gimple_assign_rhs1 (stmt);
648*38fd1498Szrj
649*38fd1498Szrj if (REFERENCE_CLASS_P (rhs))
650*38fd1498Szrj {
651*38fd1498Szrj *no_other_refs &= gather_memory_references_ref (loop, &refs,
652*38fd1498Szrj rhs, false, stmt);
653*38fd1498Szrj *ref_count += 1;
654*38fd1498Szrj }
655*38fd1498Szrj if (REFERENCE_CLASS_P (lhs))
656*38fd1498Szrj {
657*38fd1498Szrj *no_other_refs &= gather_memory_references_ref (loop, &refs,
658*38fd1498Szrj lhs, true, stmt);
659*38fd1498Szrj *ref_count += 1;
660*38fd1498Szrj }
661*38fd1498Szrj }
662*38fd1498Szrj }
663*38fd1498Szrj free (body);
664*38fd1498Szrj
665*38fd1498Szrj return refs;
666*38fd1498Szrj }
667*38fd1498Szrj
668*38fd1498Szrj /* Prune the prefetch candidate REF using the self-reuse. */
669*38fd1498Szrj
670*38fd1498Szrj static void
prune_ref_by_self_reuse(struct mem_ref * ref)671*38fd1498Szrj prune_ref_by_self_reuse (struct mem_ref *ref)
672*38fd1498Szrj {
673*38fd1498Szrj HOST_WIDE_INT step;
674*38fd1498Szrj bool backward;
675*38fd1498Szrj
676*38fd1498Szrj /* If the step size is non constant, we cannot calculate prefetch_mod. */
677*38fd1498Szrj if (!cst_and_fits_in_hwi (ref->group->step))
678*38fd1498Szrj return;
679*38fd1498Szrj
680*38fd1498Szrj step = int_cst_value (ref->group->step);
681*38fd1498Szrj
682*38fd1498Szrj backward = step < 0;
683*38fd1498Szrj
684*38fd1498Szrj if (step == 0)
685*38fd1498Szrj {
686*38fd1498Szrj /* Prefetch references to invariant address just once. */
687*38fd1498Szrj ref->prefetch_before = 1;
688*38fd1498Szrj return;
689*38fd1498Szrj }
690*38fd1498Szrj
691*38fd1498Szrj if (backward)
692*38fd1498Szrj step = -step;
693*38fd1498Szrj
694*38fd1498Szrj if (step > PREFETCH_BLOCK)
695*38fd1498Szrj return;
696*38fd1498Szrj
697*38fd1498Szrj if ((backward && HAVE_BACKWARD_PREFETCH)
698*38fd1498Szrj || (!backward && HAVE_FORWARD_PREFETCH))
699*38fd1498Szrj {
700*38fd1498Szrj ref->prefetch_before = 1;
701*38fd1498Szrj return;
702*38fd1498Szrj }
703*38fd1498Szrj
704*38fd1498Szrj ref->prefetch_mod = PREFETCH_BLOCK / step;
705*38fd1498Szrj }
706*38fd1498Szrj
/* Divides X by BY, rounding the quotient down, i.e. towards negative
   infinity.  BY must be positive (checked by an assertion).  */

static HOST_WIDE_INT
ddown (HOST_WIDE_INT x, unsigned HOST_WIDE_INT by)
{
  gcc_assert (by > 0);

  const HOST_WIDE_INT divisor = (HOST_WIDE_INT) by;
  HOST_WIDE_INT quotient = x / divisor;

  /* Integer division truncates towards zero, which already is the floor
     for non-negative X and for exact multiples of BY.  A negative X with
     a nonzero remainder was rounded up, so go one step further down.
     The previous formula (x + by - 1) / by returned floor + 1 for every
     negative X (e.g. 0 instead of -1 for X == -1), which made addresses
     just below a cache-line boundary look as if they shared the line
     above it.  Using the remainder also avoids overflowing X + BY - 1.  */
  if (x % divisor < 0)
    quotient--;

  return quotient;
}
719*38fd1498Szrj
/* Estimate how often two inductive memory references fall into different
   cache lines of CACHE_LINE_SIZE bytes.  Both references advance by STEP
   bytes per iteration and their addresses differ by DELTA bytes.  The
   first reference is tried at every offset within a cache line that is a
   multiple of ALIGN_UNIT bytes, for each of DISTINCT_ITERS iterations
   (the number of iterations after which the pattern repeats).  Returns
   true if the count of positions landing in different lines does not
   exceed ACCEPTABLE_MISS_RATE per thousand of all positions tried.  */

static bool
is_miss_rate_acceptable (unsigned HOST_WIDE_INT cache_line_size,
                         HOST_WIDE_INT step, HOST_WIDE_INT delta,
                         unsigned HOST_WIDE_INT distinct_iters,
                         int align_unit)
{
  /* References a whole cache line or more apart never share a line.  */
  if (delta >= (HOST_WIDE_INT) cache_line_size)
    return false;

  int n_positions = (cache_line_size / align_unit) * distinct_iters;
  int miss_budget = (ACCEPTABLE_MISS_RATE * n_positions) / 1000;
  int n_misses = 0;

  /* Outer loop: start offsets of the first reference within its cache
     line.  Inner loop: the distinct iterations.  */
  for (unsigned offset = 0; offset < cache_line_size; offset += align_unit)
    {
      for (unsigned it = 0; it < distinct_iters; it++)
        {
          int first_addr = offset + step * it;
          int second_addr = first_addr + delta;
          int first_line = first_addr / cache_line_size;
          int second_line = second_addr / cache_line_size;

          if (first_line == second_line)
            continue;

          /* Give up as soon as the budget of misses is exceeded.  */
          if (++n_misses > miss_budget)
            return false;
        }
    }

  return true;
}
767*38fd1498Szrj
768*38fd1498Szrj /* Prune the prefetch candidate REF using the reuse with BY.
769*38fd1498Szrj If BY_IS_BEFORE is true, BY is before REF in the loop. */
770*38fd1498Szrj
771*38fd1498Szrj static void
prune_ref_by_group_reuse(struct mem_ref * ref,struct mem_ref * by,bool by_is_before)772*38fd1498Szrj prune_ref_by_group_reuse (struct mem_ref *ref, struct mem_ref *by,
773*38fd1498Szrj bool by_is_before)
774*38fd1498Szrj {
775*38fd1498Szrj HOST_WIDE_INT step;
776*38fd1498Szrj bool backward;
777*38fd1498Szrj HOST_WIDE_INT delta_r = ref->delta, delta_b = by->delta;
778*38fd1498Szrj HOST_WIDE_INT delta = delta_b - delta_r;
779*38fd1498Szrj HOST_WIDE_INT hit_from;
780*38fd1498Szrj unsigned HOST_WIDE_INT prefetch_before, prefetch_block;
781*38fd1498Szrj HOST_WIDE_INT reduced_step;
782*38fd1498Szrj unsigned HOST_WIDE_INT reduced_prefetch_block;
783*38fd1498Szrj tree ref_type;
784*38fd1498Szrj int align_unit;
785*38fd1498Szrj
786*38fd1498Szrj /* If the step is non constant we cannot calculate prefetch_before. */
787*38fd1498Szrj if (!cst_and_fits_in_hwi (ref->group->step)) {
788*38fd1498Szrj return;
789*38fd1498Szrj }
790*38fd1498Szrj
791*38fd1498Szrj step = int_cst_value (ref->group->step);
792*38fd1498Szrj
793*38fd1498Szrj backward = step < 0;
794*38fd1498Szrj
795*38fd1498Szrj
796*38fd1498Szrj if (delta == 0)
797*38fd1498Szrj {
798*38fd1498Szrj /* If the references has the same address, only prefetch the
799*38fd1498Szrj former. */
800*38fd1498Szrj if (by_is_before)
801*38fd1498Szrj ref->prefetch_before = 0;
802*38fd1498Szrj
803*38fd1498Szrj return;
804*38fd1498Szrj }
805*38fd1498Szrj
806*38fd1498Szrj if (!step)
807*38fd1498Szrj {
808*38fd1498Szrj /* If the reference addresses are invariant and fall into the
809*38fd1498Szrj same cache line, prefetch just the first one. */
810*38fd1498Szrj if (!by_is_before)
811*38fd1498Szrj return;
812*38fd1498Szrj
813*38fd1498Szrj if (ddown (ref->delta, PREFETCH_BLOCK)
814*38fd1498Szrj != ddown (by->delta, PREFETCH_BLOCK))
815*38fd1498Szrj return;
816*38fd1498Szrj
817*38fd1498Szrj ref->prefetch_before = 0;
818*38fd1498Szrj return;
819*38fd1498Szrj }
820*38fd1498Szrj
821*38fd1498Szrj /* Only prune the reference that is behind in the array. */
822*38fd1498Szrj if (backward)
823*38fd1498Szrj {
824*38fd1498Szrj if (delta > 0)
825*38fd1498Szrj return;
826*38fd1498Szrj
827*38fd1498Szrj /* Transform the data so that we may assume that the accesses
828*38fd1498Szrj are forward. */
829*38fd1498Szrj delta = - delta;
830*38fd1498Szrj step = -step;
831*38fd1498Szrj delta_r = PREFETCH_BLOCK - 1 - delta_r;
832*38fd1498Szrj delta_b = PREFETCH_BLOCK - 1 - delta_b;
833*38fd1498Szrj }
834*38fd1498Szrj else
835*38fd1498Szrj {
836*38fd1498Szrj if (delta < 0)
837*38fd1498Szrj return;
838*38fd1498Szrj }
839*38fd1498Szrj
840*38fd1498Szrj /* Check whether the two references are likely to hit the same cache
841*38fd1498Szrj line, and how distant the iterations in that it occurs are from
842*38fd1498Szrj each other. */
843*38fd1498Szrj
844*38fd1498Szrj if (step <= PREFETCH_BLOCK)
845*38fd1498Szrj {
846*38fd1498Szrj /* The accesses are sure to meet. Let us check when. */
847*38fd1498Szrj hit_from = ddown (delta_b, PREFETCH_BLOCK) * PREFETCH_BLOCK;
848*38fd1498Szrj prefetch_before = (hit_from - delta_r + step - 1) / step;
849*38fd1498Szrj
850*38fd1498Szrj /* Do not reduce prefetch_before if we meet beyond cache size. */
851*38fd1498Szrj if (prefetch_before > absu_hwi (L2_CACHE_SIZE_BYTES / step))
852*38fd1498Szrj prefetch_before = PREFETCH_ALL;
853*38fd1498Szrj if (prefetch_before < ref->prefetch_before)
854*38fd1498Szrj ref->prefetch_before = prefetch_before;
855*38fd1498Szrj
856*38fd1498Szrj return;
857*38fd1498Szrj }
858*38fd1498Szrj
859*38fd1498Szrj /* A more complicated case with step > prefetch_block. First reduce
860*38fd1498Szrj the ratio between the step and the cache line size to its simplest
861*38fd1498Szrj terms. The resulting denominator will then represent the number of
862*38fd1498Szrj distinct iterations after which each address will go back to its
863*38fd1498Szrj initial location within the cache line. This computation assumes
864*38fd1498Szrj that PREFETCH_BLOCK is a power of two. */
865*38fd1498Szrj prefetch_block = PREFETCH_BLOCK;
866*38fd1498Szrj reduced_prefetch_block = prefetch_block;
867*38fd1498Szrj reduced_step = step;
868*38fd1498Szrj while ((reduced_step & 1) == 0
869*38fd1498Szrj && reduced_prefetch_block > 1)
870*38fd1498Szrj {
871*38fd1498Szrj reduced_step >>= 1;
872*38fd1498Szrj reduced_prefetch_block >>= 1;
873*38fd1498Szrj }
874*38fd1498Szrj
875*38fd1498Szrj prefetch_before = delta / step;
876*38fd1498Szrj delta %= step;
877*38fd1498Szrj ref_type = TREE_TYPE (ref->mem);
878*38fd1498Szrj align_unit = TYPE_ALIGN (ref_type) / 8;
879*38fd1498Szrj if (is_miss_rate_acceptable (prefetch_block, step, delta,
880*38fd1498Szrj reduced_prefetch_block, align_unit))
881*38fd1498Szrj {
882*38fd1498Szrj /* Do not reduce prefetch_before if we meet beyond cache size. */
883*38fd1498Szrj if (prefetch_before > L2_CACHE_SIZE_BYTES / PREFETCH_BLOCK)
884*38fd1498Szrj prefetch_before = PREFETCH_ALL;
885*38fd1498Szrj if (prefetch_before < ref->prefetch_before)
886*38fd1498Szrj ref->prefetch_before = prefetch_before;
887*38fd1498Szrj
888*38fd1498Szrj return;
889*38fd1498Szrj }
890*38fd1498Szrj
891*38fd1498Szrj /* Try also the following iteration. */
892*38fd1498Szrj prefetch_before++;
893*38fd1498Szrj delta = step - delta;
894*38fd1498Szrj if (is_miss_rate_acceptable (prefetch_block, step, delta,
895*38fd1498Szrj reduced_prefetch_block, align_unit))
896*38fd1498Szrj {
897*38fd1498Szrj if (prefetch_before < ref->prefetch_before)
898*38fd1498Szrj ref->prefetch_before = prefetch_before;
899*38fd1498Szrj
900*38fd1498Szrj return;
901*38fd1498Szrj }
902*38fd1498Szrj
903*38fd1498Szrj /* The ref probably does not reuse by. */
904*38fd1498Szrj return;
905*38fd1498Szrj }
906*38fd1498Szrj
907*38fd1498Szrj /* Prune the prefetch candidate REF using the reuses with other references
908*38fd1498Szrj in REFS. */
909*38fd1498Szrj
910*38fd1498Szrj static void
prune_ref_by_reuse(struct mem_ref * ref,struct mem_ref * refs)911*38fd1498Szrj prune_ref_by_reuse (struct mem_ref *ref, struct mem_ref *refs)
912*38fd1498Szrj {
913*38fd1498Szrj struct mem_ref *prune_by;
914*38fd1498Szrj bool before = true;
915*38fd1498Szrj
916*38fd1498Szrj prune_ref_by_self_reuse (ref);
917*38fd1498Szrj
918*38fd1498Szrj for (prune_by = refs; prune_by; prune_by = prune_by->next)
919*38fd1498Szrj {
920*38fd1498Szrj if (prune_by == ref)
921*38fd1498Szrj {
922*38fd1498Szrj before = false;
923*38fd1498Szrj continue;
924*38fd1498Szrj }
925*38fd1498Szrj
926*38fd1498Szrj if (!WRITE_CAN_USE_READ_PREFETCH
927*38fd1498Szrj && ref->write_p
928*38fd1498Szrj && !prune_by->write_p)
929*38fd1498Szrj continue;
930*38fd1498Szrj if (!READ_CAN_USE_WRITE_PREFETCH
931*38fd1498Szrj && !ref->write_p
932*38fd1498Szrj && prune_by->write_p)
933*38fd1498Szrj continue;
934*38fd1498Szrj
935*38fd1498Szrj prune_ref_by_group_reuse (ref, prune_by, before);
936*38fd1498Szrj }
937*38fd1498Szrj }
938*38fd1498Szrj
939*38fd1498Szrj /* Prune the prefetch candidates in GROUP using the reuse analysis. */
940*38fd1498Szrj
941*38fd1498Szrj static void
prune_group_by_reuse(struct mem_ref_group * group)942*38fd1498Szrj prune_group_by_reuse (struct mem_ref_group *group)
943*38fd1498Szrj {
944*38fd1498Szrj struct mem_ref *ref_pruned;
945*38fd1498Szrj
946*38fd1498Szrj for (ref_pruned = group->refs; ref_pruned; ref_pruned = ref_pruned->next)
947*38fd1498Szrj {
948*38fd1498Szrj prune_ref_by_reuse (ref_pruned, group->refs);
949*38fd1498Szrj
950*38fd1498Szrj if (dump_file && (dump_flags & TDF_DETAILS))
951*38fd1498Szrj {
952*38fd1498Szrj dump_mem_ref (dump_file, ref_pruned);
953*38fd1498Szrj
954*38fd1498Szrj if (ref_pruned->prefetch_before == PREFETCH_ALL
955*38fd1498Szrj && ref_pruned->prefetch_mod == 1)
956*38fd1498Szrj fprintf (dump_file, " no restrictions");
957*38fd1498Szrj else if (ref_pruned->prefetch_before == 0)
958*38fd1498Szrj fprintf (dump_file, " do not prefetch");
959*38fd1498Szrj else if (ref_pruned->prefetch_before <= ref_pruned->prefetch_mod)
960*38fd1498Szrj fprintf (dump_file, " prefetch once");
961*38fd1498Szrj else
962*38fd1498Szrj {
963*38fd1498Szrj if (ref_pruned->prefetch_before != PREFETCH_ALL)
964*38fd1498Szrj {
965*38fd1498Szrj fprintf (dump_file, " prefetch before ");
966*38fd1498Szrj fprintf (dump_file, HOST_WIDE_INT_PRINT_DEC,
967*38fd1498Szrj ref_pruned->prefetch_before);
968*38fd1498Szrj }
969*38fd1498Szrj if (ref_pruned->prefetch_mod != 1)
970*38fd1498Szrj {
971*38fd1498Szrj fprintf (dump_file, " prefetch mod ");
972*38fd1498Szrj fprintf (dump_file, HOST_WIDE_INT_PRINT_DEC,
973*38fd1498Szrj ref_pruned->prefetch_mod);
974*38fd1498Szrj }
975*38fd1498Szrj }
976*38fd1498Szrj fprintf (dump_file, "\n");
977*38fd1498Szrj }
978*38fd1498Szrj }
979*38fd1498Szrj }
980*38fd1498Szrj
981*38fd1498Szrj /* Prune the list of prefetch candidates GROUPS using the reuse analysis. */
982*38fd1498Szrj
983*38fd1498Szrj static void
prune_by_reuse(struct mem_ref_group * groups)984*38fd1498Szrj prune_by_reuse (struct mem_ref_group *groups)
985*38fd1498Szrj {
986*38fd1498Szrj for (; groups; groups = groups->next)
987*38fd1498Szrj prune_group_by_reuse (groups);
988*38fd1498Szrj }
989*38fd1498Szrj
990*38fd1498Szrj /* Returns true if we should issue prefetch for REF. */
991*38fd1498Szrj
992*38fd1498Szrj static bool
should_issue_prefetch_p(struct mem_ref * ref)993*38fd1498Szrj should_issue_prefetch_p (struct mem_ref *ref)
994*38fd1498Szrj {
995*38fd1498Szrj /* For now do not issue prefetches for only first few of the
996*38fd1498Szrj iterations. */
997*38fd1498Szrj if (ref->prefetch_before != PREFETCH_ALL)
998*38fd1498Szrj {
999*38fd1498Szrj if (dump_file && (dump_flags & TDF_DETAILS))
1000*38fd1498Szrj fprintf (dump_file, "Ignoring reference %u:%u due to prefetch_before\n",
1001*38fd1498Szrj ref->group->uid, ref->uid);
1002*38fd1498Szrj return false;
1003*38fd1498Szrj }
1004*38fd1498Szrj
1005*38fd1498Szrj /* Do not prefetch nontemporal stores. */
1006*38fd1498Szrj if (ref->storent_p)
1007*38fd1498Szrj {
1008*38fd1498Szrj if (dump_file && (dump_flags & TDF_DETAILS))
1009*38fd1498Szrj fprintf (dump_file, "Ignoring nontemporal store reference %u:%u\n", ref->group->uid, ref->uid);
1010*38fd1498Szrj return false;
1011*38fd1498Szrj }
1012*38fd1498Szrj
1013*38fd1498Szrj return true;
1014*38fd1498Szrj }
1015*38fd1498Szrj
1016*38fd1498Szrj /* Decide which of the prefetch candidates in GROUPS to prefetch.
1017*38fd1498Szrj AHEAD is the number of iterations to prefetch ahead (which corresponds
1018*38fd1498Szrj to the number of simultaneous instances of one prefetch running at a
1019*38fd1498Szrj time). UNROLL_FACTOR is the factor by that the loop is going to be
1020*38fd1498Szrj unrolled. Returns true if there is anything to prefetch. */
1021*38fd1498Szrj
1022*38fd1498Szrj static bool
schedule_prefetches(struct mem_ref_group * groups,unsigned unroll_factor,unsigned ahead)1023*38fd1498Szrj schedule_prefetches (struct mem_ref_group *groups, unsigned unroll_factor,
1024*38fd1498Szrj unsigned ahead)
1025*38fd1498Szrj {
1026*38fd1498Szrj unsigned remaining_prefetch_slots, n_prefetches, prefetch_slots;
1027*38fd1498Szrj unsigned slots_per_prefetch;
1028*38fd1498Szrj struct mem_ref *ref;
1029*38fd1498Szrj bool any = false;
1030*38fd1498Szrj
1031*38fd1498Szrj /* At most SIMULTANEOUS_PREFETCHES should be running at the same time. */
1032*38fd1498Szrj remaining_prefetch_slots = SIMULTANEOUS_PREFETCHES;
1033*38fd1498Szrj
1034*38fd1498Szrj /* The prefetch will run for AHEAD iterations of the original loop, i.e.,
1035*38fd1498Szrj AHEAD / UNROLL_FACTOR iterations of the unrolled loop. In each iteration,
1036*38fd1498Szrj it will need a prefetch slot. */
1037*38fd1498Szrj slots_per_prefetch = (ahead + unroll_factor / 2) / unroll_factor;
1038*38fd1498Szrj if (dump_file && (dump_flags & TDF_DETAILS))
1039*38fd1498Szrj fprintf (dump_file, "Each prefetch instruction takes %u prefetch slots.\n",
1040*38fd1498Szrj slots_per_prefetch);
1041*38fd1498Szrj
1042*38fd1498Szrj /* For now we just take memory references one by one and issue
1043*38fd1498Szrj prefetches for as many as possible. The groups are sorted
1044*38fd1498Szrj starting with the largest step, since the references with
1045*38fd1498Szrj large step are more likely to cause many cache misses. */
1046*38fd1498Szrj
1047*38fd1498Szrj for (; groups; groups = groups->next)
1048*38fd1498Szrj for (ref = groups->refs; ref; ref = ref->next)
1049*38fd1498Szrj {
1050*38fd1498Szrj if (!should_issue_prefetch_p (ref))
1051*38fd1498Szrj continue;
1052*38fd1498Szrj
1053*38fd1498Szrj /* The loop is far from being sufficiently unrolled for this
1054*38fd1498Szrj prefetch. Do not generate prefetch to avoid many redudant
1055*38fd1498Szrj prefetches. */
1056*38fd1498Szrj if (ref->prefetch_mod / unroll_factor > PREFETCH_MOD_TO_UNROLL_FACTOR_RATIO)
1057*38fd1498Szrj continue;
1058*38fd1498Szrj
1059*38fd1498Szrj /* If we need to prefetch the reference each PREFETCH_MOD iterations,
1060*38fd1498Szrj and we unroll the loop UNROLL_FACTOR times, we need to insert
1061*38fd1498Szrj ceil (UNROLL_FACTOR / PREFETCH_MOD) instructions in each
1062*38fd1498Szrj iteration. */
1063*38fd1498Szrj n_prefetches = ((unroll_factor + ref->prefetch_mod - 1)
1064*38fd1498Szrj / ref->prefetch_mod);
1065*38fd1498Szrj prefetch_slots = n_prefetches * slots_per_prefetch;
1066*38fd1498Szrj
1067*38fd1498Szrj /* If more than half of the prefetches would be lost anyway, do not
1068*38fd1498Szrj issue the prefetch. */
1069*38fd1498Szrj if (2 * remaining_prefetch_slots < prefetch_slots)
1070*38fd1498Szrj continue;
1071*38fd1498Szrj
1072*38fd1498Szrj /* Stop prefetching if debug counter is activated. */
1073*38fd1498Szrj if (!dbg_cnt (prefetch))
1074*38fd1498Szrj continue;
1075*38fd1498Szrj
1076*38fd1498Szrj ref->issue_prefetch_p = true;
1077*38fd1498Szrj if (dump_file && (dump_flags & TDF_DETAILS))
1078*38fd1498Szrj fprintf (dump_file, "Decided to issue prefetch for reference %u:%u\n",
1079*38fd1498Szrj ref->group->uid, ref->uid);
1080*38fd1498Szrj
1081*38fd1498Szrj if (remaining_prefetch_slots <= prefetch_slots)
1082*38fd1498Szrj return true;
1083*38fd1498Szrj remaining_prefetch_slots -= prefetch_slots;
1084*38fd1498Szrj any = true;
1085*38fd1498Szrj }
1086*38fd1498Szrj
1087*38fd1498Szrj return any;
1088*38fd1498Szrj }
1089*38fd1498Szrj
1090*38fd1498Szrj /* Return TRUE if no prefetch is going to be generated in the given
1091*38fd1498Szrj GROUPS. */
1092*38fd1498Szrj
1093*38fd1498Szrj static bool
nothing_to_prefetch_p(struct mem_ref_group * groups)1094*38fd1498Szrj nothing_to_prefetch_p (struct mem_ref_group *groups)
1095*38fd1498Szrj {
1096*38fd1498Szrj struct mem_ref *ref;
1097*38fd1498Szrj
1098*38fd1498Szrj for (; groups; groups = groups->next)
1099*38fd1498Szrj for (ref = groups->refs; ref; ref = ref->next)
1100*38fd1498Szrj if (should_issue_prefetch_p (ref))
1101*38fd1498Szrj return false;
1102*38fd1498Szrj
1103*38fd1498Szrj return true;
1104*38fd1498Szrj }
1105*38fd1498Szrj
1106*38fd1498Szrj /* Estimate the number of prefetches in the given GROUPS.
1107*38fd1498Szrj UNROLL_FACTOR is the factor by which LOOP was unrolled. */
1108*38fd1498Szrj
1109*38fd1498Szrj static int
estimate_prefetch_count(struct mem_ref_group * groups,unsigned unroll_factor)1110*38fd1498Szrj estimate_prefetch_count (struct mem_ref_group *groups, unsigned unroll_factor)
1111*38fd1498Szrj {
1112*38fd1498Szrj struct mem_ref *ref;
1113*38fd1498Szrj unsigned n_prefetches;
1114*38fd1498Szrj int prefetch_count = 0;
1115*38fd1498Szrj
1116*38fd1498Szrj for (; groups; groups = groups->next)
1117*38fd1498Szrj for (ref = groups->refs; ref; ref = ref->next)
1118*38fd1498Szrj if (should_issue_prefetch_p (ref))
1119*38fd1498Szrj {
1120*38fd1498Szrj n_prefetches = ((unroll_factor + ref->prefetch_mod - 1)
1121*38fd1498Szrj / ref->prefetch_mod);
1122*38fd1498Szrj prefetch_count += n_prefetches;
1123*38fd1498Szrj }
1124*38fd1498Szrj
1125*38fd1498Szrj return prefetch_count;
1126*38fd1498Szrj }
1127*38fd1498Szrj
1128*38fd1498Szrj /* Issue prefetches for the reference REF into loop as decided before.
1129*38fd1498Szrj HEAD is the number of iterations to prefetch ahead. UNROLL_FACTOR
1130*38fd1498Szrj is the factor by which LOOP was unrolled. */
1131*38fd1498Szrj
1132*38fd1498Szrj static void
issue_prefetch_ref(struct mem_ref * ref,unsigned unroll_factor,unsigned ahead)1133*38fd1498Szrj issue_prefetch_ref (struct mem_ref *ref, unsigned unroll_factor, unsigned ahead)
1134*38fd1498Szrj {
1135*38fd1498Szrj HOST_WIDE_INT delta;
1136*38fd1498Szrj tree addr, addr_base, write_p, local, forward;
1137*38fd1498Szrj gcall *prefetch;
1138*38fd1498Szrj gimple_stmt_iterator bsi;
1139*38fd1498Szrj unsigned n_prefetches, ap;
1140*38fd1498Szrj bool nontemporal = ref->reuse_distance >= L2_CACHE_SIZE_BYTES;
1141*38fd1498Szrj
1142*38fd1498Szrj if (dump_file && (dump_flags & TDF_DETAILS))
1143*38fd1498Szrj fprintf (dump_file, "Issued%s prefetch for reference %u:%u.\n",
1144*38fd1498Szrj nontemporal ? " nontemporal" : "",
1145*38fd1498Szrj ref->group->uid, ref->uid);
1146*38fd1498Szrj
1147*38fd1498Szrj bsi = gsi_for_stmt (ref->stmt);
1148*38fd1498Szrj
1149*38fd1498Szrj n_prefetches = ((unroll_factor + ref->prefetch_mod - 1)
1150*38fd1498Szrj / ref->prefetch_mod);
1151*38fd1498Szrj addr_base = build_fold_addr_expr_with_type (ref->mem, ptr_type_node);
1152*38fd1498Szrj addr_base = force_gimple_operand_gsi (&bsi, unshare_expr (addr_base),
1153*38fd1498Szrj true, NULL, true, GSI_SAME_STMT);
1154*38fd1498Szrj write_p = ref->write_p ? integer_one_node : integer_zero_node;
1155*38fd1498Szrj local = nontemporal ? integer_zero_node : integer_three_node;
1156*38fd1498Szrj
1157*38fd1498Szrj for (ap = 0; ap < n_prefetches; ap++)
1158*38fd1498Szrj {
1159*38fd1498Szrj if (cst_and_fits_in_hwi (ref->group->step))
1160*38fd1498Szrj {
1161*38fd1498Szrj /* Determine the address to prefetch. */
1162*38fd1498Szrj delta = (ahead + ap * ref->prefetch_mod) *
1163*38fd1498Szrj int_cst_value (ref->group->step);
1164*38fd1498Szrj addr = fold_build_pointer_plus_hwi (addr_base, delta);
1165*38fd1498Szrj addr = force_gimple_operand_gsi (&bsi, unshare_expr (addr), true,
1166*38fd1498Szrj NULL, true, GSI_SAME_STMT);
1167*38fd1498Szrj }
1168*38fd1498Szrj else
1169*38fd1498Szrj {
1170*38fd1498Szrj /* The step size is non-constant but loop-invariant. We use the
1171*38fd1498Szrj heuristic to simply prefetch ahead iterations ahead. */
1172*38fd1498Szrj forward = fold_build2 (MULT_EXPR, sizetype,
1173*38fd1498Szrj fold_convert (sizetype, ref->group->step),
1174*38fd1498Szrj fold_convert (sizetype, size_int (ahead)));
1175*38fd1498Szrj addr = fold_build_pointer_plus (addr_base, forward);
1176*38fd1498Szrj addr = force_gimple_operand_gsi (&bsi, unshare_expr (addr), true,
1177*38fd1498Szrj NULL, true, GSI_SAME_STMT);
1178*38fd1498Szrj }
1179*38fd1498Szrj
1180*38fd1498Szrj if (addr_base != addr
1181*38fd1498Szrj && TREE_CODE (addr_base) == SSA_NAME
1182*38fd1498Szrj && TREE_CODE (addr) == SSA_NAME)
1183*38fd1498Szrj {
1184*38fd1498Szrj duplicate_ssa_name_ptr_info (addr, SSA_NAME_PTR_INFO (addr_base));
1185*38fd1498Szrj /* As this isn't a plain copy we have to reset alignment
1186*38fd1498Szrj information. */
1187*38fd1498Szrj if (SSA_NAME_PTR_INFO (addr))
1188*38fd1498Szrj mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr));
1189*38fd1498Szrj }
1190*38fd1498Szrj
1191*38fd1498Szrj /* Create the prefetch instruction. */
1192*38fd1498Szrj prefetch = gimple_build_call (builtin_decl_explicit (BUILT_IN_PREFETCH),
1193*38fd1498Szrj 3, addr, write_p, local);
1194*38fd1498Szrj gsi_insert_before (&bsi, prefetch, GSI_SAME_STMT);
1195*38fd1498Szrj }
1196*38fd1498Szrj }
1197*38fd1498Szrj
1198*38fd1498Szrj /* Issue prefetches for the references in GROUPS into loop as decided before.
1199*38fd1498Szrj HEAD is the number of iterations to prefetch ahead. UNROLL_FACTOR is the
1200*38fd1498Szrj factor by that LOOP was unrolled. */
1201*38fd1498Szrj
1202*38fd1498Szrj static void
issue_prefetches(struct mem_ref_group * groups,unsigned unroll_factor,unsigned ahead)1203*38fd1498Szrj issue_prefetches (struct mem_ref_group *groups,
1204*38fd1498Szrj unsigned unroll_factor, unsigned ahead)
1205*38fd1498Szrj {
1206*38fd1498Szrj struct mem_ref *ref;
1207*38fd1498Szrj
1208*38fd1498Szrj for (; groups; groups = groups->next)
1209*38fd1498Szrj for (ref = groups->refs; ref; ref = ref->next)
1210*38fd1498Szrj if (ref->issue_prefetch_p)
1211*38fd1498Szrj issue_prefetch_ref (ref, unroll_factor, ahead);
1212*38fd1498Szrj }
1213*38fd1498Szrj
1214*38fd1498Szrj /* Returns true if REF is a memory write for that a nontemporal store insn
1215*38fd1498Szrj can be used. */
1216*38fd1498Szrj
1217*38fd1498Szrj static bool
nontemporal_store_p(struct mem_ref * ref)1218*38fd1498Szrj nontemporal_store_p (struct mem_ref *ref)
1219*38fd1498Szrj {
1220*38fd1498Szrj machine_mode mode;
1221*38fd1498Szrj enum insn_code code;
1222*38fd1498Szrj
1223*38fd1498Szrj /* REF must be a write that is not reused. We require it to be independent
1224*38fd1498Szrj on all other memory references in the loop, as the nontemporal stores may
1225*38fd1498Szrj be reordered with respect to other memory references. */
1226*38fd1498Szrj if (!ref->write_p
1227*38fd1498Szrj || !ref->independent_p
1228*38fd1498Szrj || ref->reuse_distance < L2_CACHE_SIZE_BYTES)
1229*38fd1498Szrj return false;
1230*38fd1498Szrj
1231*38fd1498Szrj /* Check that we have the storent instruction for the mode. */
1232*38fd1498Szrj mode = TYPE_MODE (TREE_TYPE (ref->mem));
1233*38fd1498Szrj if (mode == BLKmode)
1234*38fd1498Szrj return false;
1235*38fd1498Szrj
1236*38fd1498Szrj code = optab_handler (storent_optab, mode);
1237*38fd1498Szrj return code != CODE_FOR_nothing;
1238*38fd1498Szrj }
1239*38fd1498Szrj
1240*38fd1498Szrj /* If REF is a nontemporal store, we mark the corresponding modify statement
1241*38fd1498Szrj and return true. Otherwise, we return false. */
1242*38fd1498Szrj
1243*38fd1498Szrj static bool
mark_nontemporal_store(struct mem_ref * ref)1244*38fd1498Szrj mark_nontemporal_store (struct mem_ref *ref)
1245*38fd1498Szrj {
1246*38fd1498Szrj if (!nontemporal_store_p (ref))
1247*38fd1498Szrj return false;
1248*38fd1498Szrj
1249*38fd1498Szrj if (dump_file && (dump_flags & TDF_DETAILS))
1250*38fd1498Szrj fprintf (dump_file, "Marked reference %u:%u as a nontemporal store.\n",
1251*38fd1498Szrj ref->group->uid, ref->uid);
1252*38fd1498Szrj
1253*38fd1498Szrj gimple_assign_set_nontemporal_move (ref->stmt, true);
1254*38fd1498Szrj ref->storent_p = true;
1255*38fd1498Szrj
1256*38fd1498Szrj return true;
1257*38fd1498Szrj }
1258*38fd1498Szrj
1259*38fd1498Szrj /* Issue a memory fence instruction after LOOP. */
1260*38fd1498Szrj
1261*38fd1498Szrj static void
emit_mfence_after_loop(struct loop * loop)1262*38fd1498Szrj emit_mfence_after_loop (struct loop *loop)
1263*38fd1498Szrj {
1264*38fd1498Szrj vec<edge> exits = get_loop_exit_edges (loop);
1265*38fd1498Szrj edge exit;
1266*38fd1498Szrj gcall *call;
1267*38fd1498Szrj gimple_stmt_iterator bsi;
1268*38fd1498Szrj unsigned i;
1269*38fd1498Szrj
1270*38fd1498Szrj FOR_EACH_VEC_ELT (exits, i, exit)
1271*38fd1498Szrj {
1272*38fd1498Szrj call = gimple_build_call (FENCE_FOLLOWING_MOVNT, 0);
1273*38fd1498Szrj
1274*38fd1498Szrj if (!single_pred_p (exit->dest)
1275*38fd1498Szrj /* If possible, we prefer not to insert the fence on other paths
1276*38fd1498Szrj in cfg. */
1277*38fd1498Szrj && !(exit->flags & EDGE_ABNORMAL))
1278*38fd1498Szrj split_loop_exit_edge (exit);
1279*38fd1498Szrj bsi = gsi_after_labels (exit->dest);
1280*38fd1498Szrj
1281*38fd1498Szrj gsi_insert_before (&bsi, call, GSI_NEW_STMT);
1282*38fd1498Szrj }
1283*38fd1498Szrj
1284*38fd1498Szrj exits.release ();
1285*38fd1498Szrj update_ssa (TODO_update_ssa_only_virtuals);
1286*38fd1498Szrj }
1287*38fd1498Szrj
1288*38fd1498Szrj /* Returns true if we can use storent in loop, false otherwise. */
1289*38fd1498Szrj
1290*38fd1498Szrj static bool
may_use_storent_in_loop_p(struct loop * loop)1291*38fd1498Szrj may_use_storent_in_loop_p (struct loop *loop)
1292*38fd1498Szrj {
1293*38fd1498Szrj bool ret = true;
1294*38fd1498Szrj
1295*38fd1498Szrj if (loop->inner != NULL)
1296*38fd1498Szrj return false;
1297*38fd1498Szrj
1298*38fd1498Szrj /* If we must issue a mfence insn after using storent, check that there
1299*38fd1498Szrj is a suitable place for it at each of the loop exits. */
1300*38fd1498Szrj if (FENCE_FOLLOWING_MOVNT != NULL_TREE)
1301*38fd1498Szrj {
1302*38fd1498Szrj vec<edge> exits = get_loop_exit_edges (loop);
1303*38fd1498Szrj unsigned i;
1304*38fd1498Szrj edge exit;
1305*38fd1498Szrj
1306*38fd1498Szrj FOR_EACH_VEC_ELT (exits, i, exit)
1307*38fd1498Szrj if ((exit->flags & EDGE_ABNORMAL)
1308*38fd1498Szrj && exit->dest == EXIT_BLOCK_PTR_FOR_FN (cfun))
1309*38fd1498Szrj ret = false;
1310*38fd1498Szrj
1311*38fd1498Szrj exits.release ();
1312*38fd1498Szrj }
1313*38fd1498Szrj
1314*38fd1498Szrj return ret;
1315*38fd1498Szrj }
1316*38fd1498Szrj
1317*38fd1498Szrj /* Marks nontemporal stores in LOOP. GROUPS contains the description of memory
1318*38fd1498Szrj references in the loop. */
1319*38fd1498Szrj
1320*38fd1498Szrj static void
mark_nontemporal_stores(struct loop * loop,struct mem_ref_group * groups)1321*38fd1498Szrj mark_nontemporal_stores (struct loop *loop, struct mem_ref_group *groups)
1322*38fd1498Szrj {
1323*38fd1498Szrj struct mem_ref *ref;
1324*38fd1498Szrj bool any = false;
1325*38fd1498Szrj
1326*38fd1498Szrj if (!may_use_storent_in_loop_p (loop))
1327*38fd1498Szrj return;
1328*38fd1498Szrj
1329*38fd1498Szrj for (; groups; groups = groups->next)
1330*38fd1498Szrj for (ref = groups->refs; ref; ref = ref->next)
1331*38fd1498Szrj any |= mark_nontemporal_store (ref);
1332*38fd1498Szrj
1333*38fd1498Szrj if (any && FENCE_FOLLOWING_MOVNT != NULL_TREE)
1334*38fd1498Szrj emit_mfence_after_loop (loop);
1335*38fd1498Szrj }
1336*38fd1498Szrj
1337*38fd1498Szrj /* Determines whether we can profitably unroll LOOP FACTOR times, and if
1338*38fd1498Szrj this is the case, fill in DESC by the description of number of
1339*38fd1498Szrj iterations. */
1340*38fd1498Szrj
1341*38fd1498Szrj static bool
should_unroll_loop_p(struct loop * loop,struct tree_niter_desc * desc,unsigned factor)1342*38fd1498Szrj should_unroll_loop_p (struct loop *loop, struct tree_niter_desc *desc,
1343*38fd1498Szrj unsigned factor)
1344*38fd1498Szrj {
1345*38fd1498Szrj if (!can_unroll_loop_p (loop, factor, desc))
1346*38fd1498Szrj return false;
1347*38fd1498Szrj
1348*38fd1498Szrj /* We only consider loops without control flow for unrolling. This is not
1349*38fd1498Szrj a hard restriction -- tree_unroll_loop works with arbitrary loops
1350*38fd1498Szrj as well; but the unrolling/prefetching is usually more profitable for
1351*38fd1498Szrj loops consisting of a single basic block, and we want to limit the
1352*38fd1498Szrj code growth. */
1353*38fd1498Szrj if (loop->num_nodes > 2)
1354*38fd1498Szrj return false;
1355*38fd1498Szrj
1356*38fd1498Szrj return true;
1357*38fd1498Szrj }
1358*38fd1498Szrj
1359*38fd1498Szrj /* Determine the coefficient by that unroll LOOP, from the information
1360*38fd1498Szrj contained in the list of memory references REFS. Description of
1361*38fd1498Szrj number of iterations of LOOP is stored to DESC. NINSNS is the number of
1362*38fd1498Szrj insns of the LOOP. EST_NITER is the estimated number of iterations of
1363*38fd1498Szrj the loop, or -1 if no estimate is available. */
1364*38fd1498Szrj
1365*38fd1498Szrj static unsigned
determine_unroll_factor(struct loop * loop,struct mem_ref_group * refs,unsigned ninsns,struct tree_niter_desc * desc,HOST_WIDE_INT est_niter)1366*38fd1498Szrj determine_unroll_factor (struct loop *loop, struct mem_ref_group *refs,
1367*38fd1498Szrj unsigned ninsns, struct tree_niter_desc *desc,
1368*38fd1498Szrj HOST_WIDE_INT est_niter)
1369*38fd1498Szrj {
1370*38fd1498Szrj unsigned upper_bound;
1371*38fd1498Szrj unsigned nfactor, factor, mod_constraint;
1372*38fd1498Szrj struct mem_ref_group *agp;
1373*38fd1498Szrj struct mem_ref *ref;
1374*38fd1498Szrj
1375*38fd1498Szrj /* First check whether the loop is not too large to unroll. We ignore
1376*38fd1498Szrj PARAM_MAX_UNROLL_TIMES, because for small loops, it prevented us
1377*38fd1498Szrj from unrolling them enough to make exactly one cache line covered by each
1378*38fd1498Szrj iteration. Also, the goal of PARAM_MAX_UNROLL_TIMES is to prevent
1379*38fd1498Szrj us from unrolling the loops too many times in cases where we only expect
1380*38fd1498Szrj gains from better scheduling and decreasing loop overhead, which is not
1381*38fd1498Szrj the case here. */
1382*38fd1498Szrj upper_bound = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / ninsns;
1383*38fd1498Szrj
1384*38fd1498Szrj /* If we unrolled the loop more times than it iterates, the unrolled version
1385*38fd1498Szrj of the loop would be never entered. */
1386*38fd1498Szrj if (est_niter >= 0 && est_niter < (HOST_WIDE_INT) upper_bound)
1387*38fd1498Szrj upper_bound = est_niter;
1388*38fd1498Szrj
1389*38fd1498Szrj if (upper_bound <= 1)
1390*38fd1498Szrj return 1;
1391*38fd1498Szrj
1392*38fd1498Szrj /* Choose the factor so that we may prefetch each cache just once,
1393*38fd1498Szrj but bound the unrolling by UPPER_BOUND. */
1394*38fd1498Szrj factor = 1;
1395*38fd1498Szrj for (agp = refs; agp; agp = agp->next)
1396*38fd1498Szrj for (ref = agp->refs; ref; ref = ref->next)
1397*38fd1498Szrj if (should_issue_prefetch_p (ref))
1398*38fd1498Szrj {
1399*38fd1498Szrj mod_constraint = ref->prefetch_mod;
1400*38fd1498Szrj nfactor = least_common_multiple (mod_constraint, factor);
1401*38fd1498Szrj if (nfactor <= upper_bound)
1402*38fd1498Szrj factor = nfactor;
1403*38fd1498Szrj }
1404*38fd1498Szrj
1405*38fd1498Szrj if (!should_unroll_loop_p (loop, desc, factor))
1406*38fd1498Szrj return 1;
1407*38fd1498Szrj
1408*38fd1498Szrj return factor;
1409*38fd1498Szrj }
1410*38fd1498Szrj
1411*38fd1498Szrj /* Returns the total volume of the memory references REFS, taking into account
1412*38fd1498Szrj reuses in the innermost loop and cache line size. TODO -- we should also
1413*38fd1498Szrj take into account reuses across the iterations of the loops in the loop
1414*38fd1498Szrj nest. */
1415*38fd1498Szrj
1416*38fd1498Szrj static unsigned
volume_of_references(struct mem_ref_group * refs)1417*38fd1498Szrj volume_of_references (struct mem_ref_group *refs)
1418*38fd1498Szrj {
1419*38fd1498Szrj unsigned volume = 0;
1420*38fd1498Szrj struct mem_ref_group *gr;
1421*38fd1498Szrj struct mem_ref *ref;
1422*38fd1498Szrj
1423*38fd1498Szrj for (gr = refs; gr; gr = gr->next)
1424*38fd1498Szrj for (ref = gr->refs; ref; ref = ref->next)
1425*38fd1498Szrj {
1426*38fd1498Szrj /* Almost always reuses another value? */
1427*38fd1498Szrj if (ref->prefetch_before != PREFETCH_ALL)
1428*38fd1498Szrj continue;
1429*38fd1498Szrj
1430*38fd1498Szrj /* If several iterations access the same cache line, use the size of
1431*38fd1498Szrj the line divided by this number. Otherwise, a cache line is
1432*38fd1498Szrj accessed in each iteration. TODO -- in the latter case, we should
1433*38fd1498Szrj take the size of the reference into account, rounding it up on cache
1434*38fd1498Szrj line size multiple. */
1435*38fd1498Szrj volume += L1_CACHE_LINE_SIZE / ref->prefetch_mod;
1436*38fd1498Szrj }
1437*38fd1498Szrj return volume;
1438*38fd1498Szrj }
1439*38fd1498Szrj
1440*38fd1498Szrj /* Returns the volume of memory references accessed across VEC iterations of
1441*38fd1498Szrj loops, whose sizes are described in the LOOP_SIZES array. N is the number
1442*38fd1498Szrj of the loops in the nest (length of VEC and LOOP_SIZES vectors). */
1443*38fd1498Szrj
1444*38fd1498Szrj static unsigned
volume_of_dist_vector(lambda_vector vec,unsigned * loop_sizes,unsigned n)1445*38fd1498Szrj volume_of_dist_vector (lambda_vector vec, unsigned *loop_sizes, unsigned n)
1446*38fd1498Szrj {
1447*38fd1498Szrj unsigned i;
1448*38fd1498Szrj
1449*38fd1498Szrj for (i = 0; i < n; i++)
1450*38fd1498Szrj if (vec[i] != 0)
1451*38fd1498Szrj break;
1452*38fd1498Szrj
1453*38fd1498Szrj if (i == n)
1454*38fd1498Szrj return 0;
1455*38fd1498Szrj
1456*38fd1498Szrj gcc_assert (vec[i] > 0);
1457*38fd1498Szrj
1458*38fd1498Szrj /* We ignore the parts of the distance vector in subloops, since usually
1459*38fd1498Szrj the numbers of iterations are much smaller. */
1460*38fd1498Szrj return loop_sizes[i] * vec[i];
1461*38fd1498Szrj }
1462*38fd1498Szrj
1463*38fd1498Szrj /* Add the steps of ACCESS_FN multiplied by STRIDE to the array STRIDE
1464*38fd1498Szrj at the position corresponding to the loop of the step. N is the depth
1465*38fd1498Szrj of the considered loop nest, and, LOOP is its innermost loop. */
1466*38fd1498Szrj
1467*38fd1498Szrj static void
add_subscript_strides(tree access_fn,unsigned stride,HOST_WIDE_INT * strides,unsigned n,struct loop * loop)1468*38fd1498Szrj add_subscript_strides (tree access_fn, unsigned stride,
1469*38fd1498Szrj HOST_WIDE_INT *strides, unsigned n, struct loop *loop)
1470*38fd1498Szrj {
1471*38fd1498Szrj struct loop *aloop;
1472*38fd1498Szrj tree step;
1473*38fd1498Szrj HOST_WIDE_INT astep;
1474*38fd1498Szrj unsigned min_depth = loop_depth (loop) - n;
1475*38fd1498Szrj
1476*38fd1498Szrj while (TREE_CODE (access_fn) == POLYNOMIAL_CHREC)
1477*38fd1498Szrj {
1478*38fd1498Szrj aloop = get_chrec_loop (access_fn);
1479*38fd1498Szrj step = CHREC_RIGHT (access_fn);
1480*38fd1498Szrj access_fn = CHREC_LEFT (access_fn);
1481*38fd1498Szrj
1482*38fd1498Szrj if ((unsigned) loop_depth (aloop) <= min_depth)
1483*38fd1498Szrj continue;
1484*38fd1498Szrj
1485*38fd1498Szrj if (tree_fits_shwi_p (step))
1486*38fd1498Szrj astep = tree_to_shwi (step);
1487*38fd1498Szrj else
1488*38fd1498Szrj astep = L1_CACHE_LINE_SIZE;
1489*38fd1498Szrj
1490*38fd1498Szrj strides[n - 1 - loop_depth (loop) + loop_depth (aloop)] += astep * stride;
1491*38fd1498Szrj
1492*38fd1498Szrj }
1493*38fd1498Szrj }
1494*38fd1498Szrj
1495*38fd1498Szrj /* Returns the volume of memory references accessed between two consecutive
1496*38fd1498Szrj self-reuses of the reference DR. We consider the subscripts of DR in N
1497*38fd1498Szrj loops, and LOOP_SIZES contains the volumes of accesses in each of the
1498*38fd1498Szrj loops. LOOP is the innermost loop of the current loop nest. */
1499*38fd1498Szrj
1500*38fd1498Szrj static unsigned
self_reuse_distance(data_reference_p dr,unsigned * loop_sizes,unsigned n,struct loop * loop)1501*38fd1498Szrj self_reuse_distance (data_reference_p dr, unsigned *loop_sizes, unsigned n,
1502*38fd1498Szrj struct loop *loop)
1503*38fd1498Szrj {
1504*38fd1498Szrj tree stride, access_fn;
1505*38fd1498Szrj HOST_WIDE_INT *strides, astride;
1506*38fd1498Szrj vec<tree> access_fns;
1507*38fd1498Szrj tree ref = DR_REF (dr);
1508*38fd1498Szrj unsigned i, ret = ~0u;
1509*38fd1498Szrj
1510*38fd1498Szrj /* In the following example:
1511*38fd1498Szrj
1512*38fd1498Szrj for (i = 0; i < N; i++)
1513*38fd1498Szrj for (j = 0; j < N; j++)
1514*38fd1498Szrj use (a[j][i]);
1515*38fd1498Szrj the same cache line is accessed each N steps (except if the change from
1516*38fd1498Szrj i to i + 1 crosses the boundary of the cache line). Thus, for self-reuse,
1517*38fd1498Szrj we cannot rely purely on the results of the data dependence analysis.
1518*38fd1498Szrj
1519*38fd1498Szrj Instead, we compute the stride of the reference in each loop, and consider
1520*38fd1498Szrj the innermost loop in that the stride is less than cache size. */
1521*38fd1498Szrj
1522*38fd1498Szrj strides = XCNEWVEC (HOST_WIDE_INT, n);
1523*38fd1498Szrj access_fns = DR_ACCESS_FNS (dr);
1524*38fd1498Szrj
1525*38fd1498Szrj FOR_EACH_VEC_ELT (access_fns, i, access_fn)
1526*38fd1498Szrj {
1527*38fd1498Szrj /* Keep track of the reference corresponding to the subscript, so that we
1528*38fd1498Szrj know its stride. */
1529*38fd1498Szrj while (handled_component_p (ref) && TREE_CODE (ref) != ARRAY_REF)
1530*38fd1498Szrj ref = TREE_OPERAND (ref, 0);
1531*38fd1498Szrj
1532*38fd1498Szrj if (TREE_CODE (ref) == ARRAY_REF)
1533*38fd1498Szrj {
1534*38fd1498Szrj stride = TYPE_SIZE_UNIT (TREE_TYPE (ref));
1535*38fd1498Szrj if (tree_fits_uhwi_p (stride))
1536*38fd1498Szrj astride = tree_to_uhwi (stride);
1537*38fd1498Szrj else
1538*38fd1498Szrj astride = L1_CACHE_LINE_SIZE;
1539*38fd1498Szrj
1540*38fd1498Szrj ref = TREE_OPERAND (ref, 0);
1541*38fd1498Szrj }
1542*38fd1498Szrj else
1543*38fd1498Szrj astride = 1;
1544*38fd1498Szrj
1545*38fd1498Szrj add_subscript_strides (access_fn, astride, strides, n, loop);
1546*38fd1498Szrj }
1547*38fd1498Szrj
1548*38fd1498Szrj for (i = n; i-- > 0; )
1549*38fd1498Szrj {
1550*38fd1498Szrj unsigned HOST_WIDE_INT s;
1551*38fd1498Szrj
1552*38fd1498Szrj s = strides[i] < 0 ? -strides[i] : strides[i];
1553*38fd1498Szrj
1554*38fd1498Szrj if (s < (unsigned) L1_CACHE_LINE_SIZE
1555*38fd1498Szrj && (loop_sizes[i]
1556*38fd1498Szrj > (unsigned) (L1_CACHE_SIZE_BYTES / NONTEMPORAL_FRACTION)))
1557*38fd1498Szrj {
1558*38fd1498Szrj ret = loop_sizes[i];
1559*38fd1498Szrj break;
1560*38fd1498Szrj }
1561*38fd1498Szrj }
1562*38fd1498Szrj
1563*38fd1498Szrj free (strides);
1564*38fd1498Szrj return ret;
1565*38fd1498Szrj }
1566*38fd1498Szrj
1567*38fd1498Szrj /* Determines the distance till the first reuse of each reference in REFS
1568*38fd1498Szrj in the loop nest of LOOP. NO_OTHER_REFS is true if there are no other
1569*38fd1498Szrj memory references in the loop. Return false if the analysis fails. */
1570*38fd1498Szrj
1571*38fd1498Szrj static bool
determine_loop_nest_reuse(struct loop * loop,struct mem_ref_group * refs,bool no_other_refs)1572*38fd1498Szrj determine_loop_nest_reuse (struct loop *loop, struct mem_ref_group *refs,
1573*38fd1498Szrj bool no_other_refs)
1574*38fd1498Szrj {
1575*38fd1498Szrj struct loop *nest, *aloop;
1576*38fd1498Szrj vec<data_reference_p> datarefs = vNULL;
1577*38fd1498Szrj vec<ddr_p> dependences = vNULL;
1578*38fd1498Szrj struct mem_ref_group *gr;
1579*38fd1498Szrj struct mem_ref *ref, *refb;
1580*38fd1498Szrj auto_vec<loop_p> vloops;
1581*38fd1498Szrj unsigned *loop_data_size;
1582*38fd1498Szrj unsigned i, j, n;
1583*38fd1498Szrj unsigned volume, dist, adist;
1584*38fd1498Szrj HOST_WIDE_INT vol;
1585*38fd1498Szrj data_reference_p dr;
1586*38fd1498Szrj ddr_p dep;
1587*38fd1498Szrj
1588*38fd1498Szrj if (loop->inner)
1589*38fd1498Szrj return true;
1590*38fd1498Szrj
1591*38fd1498Szrj /* Find the outermost loop of the loop nest of loop (we require that
1592*38fd1498Szrj there are no sibling loops inside the nest). */
1593*38fd1498Szrj nest = loop;
1594*38fd1498Szrj while (1)
1595*38fd1498Szrj {
1596*38fd1498Szrj aloop = loop_outer (nest);
1597*38fd1498Szrj
1598*38fd1498Szrj if (aloop == current_loops->tree_root
1599*38fd1498Szrj || aloop->inner->next)
1600*38fd1498Szrj break;
1601*38fd1498Szrj
1602*38fd1498Szrj nest = aloop;
1603*38fd1498Szrj }
1604*38fd1498Szrj
1605*38fd1498Szrj /* For each loop, determine the amount of data accessed in each iteration.
1606*38fd1498Szrj We use this to estimate whether the reference is evicted from the
1607*38fd1498Szrj cache before its reuse. */
1608*38fd1498Szrj find_loop_nest (nest, &vloops);
1609*38fd1498Szrj n = vloops.length ();
1610*38fd1498Szrj loop_data_size = XNEWVEC (unsigned, n);
1611*38fd1498Szrj volume = volume_of_references (refs);
1612*38fd1498Szrj i = n;
1613*38fd1498Szrj while (i-- != 0)
1614*38fd1498Szrj {
1615*38fd1498Szrj loop_data_size[i] = volume;
1616*38fd1498Szrj /* Bound the volume by the L2 cache size, since above this bound,
1617*38fd1498Szrj all dependence distances are equivalent. */
1618*38fd1498Szrj if (volume > L2_CACHE_SIZE_BYTES)
1619*38fd1498Szrj continue;
1620*38fd1498Szrj
1621*38fd1498Szrj aloop = vloops[i];
1622*38fd1498Szrj vol = estimated_stmt_executions_int (aloop);
1623*38fd1498Szrj if (vol == -1)
1624*38fd1498Szrj vol = expected_loop_iterations (aloop);
1625*38fd1498Szrj volume *= vol;
1626*38fd1498Szrj }
1627*38fd1498Szrj
1628*38fd1498Szrj /* Prepare the references in the form suitable for data dependence
1629*38fd1498Szrj analysis. We ignore unanalyzable data references (the results
1630*38fd1498Szrj are used just as a heuristics to estimate temporality of the
1631*38fd1498Szrj references, hence we do not need to worry about correctness). */
1632*38fd1498Szrj for (gr = refs; gr; gr = gr->next)
1633*38fd1498Szrj for (ref = gr->refs; ref; ref = ref->next)
1634*38fd1498Szrj {
1635*38fd1498Szrj dr = create_data_ref (loop_preheader_edge (nest),
1636*38fd1498Szrj loop_containing_stmt (ref->stmt),
1637*38fd1498Szrj ref->mem, ref->stmt, !ref->write_p, false);
1638*38fd1498Szrj
1639*38fd1498Szrj if (dr)
1640*38fd1498Szrj {
1641*38fd1498Szrj ref->reuse_distance = volume;
1642*38fd1498Szrj dr->aux = ref;
1643*38fd1498Szrj datarefs.safe_push (dr);
1644*38fd1498Szrj }
1645*38fd1498Szrj else
1646*38fd1498Szrj no_other_refs = false;
1647*38fd1498Szrj }
1648*38fd1498Szrj
1649*38fd1498Szrj FOR_EACH_VEC_ELT (datarefs, i, dr)
1650*38fd1498Szrj {
1651*38fd1498Szrj dist = self_reuse_distance (dr, loop_data_size, n, loop);
1652*38fd1498Szrj ref = (struct mem_ref *) dr->aux;
1653*38fd1498Szrj if (ref->reuse_distance > dist)
1654*38fd1498Szrj ref->reuse_distance = dist;
1655*38fd1498Szrj
1656*38fd1498Szrj if (no_other_refs)
1657*38fd1498Szrj ref->independent_p = true;
1658*38fd1498Szrj }
1659*38fd1498Szrj
1660*38fd1498Szrj if (!compute_all_dependences (datarefs, &dependences, vloops, true))
1661*38fd1498Szrj return false;
1662*38fd1498Szrj
1663*38fd1498Szrj FOR_EACH_VEC_ELT (dependences, i, dep)
1664*38fd1498Szrj {
1665*38fd1498Szrj if (DDR_ARE_DEPENDENT (dep) == chrec_known)
1666*38fd1498Szrj continue;
1667*38fd1498Szrj
1668*38fd1498Szrj ref = (struct mem_ref *) DDR_A (dep)->aux;
1669*38fd1498Szrj refb = (struct mem_ref *) DDR_B (dep)->aux;
1670*38fd1498Szrj
1671*38fd1498Szrj if (DDR_ARE_DEPENDENT (dep) == chrec_dont_know
1672*38fd1498Szrj || DDR_COULD_BE_INDEPENDENT_P (dep)
1673*38fd1498Szrj || DDR_NUM_DIST_VECTS (dep) == 0)
1674*38fd1498Szrj {
1675*38fd1498Szrj /* If the dependence cannot be analyzed, assume that there might be
1676*38fd1498Szrj a reuse. */
1677*38fd1498Szrj dist = 0;
1678*38fd1498Szrj
1679*38fd1498Szrj ref->independent_p = false;
1680*38fd1498Szrj refb->independent_p = false;
1681*38fd1498Szrj }
1682*38fd1498Szrj else
1683*38fd1498Szrj {
1684*38fd1498Szrj /* The distance vectors are normalized to be always lexicographically
1685*38fd1498Szrj positive, hence we cannot tell just from them whether DDR_A comes
1686*38fd1498Szrj before DDR_B or vice versa. However, it is not important,
1687*38fd1498Szrj anyway -- if DDR_A is close to DDR_B, then it is either reused in
1688*38fd1498Szrj DDR_B (and it is not nontemporal), or it reuses the value of DDR_B
1689*38fd1498Szrj in cache (and marking it as nontemporal would not affect
1690*38fd1498Szrj anything). */
1691*38fd1498Szrj
1692*38fd1498Szrj dist = volume;
1693*38fd1498Szrj for (j = 0; j < DDR_NUM_DIST_VECTS (dep); j++)
1694*38fd1498Szrj {
1695*38fd1498Szrj adist = volume_of_dist_vector (DDR_DIST_VECT (dep, j),
1696*38fd1498Szrj loop_data_size, n);
1697*38fd1498Szrj
1698*38fd1498Szrj /* If this is a dependence in the innermost loop (i.e., the
1699*38fd1498Szrj distances in all superloops are zero) and it is not
1700*38fd1498Szrj the trivial self-dependence with distance zero, record that
1701*38fd1498Szrj the references are not completely independent. */
1702*38fd1498Szrj if (lambda_vector_zerop (DDR_DIST_VECT (dep, j), n - 1)
1703*38fd1498Szrj && (ref != refb
1704*38fd1498Szrj || DDR_DIST_VECT (dep, j)[n-1] != 0))
1705*38fd1498Szrj {
1706*38fd1498Szrj ref->independent_p = false;
1707*38fd1498Szrj refb->independent_p = false;
1708*38fd1498Szrj }
1709*38fd1498Szrj
1710*38fd1498Szrj /* Ignore accesses closer than
1711*38fd1498Szrj L1_CACHE_SIZE_BYTES / NONTEMPORAL_FRACTION,
1712*38fd1498Szrj so that we use nontemporal prefetches e.g. if single memory
1713*38fd1498Szrj location is accessed several times in a single iteration of
1714*38fd1498Szrj the loop. */
1715*38fd1498Szrj if (adist < L1_CACHE_SIZE_BYTES / NONTEMPORAL_FRACTION)
1716*38fd1498Szrj continue;
1717*38fd1498Szrj
1718*38fd1498Szrj if (adist < dist)
1719*38fd1498Szrj dist = adist;
1720*38fd1498Szrj }
1721*38fd1498Szrj }
1722*38fd1498Szrj
1723*38fd1498Szrj if (ref->reuse_distance > dist)
1724*38fd1498Szrj ref->reuse_distance = dist;
1725*38fd1498Szrj if (refb->reuse_distance > dist)
1726*38fd1498Szrj refb->reuse_distance = dist;
1727*38fd1498Szrj }
1728*38fd1498Szrj
1729*38fd1498Szrj free_dependence_relations (dependences);
1730*38fd1498Szrj free_data_refs (datarefs);
1731*38fd1498Szrj free (loop_data_size);
1732*38fd1498Szrj
1733*38fd1498Szrj if (dump_file && (dump_flags & TDF_DETAILS))
1734*38fd1498Szrj {
1735*38fd1498Szrj fprintf (dump_file, "Reuse distances:\n");
1736*38fd1498Szrj for (gr = refs; gr; gr = gr->next)
1737*38fd1498Szrj for (ref = gr->refs; ref; ref = ref->next)
1738*38fd1498Szrj fprintf (dump_file, " reference %u:%u distance %u\n",
1739*38fd1498Szrj ref->group->uid, ref->uid, ref->reuse_distance);
1740*38fd1498Szrj }
1741*38fd1498Szrj
1742*38fd1498Szrj return true;
1743*38fd1498Szrj }
1744*38fd1498Szrj
1745*38fd1498Szrj /* Determine whether or not the trip count to ahead ratio is too small based
1746*38fd1498Szrj on prefitablility consideration.
1747*38fd1498Szrj AHEAD: the iteration ahead distance,
1748*38fd1498Szrj EST_NITER: the estimated trip count. */
1749*38fd1498Szrj
1750*38fd1498Szrj static bool
trip_count_to_ahead_ratio_too_small_p(unsigned ahead,HOST_WIDE_INT est_niter)1751*38fd1498Szrj trip_count_to_ahead_ratio_too_small_p (unsigned ahead, HOST_WIDE_INT est_niter)
1752*38fd1498Szrj {
1753*38fd1498Szrj /* Assume trip count to ahead ratio is big enough if the trip count could not
1754*38fd1498Szrj be estimated at compile time. */
1755*38fd1498Szrj if (est_niter < 0)
1756*38fd1498Szrj return false;
1757*38fd1498Szrj
1758*38fd1498Szrj if (est_niter < (HOST_WIDE_INT) (TRIP_COUNT_TO_AHEAD_RATIO * ahead))
1759*38fd1498Szrj {
1760*38fd1498Szrj if (dump_file && (dump_flags & TDF_DETAILS))
1761*38fd1498Szrj fprintf (dump_file,
1762*38fd1498Szrj "Not prefetching -- loop estimated to roll only %d times\n",
1763*38fd1498Szrj (int) est_niter);
1764*38fd1498Szrj return true;
1765*38fd1498Szrj }
1766*38fd1498Szrj
1767*38fd1498Szrj return false;
1768*38fd1498Szrj }
1769*38fd1498Szrj
1770*38fd1498Szrj /* Determine whether or not the number of memory references in the loop is
1771*38fd1498Szrj reasonable based on the profitablity and compilation time considerations.
1772*38fd1498Szrj NINSNS: estimated number of instructions in the loop,
1773*38fd1498Szrj MEM_REF_COUNT: total number of memory references in the loop. */
1774*38fd1498Szrj
1775*38fd1498Szrj static bool
mem_ref_count_reasonable_p(unsigned ninsns,unsigned mem_ref_count)1776*38fd1498Szrj mem_ref_count_reasonable_p (unsigned ninsns, unsigned mem_ref_count)
1777*38fd1498Szrj {
1778*38fd1498Szrj int insn_to_mem_ratio;
1779*38fd1498Szrj
1780*38fd1498Szrj if (mem_ref_count == 0)
1781*38fd1498Szrj return false;
1782*38fd1498Szrj
1783*38fd1498Szrj /* Miss rate computation (is_miss_rate_acceptable) and dependence analysis
1784*38fd1498Szrj (compute_all_dependences) have high costs based on quadratic complexity.
1785*38fd1498Szrj To avoid huge compilation time, we give up prefetching if mem_ref_count
1786*38fd1498Szrj is too large. */
1787*38fd1498Szrj if (mem_ref_count > PREFETCH_MAX_MEM_REFS_PER_LOOP)
1788*38fd1498Szrj return false;
1789*38fd1498Szrj
1790*38fd1498Szrj /* Prefetching improves performance by overlapping cache missing
1791*38fd1498Szrj memory accesses with CPU operations. If the loop does not have
1792*38fd1498Szrj enough CPU operations to overlap with memory operations, prefetching
1793*38fd1498Szrj won't give a significant benefit. One approximate way of checking
1794*38fd1498Szrj this is to require the ratio of instructions to memory references to
1795*38fd1498Szrj be above a certain limit. This approximation works well in practice.
1796*38fd1498Szrj TODO: Implement a more precise computation by estimating the time
1797*38fd1498Szrj for each CPU or memory op in the loop. Time estimates for memory ops
1798*38fd1498Szrj should account for cache misses. */
1799*38fd1498Szrj insn_to_mem_ratio = ninsns / mem_ref_count;
1800*38fd1498Szrj
1801*38fd1498Szrj if (insn_to_mem_ratio < PREFETCH_MIN_INSN_TO_MEM_RATIO)
1802*38fd1498Szrj {
1803*38fd1498Szrj if (dump_file && (dump_flags & TDF_DETAILS))
1804*38fd1498Szrj fprintf (dump_file,
1805*38fd1498Szrj "Not prefetching -- instruction to memory reference ratio (%d) too small\n",
1806*38fd1498Szrj insn_to_mem_ratio);
1807*38fd1498Szrj return false;
1808*38fd1498Szrj }
1809*38fd1498Szrj
1810*38fd1498Szrj return true;
1811*38fd1498Szrj }
1812*38fd1498Szrj
1813*38fd1498Szrj /* Determine whether or not the instruction to prefetch ratio in the loop is
1814*38fd1498Szrj too small based on the profitablity consideration.
1815*38fd1498Szrj NINSNS: estimated number of instructions in the loop,
1816*38fd1498Szrj PREFETCH_COUNT: an estimate of the number of prefetches,
1817*38fd1498Szrj UNROLL_FACTOR: the factor to unroll the loop if prefetching. */
1818*38fd1498Szrj
1819*38fd1498Szrj static bool
insn_to_prefetch_ratio_too_small_p(unsigned ninsns,unsigned prefetch_count,unsigned unroll_factor)1820*38fd1498Szrj insn_to_prefetch_ratio_too_small_p (unsigned ninsns, unsigned prefetch_count,
1821*38fd1498Szrj unsigned unroll_factor)
1822*38fd1498Szrj {
1823*38fd1498Szrj int insn_to_prefetch_ratio;
1824*38fd1498Szrj
1825*38fd1498Szrj /* Prefetching most likely causes performance degradation when the instruction
1826*38fd1498Szrj to prefetch ratio is too small. Too many prefetch instructions in a loop
1827*38fd1498Szrj may reduce the I-cache performance.
1828*38fd1498Szrj (unroll_factor * ninsns) is used to estimate the number of instructions in
1829*38fd1498Szrj the unrolled loop. This implementation is a bit simplistic -- the number
1830*38fd1498Szrj of issued prefetch instructions is also affected by unrolling. So,
1831*38fd1498Szrj prefetch_mod and the unroll factor should be taken into account when
1832*38fd1498Szrj determining prefetch_count. Also, the number of insns of the unrolled
1833*38fd1498Szrj loop will usually be significantly smaller than the number of insns of the
1834*38fd1498Szrj original loop * unroll_factor (at least the induction variable increases
1835*38fd1498Szrj and the exit branches will get eliminated), so it might be better to use
1836*38fd1498Szrj tree_estimate_loop_size + estimated_unrolled_size. */
1837*38fd1498Szrj insn_to_prefetch_ratio = (unroll_factor * ninsns) / prefetch_count;
1838*38fd1498Szrj if (insn_to_prefetch_ratio < MIN_INSN_TO_PREFETCH_RATIO)
1839*38fd1498Szrj {
1840*38fd1498Szrj if (dump_file && (dump_flags & TDF_DETAILS))
1841*38fd1498Szrj fprintf (dump_file,
1842*38fd1498Szrj "Not prefetching -- instruction to prefetch ratio (%d) too small\n",
1843*38fd1498Szrj insn_to_prefetch_ratio);
1844*38fd1498Szrj return true;
1845*38fd1498Szrj }
1846*38fd1498Szrj
1847*38fd1498Szrj return false;
1848*38fd1498Szrj }
1849*38fd1498Szrj
1850*38fd1498Szrj
1851*38fd1498Szrj /* Issue prefetch instructions for array references in LOOP. Returns
1852*38fd1498Szrj true if the LOOP was unrolled. */
1853*38fd1498Szrj
1854*38fd1498Szrj static bool
loop_prefetch_arrays(struct loop * loop)1855*38fd1498Szrj loop_prefetch_arrays (struct loop *loop)
1856*38fd1498Szrj {
1857*38fd1498Szrj struct mem_ref_group *refs;
1858*38fd1498Szrj unsigned ahead, ninsns, time, unroll_factor;
1859*38fd1498Szrj HOST_WIDE_INT est_niter;
1860*38fd1498Szrj struct tree_niter_desc desc;
1861*38fd1498Szrj bool unrolled = false, no_other_refs;
1862*38fd1498Szrj unsigned prefetch_count;
1863*38fd1498Szrj unsigned mem_ref_count;
1864*38fd1498Szrj
1865*38fd1498Szrj if (optimize_loop_nest_for_size_p (loop))
1866*38fd1498Szrj {
1867*38fd1498Szrj if (dump_file && (dump_flags & TDF_DETAILS))
1868*38fd1498Szrj fprintf (dump_file, " ignored (cold area)\n");
1869*38fd1498Szrj return false;
1870*38fd1498Szrj }
1871*38fd1498Szrj
1872*38fd1498Szrj /* FIXME: the time should be weighted by the probabilities of the blocks in
1873*38fd1498Szrj the loop body. */
1874*38fd1498Szrj time = tree_num_loop_insns (loop, &eni_time_weights);
1875*38fd1498Szrj if (time == 0)
1876*38fd1498Szrj return false;
1877*38fd1498Szrj
1878*38fd1498Szrj ahead = (PREFETCH_LATENCY + time - 1) / time;
1879*38fd1498Szrj est_niter = estimated_stmt_executions_int (loop);
1880*38fd1498Szrj if (est_niter == -1)
1881*38fd1498Szrj est_niter = likely_max_stmt_executions_int (loop);
1882*38fd1498Szrj
1883*38fd1498Szrj /* Prefetching is not likely to be profitable if the trip count to ahead
1884*38fd1498Szrj ratio is too small. */
1885*38fd1498Szrj if (trip_count_to_ahead_ratio_too_small_p (ahead, est_niter))
1886*38fd1498Szrj return false;
1887*38fd1498Szrj
1888*38fd1498Szrj ninsns = tree_num_loop_insns (loop, &eni_size_weights);
1889*38fd1498Szrj
1890*38fd1498Szrj /* Step 1: gather the memory references. */
1891*38fd1498Szrj refs = gather_memory_references (loop, &no_other_refs, &mem_ref_count);
1892*38fd1498Szrj
1893*38fd1498Szrj /* Give up prefetching if the number of memory references in the
1894*38fd1498Szrj loop is not reasonable based on profitablity and compilation time
1895*38fd1498Szrj considerations. */
1896*38fd1498Szrj if (!mem_ref_count_reasonable_p (ninsns, mem_ref_count))
1897*38fd1498Szrj goto fail;
1898*38fd1498Szrj
1899*38fd1498Szrj /* Step 2: estimate the reuse effects. */
1900*38fd1498Szrj prune_by_reuse (refs);
1901*38fd1498Szrj
1902*38fd1498Szrj if (nothing_to_prefetch_p (refs))
1903*38fd1498Szrj goto fail;
1904*38fd1498Szrj
1905*38fd1498Szrj if (!determine_loop_nest_reuse (loop, refs, no_other_refs))
1906*38fd1498Szrj goto fail;
1907*38fd1498Szrj
1908*38fd1498Szrj /* Step 3: determine unroll factor. */
1909*38fd1498Szrj unroll_factor = determine_unroll_factor (loop, refs, ninsns, &desc,
1910*38fd1498Szrj est_niter);
1911*38fd1498Szrj
1912*38fd1498Szrj /* Estimate prefetch count for the unrolled loop. */
1913*38fd1498Szrj prefetch_count = estimate_prefetch_count (refs, unroll_factor);
1914*38fd1498Szrj if (prefetch_count == 0)
1915*38fd1498Szrj goto fail;
1916*38fd1498Szrj
1917*38fd1498Szrj if (dump_file && (dump_flags & TDF_DETAILS))
1918*38fd1498Szrj fprintf (dump_file, "Ahead %d, unroll factor %d, trip count "
1919*38fd1498Szrj HOST_WIDE_INT_PRINT_DEC "\n"
1920*38fd1498Szrj "insn count %d, mem ref count %d, prefetch count %d\n",
1921*38fd1498Szrj ahead, unroll_factor, est_niter,
1922*38fd1498Szrj ninsns, mem_ref_count, prefetch_count);
1923*38fd1498Szrj
1924*38fd1498Szrj /* Prefetching is not likely to be profitable if the instruction to prefetch
1925*38fd1498Szrj ratio is too small. */
1926*38fd1498Szrj if (insn_to_prefetch_ratio_too_small_p (ninsns, prefetch_count,
1927*38fd1498Szrj unroll_factor))
1928*38fd1498Szrj goto fail;
1929*38fd1498Szrj
1930*38fd1498Szrj mark_nontemporal_stores (loop, refs);
1931*38fd1498Szrj
1932*38fd1498Szrj /* Step 4: what to prefetch? */
1933*38fd1498Szrj if (!schedule_prefetches (refs, unroll_factor, ahead))
1934*38fd1498Szrj goto fail;
1935*38fd1498Szrj
1936*38fd1498Szrj /* Step 5: unroll the loop. TODO -- peeling of first and last few
1937*38fd1498Szrj iterations so that we do not issue superfluous prefetches. */
1938*38fd1498Szrj if (unroll_factor != 1)
1939*38fd1498Szrj {
1940*38fd1498Szrj tree_unroll_loop (loop, unroll_factor,
1941*38fd1498Szrj single_dom_exit (loop), &desc);
1942*38fd1498Szrj unrolled = true;
1943*38fd1498Szrj }
1944*38fd1498Szrj
1945*38fd1498Szrj /* Step 6: issue the prefetches. */
1946*38fd1498Szrj issue_prefetches (refs, unroll_factor, ahead);
1947*38fd1498Szrj
1948*38fd1498Szrj fail:
1949*38fd1498Szrj release_mem_refs (refs);
1950*38fd1498Szrj return unrolled;
1951*38fd1498Szrj }
1952*38fd1498Szrj
1953*38fd1498Szrj /* Issue prefetch instructions for array references in loops. */
1954*38fd1498Szrj
1955*38fd1498Szrj unsigned int
tree_ssa_prefetch_arrays(void)1956*38fd1498Szrj tree_ssa_prefetch_arrays (void)
1957*38fd1498Szrj {
1958*38fd1498Szrj struct loop *loop;
1959*38fd1498Szrj bool unrolled = false;
1960*38fd1498Szrj int todo_flags = 0;
1961*38fd1498Szrj
1962*38fd1498Szrj if (!targetm.have_prefetch ()
1963*38fd1498Szrj /* It is possible to ask compiler for say -mtune=i486 -march=pentium4.
1964*38fd1498Szrj -mtune=i486 causes us having PREFETCH_BLOCK 0, since this is part
1965*38fd1498Szrj of processor costs and i486 does not have prefetch, but
1966*38fd1498Szrj -march=pentium4 causes targetm.have_prefetch to be true. Ugh. */
1967*38fd1498Szrj || PREFETCH_BLOCK == 0)
1968*38fd1498Szrj return 0;
1969*38fd1498Szrj
1970*38fd1498Szrj if (dump_file && (dump_flags & TDF_DETAILS))
1971*38fd1498Szrj {
1972*38fd1498Szrj fprintf (dump_file, "Prefetching parameters:\n");
1973*38fd1498Szrj fprintf (dump_file, " simultaneous prefetches: %d\n",
1974*38fd1498Szrj SIMULTANEOUS_PREFETCHES);
1975*38fd1498Szrj fprintf (dump_file, " prefetch latency: %d\n", PREFETCH_LATENCY);
1976*38fd1498Szrj fprintf (dump_file, " prefetch block size: %d\n", PREFETCH_BLOCK);
1977*38fd1498Szrj fprintf (dump_file, " L1 cache size: %d lines, %d kB\n",
1978*38fd1498Szrj L1_CACHE_SIZE_BYTES / L1_CACHE_LINE_SIZE, L1_CACHE_SIZE);
1979*38fd1498Szrj fprintf (dump_file, " L1 cache line size: %d\n", L1_CACHE_LINE_SIZE);
1980*38fd1498Szrj fprintf (dump_file, " L2 cache size: %d kB\n", L2_CACHE_SIZE);
1981*38fd1498Szrj fprintf (dump_file, " min insn-to-prefetch ratio: %d \n",
1982*38fd1498Szrj MIN_INSN_TO_PREFETCH_RATIO);
1983*38fd1498Szrj fprintf (dump_file, " min insn-to-mem ratio: %d \n",
1984*38fd1498Szrj PREFETCH_MIN_INSN_TO_MEM_RATIO);
1985*38fd1498Szrj fprintf (dump_file, "\n");
1986*38fd1498Szrj }
1987*38fd1498Szrj
1988*38fd1498Szrj initialize_original_copy_tables ();
1989*38fd1498Szrj
1990*38fd1498Szrj if (!builtin_decl_explicit_p (BUILT_IN_PREFETCH))
1991*38fd1498Szrj {
1992*38fd1498Szrj tree type = build_function_type_list (void_type_node,
1993*38fd1498Szrj const_ptr_type_node, NULL_TREE);
1994*38fd1498Szrj tree decl = add_builtin_function ("__builtin_prefetch", type,
1995*38fd1498Szrj BUILT_IN_PREFETCH, BUILT_IN_NORMAL,
1996*38fd1498Szrj NULL, NULL_TREE);
1997*38fd1498Szrj DECL_IS_NOVOPS (decl) = true;
1998*38fd1498Szrj set_builtin_decl (BUILT_IN_PREFETCH, decl, false);
1999*38fd1498Szrj }
2000*38fd1498Szrj
2001*38fd1498Szrj FOR_EACH_LOOP (loop, LI_FROM_INNERMOST)
2002*38fd1498Szrj {
2003*38fd1498Szrj if (dump_file && (dump_flags & TDF_DETAILS))
2004*38fd1498Szrj fprintf (dump_file, "Processing loop %d:\n", loop->num);
2005*38fd1498Szrj
2006*38fd1498Szrj unrolled |= loop_prefetch_arrays (loop);
2007*38fd1498Szrj
2008*38fd1498Szrj if (dump_file && (dump_flags & TDF_DETAILS))
2009*38fd1498Szrj fprintf (dump_file, "\n\n");
2010*38fd1498Szrj }
2011*38fd1498Szrj
2012*38fd1498Szrj if (unrolled)
2013*38fd1498Szrj {
2014*38fd1498Szrj scev_reset ();
2015*38fd1498Szrj todo_flags |= TODO_cleanup_cfg;
2016*38fd1498Szrj }
2017*38fd1498Szrj
2018*38fd1498Szrj free_original_copy_tables ();
2019*38fd1498Szrj return todo_flags;
2020*38fd1498Szrj }
2021*38fd1498Szrj
2022*38fd1498Szrj /* Prefetching. */
2023*38fd1498Szrj
2024*38fd1498Szrj namespace {
2025*38fd1498Szrj
2026*38fd1498Szrj const pass_data pass_data_loop_prefetch =
2027*38fd1498Szrj {
2028*38fd1498Szrj GIMPLE_PASS, /* type */
2029*38fd1498Szrj "aprefetch", /* name */
2030*38fd1498Szrj OPTGROUP_LOOP, /* optinfo_flags */
2031*38fd1498Szrj TV_TREE_PREFETCH, /* tv_id */
2032*38fd1498Szrj ( PROP_cfg | PROP_ssa ), /* properties_required */
2033*38fd1498Szrj 0, /* properties_provided */
2034*38fd1498Szrj 0, /* properties_destroyed */
2035*38fd1498Szrj 0, /* todo_flags_start */
2036*38fd1498Szrj 0, /* todo_flags_finish */
2037*38fd1498Szrj };
2038*38fd1498Szrj
2039*38fd1498Szrj class pass_loop_prefetch : public gimple_opt_pass
2040*38fd1498Szrj {
2041*38fd1498Szrj public:
pass_loop_prefetch(gcc::context * ctxt)2042*38fd1498Szrj pass_loop_prefetch (gcc::context *ctxt)
2043*38fd1498Szrj : gimple_opt_pass (pass_data_loop_prefetch, ctxt)
2044*38fd1498Szrj {}
2045*38fd1498Szrj
2046*38fd1498Szrj /* opt_pass methods: */
gate(function *)2047*38fd1498Szrj virtual bool gate (function *) { return flag_prefetch_loop_arrays > 0; }
2048*38fd1498Szrj virtual unsigned int execute (function *);
2049*38fd1498Szrj
2050*38fd1498Szrj }; // class pass_loop_prefetch
2051*38fd1498Szrj
2052*38fd1498Szrj unsigned int
execute(function * fun)2053*38fd1498Szrj pass_loop_prefetch::execute (function *fun)
2054*38fd1498Szrj {
2055*38fd1498Szrj if (number_of_loops (fun) <= 1)
2056*38fd1498Szrj return 0;
2057*38fd1498Szrj
2058*38fd1498Szrj if ((PREFETCH_BLOCK & (PREFETCH_BLOCK - 1)) != 0)
2059*38fd1498Szrj {
2060*38fd1498Szrj static bool warned = false;
2061*38fd1498Szrj
2062*38fd1498Szrj if (!warned)
2063*38fd1498Szrj {
2064*38fd1498Szrj warning (OPT_Wdisabled_optimization,
2065*38fd1498Szrj "%<l1-cache-size%> parameter is not a power of two %d",
2066*38fd1498Szrj PREFETCH_BLOCK);
2067*38fd1498Szrj warned = true;
2068*38fd1498Szrj }
2069*38fd1498Szrj return 0;
2070*38fd1498Szrj }
2071*38fd1498Szrj
2072*38fd1498Szrj return tree_ssa_prefetch_arrays ();
2073*38fd1498Szrj }
2074*38fd1498Szrj
2075*38fd1498Szrj } // anon namespace
2076*38fd1498Szrj
2077*38fd1498Szrj gimple_opt_pass *
make_pass_loop_prefetch(gcc::context * ctxt)2078*38fd1498Szrj make_pass_loop_prefetch (gcc::context *ctxt)
2079*38fd1498Szrj {
2080*38fd1498Szrj return new pass_loop_prefetch (ctxt);
2081*38fd1498Szrj }
2082*38fd1498Szrj
2083*38fd1498Szrj
2084