xref: /netbsd-src/external/gpl3/gcc.old/dist/gcc/tree-ssa-loop-prefetch.c (revision 8feb0f0b7eaff0608f8350bbfa3098827b4bb91b)
11debfc3dSmrg /* Array prefetching.
2*8feb0f0bSmrg    Copyright (C) 2005-2020 Free Software Foundation, Inc.
31debfc3dSmrg 
41debfc3dSmrg This file is part of GCC.
51debfc3dSmrg 
61debfc3dSmrg GCC is free software; you can redistribute it and/or modify it
71debfc3dSmrg under the terms of the GNU General Public License as published by the
81debfc3dSmrg Free Software Foundation; either version 3, or (at your option) any
91debfc3dSmrg later version.
101debfc3dSmrg 
111debfc3dSmrg GCC is distributed in the hope that it will be useful, but WITHOUT
121debfc3dSmrg ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
131debfc3dSmrg FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
141debfc3dSmrg for more details.
151debfc3dSmrg 
161debfc3dSmrg You should have received a copy of the GNU General Public License
171debfc3dSmrg along with GCC; see the file COPYING3.  If not see
181debfc3dSmrg <http://www.gnu.org/licenses/>.  */
191debfc3dSmrg 
201debfc3dSmrg #include "config.h"
211debfc3dSmrg #include "system.h"
221debfc3dSmrg #include "coretypes.h"
231debfc3dSmrg #include "backend.h"
241debfc3dSmrg #include "target.h"
251debfc3dSmrg #include "rtl.h"
261debfc3dSmrg #include "tree.h"
271debfc3dSmrg #include "gimple.h"
281debfc3dSmrg #include "predict.h"
291debfc3dSmrg #include "tree-pass.h"
301debfc3dSmrg #include "gimple-ssa.h"
311debfc3dSmrg #include "optabs-query.h"
321debfc3dSmrg #include "tree-pretty-print.h"
331debfc3dSmrg #include "fold-const.h"
341debfc3dSmrg #include "stor-layout.h"
351debfc3dSmrg #include "gimplify.h"
361debfc3dSmrg #include "gimple-iterator.h"
371debfc3dSmrg #include "gimplify-me.h"
381debfc3dSmrg #include "tree-ssa-loop-ivopts.h"
391debfc3dSmrg #include "tree-ssa-loop-manip.h"
401debfc3dSmrg #include "tree-ssa-loop-niter.h"
411debfc3dSmrg #include "tree-ssa-loop.h"
421debfc3dSmrg #include "ssa.h"
431debfc3dSmrg #include "tree-into-ssa.h"
441debfc3dSmrg #include "cfgloop.h"
451debfc3dSmrg #include "tree-scalar-evolution.h"
461debfc3dSmrg #include "langhooks.h"
471debfc3dSmrg #include "tree-inline.h"
481debfc3dSmrg #include "tree-data-ref.h"
491debfc3dSmrg #include "diagnostic-core.h"
50a2dc1f3fSmrg #include "dbgcnt.h"
511debfc3dSmrg 
521debfc3dSmrg /* This pass inserts prefetch instructions to optimize cache usage during
531debfc3dSmrg    accesses to arrays in loops.  It processes loops sequentially and:
541debfc3dSmrg 
551debfc3dSmrg    1) Gathers all memory references in the single loop.
561debfc3dSmrg    2) For each of the references it decides when it is profitable to prefetch
571debfc3dSmrg       it.  To do it, we evaluate the reuse among the accesses, and determine
581debfc3dSmrg       two values: PREFETCH_BEFORE (meaning that it only makes sense to do
591debfc3dSmrg       prefetching in the first PREFETCH_BEFORE iterations of the loop) and
601debfc3dSmrg       PREFETCH_MOD (meaning that it only makes sense to prefetch in the
611debfc3dSmrg       iterations of the loop that are zero modulo PREFETCH_MOD).  For example
621debfc3dSmrg       (assuming cache line size is 64 bytes, char has size 1 byte and there
631debfc3dSmrg       is no hardware sequential prefetch):
641debfc3dSmrg 
651debfc3dSmrg       char *a;
661debfc3dSmrg       for (i = 0; i < max; i++)
671debfc3dSmrg 	{
681debfc3dSmrg 	  a[255] = ...;		(0)
691debfc3dSmrg 	  a[i] = ...;		(1)
701debfc3dSmrg 	  a[i + 64] = ...;	(2)
711debfc3dSmrg 	  a[16*i] = ...;	(3)
721debfc3dSmrg 	  a[187*i] = ...;	(4)
731debfc3dSmrg 	  a[187*i + 50] = ...;	(5)
741debfc3dSmrg 	}
751debfc3dSmrg 
761debfc3dSmrg        (0) obviously has PREFETCH_BEFORE 1
771debfc3dSmrg        (1) has PREFETCH_BEFORE 64, since (2) accesses the same memory
781debfc3dSmrg            location 64 iterations before it, and PREFETCH_MOD 64 (since
791debfc3dSmrg 	   it hits the same cache line otherwise).
801debfc3dSmrg        (2) has PREFETCH_MOD 64
811debfc3dSmrg        (3) has PREFETCH_MOD 4
821debfc3dSmrg        (4) has PREFETCH_MOD 1.  We do not set PREFETCH_BEFORE here, since
831debfc3dSmrg            the cache line accessed by (5) is the same with probability only
841debfc3dSmrg 	   7/32.
851debfc3dSmrg        (5) has PREFETCH_MOD 1 as well.
861debfc3dSmrg 
871debfc3dSmrg       Additionally, we use data dependence analysis to determine for each
881debfc3dSmrg       reference the distance till the first reuse; this information is used
891debfc3dSmrg       to determine the temporality of the issued prefetch instruction.
901debfc3dSmrg 
911debfc3dSmrg    3) We determine how much ahead we need to prefetch.  The number of
921debfc3dSmrg       iterations needed is time to fetch / time spent in one iteration of
931debfc3dSmrg       the loop.  The problem is that we do not know either of these values,
941debfc3dSmrg       so we just make a heuristic guess based on a magic (possibly)
951debfc3dSmrg       target-specific constant and size of the loop.
961debfc3dSmrg 
971debfc3dSmrg    4) Determine which of the references we prefetch.  We take into account
981debfc3dSmrg       that there is a maximum number of simultaneous prefetches (provided
991debfc3dSmrg       by the machine description).  We issue as many prefetches as possible
1001debfc3dSmrg       while still within this bound (starting with those with lowest
1011debfc3dSmrg       prefetch_mod, since they are responsible for most of the cache
1021debfc3dSmrg       misses).
1031debfc3dSmrg 
1041debfc3dSmrg    5) We unroll and peel loops so that we are able to satisfy PREFETCH_MOD
1051debfc3dSmrg       and PREFETCH_BEFORE requirements (within some bounds), and to avoid
1061debfc3dSmrg       prefetching nonaccessed memory.
1071debfc3dSmrg       TODO -- actually implement peeling.
1081debfc3dSmrg 
1091debfc3dSmrg    6) We actually emit the prefetch instructions.  ??? Perhaps emit the
1101debfc3dSmrg       prefetch instructions with guards in cases where 5) was not sufficient
1111debfc3dSmrg       to satisfy the constraints?
1121debfc3dSmrg 
1131debfc3dSmrg    A cost model is implemented to determine whether or not prefetching is
1141debfc3dSmrg    profitable for a given loop.  The cost model has three heuristics:
1151debfc3dSmrg 
1161debfc3dSmrg    1. Function trip_count_to_ahead_ratio_too_small_p implements a
1171debfc3dSmrg       heuristic that determines whether or not the loop has too few
1181debfc3dSmrg       iterations (compared to ahead).  Prefetching is not likely to be
1191debfc3dSmrg       beneficial if the trip count to ahead ratio is below a certain
1201debfc3dSmrg       minimum.
1211debfc3dSmrg 
1221debfc3dSmrg    2. Function mem_ref_count_reasonable_p implements a heuristic that
1231debfc3dSmrg       determines whether the given loop has enough CPU ops that can be
1241debfc3dSmrg       overlapped with cache missing memory ops.  If not, the loop
1251debfc3dSmrg       won't benefit from prefetching.  In the implementation,
1261debfc3dSmrg       prefetching is not considered beneficial if the ratio between
1271debfc3dSmrg       the instruction count and the mem ref count is below a certain
1281debfc3dSmrg       minimum.
1291debfc3dSmrg 
1301debfc3dSmrg    3. Function insn_to_prefetch_ratio_too_small_p implements a
1311debfc3dSmrg       heuristic that disables prefetching in a loop if the prefetching
1321debfc3dSmrg       cost is above a certain limit.  The relative prefetching cost is
1331debfc3dSmrg       estimated by taking the ratio between the prefetch count and the
1341debfc3dSmrg    total instruction count (this models the I-cache cost).
1351debfc3dSmrg 
1361debfc3dSmrg    The limits used in these heuristics are defined as parameters with
1371debfc3dSmrg    reasonable default values. Machine-specific default values will be
1381debfc3dSmrg    added later.
1391debfc3dSmrg 
1401debfc3dSmrg    Some other TODO:
1411debfc3dSmrg       -- write and use more general reuse analysis (that could be also used
1421debfc3dSmrg 	 in other cache aimed loop optimizations)
1431debfc3dSmrg       -- make it behave sanely together with the prefetches given by user
1441debfc3dSmrg 	 (now we just ignore them; at the very least we should avoid
1451debfc3dSmrg 	 optimizing loops in which the user put his own prefetches)
1461debfc3dSmrg       -- we assume cache line size alignment of arrays; this could be
1471debfc3dSmrg 	 improved.  */
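
/* For illustration only (a sketch with made-up numbers, not taken from any
   particular target): if a prefetch takes roughly as long as 200 insns and
   one iteration of the loop executes about 25 insns, step 3) above picks
   AHEAD = 200 / 25 = 8.  A reference with PREFETCH_MOD 4 in a loop that is
   unrolled 4 times then gets ceil (4 / 4) = 1 prefetch per unrolled body,
   and each emitted prefetch targets the address 8 original (i.e. 2
   unrolled) iterations ahead.  */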
1481debfc3dSmrg 
1491debfc3dSmrg /* Magic constants follow.  These should be replaced by machine specific
1501debfc3dSmrg    numbers.  */
1511debfc3dSmrg 
1521debfc3dSmrg /* True if write can be prefetched by a read prefetch.  */
1531debfc3dSmrg 
1541debfc3dSmrg #ifndef WRITE_CAN_USE_READ_PREFETCH
1551debfc3dSmrg #define WRITE_CAN_USE_READ_PREFETCH 1
1561debfc3dSmrg #endif
1571debfc3dSmrg 
1581debfc3dSmrg /* True if read can be prefetched by a write prefetch. */
1591debfc3dSmrg 
1601debfc3dSmrg #ifndef READ_CAN_USE_WRITE_PREFETCH
1611debfc3dSmrg #define READ_CAN_USE_WRITE_PREFETCH 0
1621debfc3dSmrg #endif
1631debfc3dSmrg 
1641debfc3dSmrg /* The size of the block loaded by a single prefetch.  Usually, this is
1651debfc3dSmrg    the same as cache line size (at the moment, we only consider one level
1661debfc3dSmrg    of cache hierarchy).  */
1671debfc3dSmrg 
1681debfc3dSmrg #ifndef PREFETCH_BLOCK
169*8feb0f0bSmrg #define PREFETCH_BLOCK param_l1_cache_line_size
1701debfc3dSmrg #endif
1711debfc3dSmrg 
1721debfc3dSmrg /* Do we have forward hardware sequential prefetching?  */
1731debfc3dSmrg 
1741debfc3dSmrg #ifndef HAVE_FORWARD_PREFETCH
1751debfc3dSmrg #define HAVE_FORWARD_PREFETCH 0
1761debfc3dSmrg #endif
1771debfc3dSmrg 
1781debfc3dSmrg /* Do we have backward hardware sequential prefetching?  */
1791debfc3dSmrg 
1801debfc3dSmrg #ifndef HAVE_BACKWARD_PREFETCH
1811debfc3dSmrg #define HAVE_BACKWARD_PREFETCH 0
1821debfc3dSmrg #endif
1831debfc3dSmrg 
1841debfc3dSmrg /* In some cases we are only able to determine that there is a certain
1851debfc3dSmrg    probability that the two accesses hit the same cache line.  In this
1861debfc3dSmrg    case, we issue the prefetches for both of them if this probability
1871debfc3dSmrg    is less than (1000 - ACCEPTABLE_MISS_RATE) per thousand.  */
1881debfc3dSmrg 
1891debfc3dSmrg #ifndef ACCEPTABLE_MISS_RATE
1901debfc3dSmrg #define ACCEPTABLE_MISS_RATE 50
1911debfc3dSmrg #endif
1921debfc3dSmrg 
193*8feb0f0bSmrg #define L1_CACHE_SIZE_BYTES ((unsigned) (param_l1_cache_size * 1024))
194*8feb0f0bSmrg #define L2_CACHE_SIZE_BYTES ((unsigned) (param_l2_cache_size * 1024))
1951debfc3dSmrg 
1961debfc3dSmrg /* We consider a memory access nontemporal if it is not reused sooner than
1971debfc3dSmrg    after L2_CACHE_SIZE_BYTES of memory are accessed.  However, we ignore
1981debfc3dSmrg    accesses closer than L1_CACHE_SIZE_BYTES / NONTEMPORAL_FRACTION,
1991debfc3dSmrg    so that we use nontemporal prefetches e.g. if a single memory location
2001debfc3dSmrg    is accessed several times in a single iteration of the loop.  */
2011debfc3dSmrg #define NONTEMPORAL_FRACTION 16
2021debfc3dSmrg 
2031debfc3dSmrg /* In case we have to emit a memory fence instruction after the loop that
2041debfc3dSmrg    uses nontemporal stores, this defines the builtin to use.  */
2051debfc3dSmrg 
2061debfc3dSmrg #ifndef FENCE_FOLLOWING_MOVNT
2071debfc3dSmrg #define FENCE_FOLLOWING_MOVNT NULL_TREE
2081debfc3dSmrg #endif
2091debfc3dSmrg 
2101debfc3dSmrg /* It is not profitable to prefetch when the trip count is not at
2111debfc3dSmrg    least TRIP_COUNT_TO_AHEAD_RATIO times the prefetch ahead distance.
2121debfc3dSmrg    For example, in a loop with a prefetch ahead distance of 10,
2131debfc3dSmrg    supposing that TRIP_COUNT_TO_AHEAD_RATIO is equal to 4, it is
2141debfc3dSmrg    profitable to prefetch when the trip count is greater or equal to
2151debfc3dSmrg    40.  In that case, 30 out of the 40 iterations will benefit from
2161debfc3dSmrg    prefetching.  */
2171debfc3dSmrg 
2181debfc3dSmrg #ifndef TRIP_COUNT_TO_AHEAD_RATIO
2191debfc3dSmrg #define TRIP_COUNT_TO_AHEAD_RATIO 4
2201debfc3dSmrg #endif
2211debfc3dSmrg 
2221debfc3dSmrg /* The group of references between which reuse may occur.  */
2231debfc3dSmrg 
2241debfc3dSmrg struct mem_ref_group
2251debfc3dSmrg {
2261debfc3dSmrg   tree base;			/* Base of the reference.  */
2271debfc3dSmrg   tree step;			/* Step of the reference.  */
2281debfc3dSmrg   struct mem_ref *refs;		/* References in the group.  */
2291debfc3dSmrg   struct mem_ref_group *next;	/* Next group of references.  */
230a2dc1f3fSmrg   unsigned int uid;		/* Group UID, used only for debugging.  */
2311debfc3dSmrg };
2321debfc3dSmrg 
2331debfc3dSmrg /* Assigned to PREFETCH_BEFORE when all iterations are to be prefetched.  */
2341debfc3dSmrg 
2351debfc3dSmrg #define PREFETCH_ALL		HOST_WIDE_INT_M1U
2361debfc3dSmrg 
2371debfc3dSmrg /* Do not generate a prefetch if the unroll factor is significantly less
2381debfc3dSmrg    than what is required by the prefetch.  This is to avoid redundant
2391debfc3dSmrg    prefetches.  For example, when prefetch_mod is 16 and unroll_factor is
2401debfc3dSmrg    2, prefetching requires unrolling the loop 16 times, but
2411debfc3dSmrg    the loop is actually unrolled twice.  In this case (ratio = 8),
2421debfc3dSmrg    prefetching is not likely to be beneficial.  */
2431debfc3dSmrg 
2441debfc3dSmrg #ifndef PREFETCH_MOD_TO_UNROLL_FACTOR_RATIO
2451debfc3dSmrg #define PREFETCH_MOD_TO_UNROLL_FACTOR_RATIO 4
2461debfc3dSmrg #endif
2471debfc3dSmrg 
2481debfc3dSmrg /* Some of the prefetch computations have quadratic complexity.  We want to
2491debfc3dSmrg    avoid huge compile times and, therefore, want to limit the amount of
2501debfc3dSmrg    memory references per loop where we consider prefetching.  */
2511debfc3dSmrg 
2521debfc3dSmrg #ifndef PREFETCH_MAX_MEM_REFS_PER_LOOP
2531debfc3dSmrg #define PREFETCH_MAX_MEM_REFS_PER_LOOP 200
2541debfc3dSmrg #endif
2551debfc3dSmrg 
2561debfc3dSmrg /* The memory reference.  */
2571debfc3dSmrg 
2581debfc3dSmrg struct mem_ref
2591debfc3dSmrg {
2601debfc3dSmrg   gimple *stmt;			/* Statement in which the reference appears.  */
2611debfc3dSmrg   tree mem;			/* The reference.  */
2621debfc3dSmrg   HOST_WIDE_INT delta;		/* Constant offset of the reference.  */
2631debfc3dSmrg   struct mem_ref_group *group;	/* The group of references it belongs to.  */
2641debfc3dSmrg   unsigned HOST_WIDE_INT prefetch_mod;
2651debfc3dSmrg 				/* Prefetch only each PREFETCH_MOD-th
2661debfc3dSmrg 				   iteration.  */
2671debfc3dSmrg   unsigned HOST_WIDE_INT prefetch_before;
2681debfc3dSmrg 				/* Prefetch only first PREFETCH_BEFORE
2691debfc3dSmrg 				   iterations.  */
2701debfc3dSmrg   unsigned reuse_distance;	/* The amount of data accessed before the first
2711debfc3dSmrg 				   reuse of this value.  */
2721debfc3dSmrg   struct mem_ref *next;		/* The next reference in the group.  */
273a2dc1f3fSmrg   unsigned int uid;		/* Ref UID, used only for debugging.  */
2741debfc3dSmrg   unsigned write_p : 1;		/* Is it a write?  */
2751debfc3dSmrg   unsigned independent_p : 1;	/* True if the reference is independent of
2761debfc3dSmrg 				   all other references inside the loop.  */
2771debfc3dSmrg   unsigned issue_prefetch_p : 1;	/* Should we really issue the prefetch?  */
2781debfc3dSmrg   unsigned storent_p : 1;	/* True if we changed the store to a
2791debfc3dSmrg 				   nontemporal one.  */
2801debfc3dSmrg };
2811debfc3dSmrg 
2821debfc3dSmrg /* Dumps information about a memory reference.  */
2831debfc3dSmrg static void
2841debfc3dSmrg dump_mem_details (FILE *file, tree base, tree step,
2851debfc3dSmrg 	    HOST_WIDE_INT delta, bool write_p)
2861debfc3dSmrg {
2871debfc3dSmrg   fprintf (file, "(base ");
2881debfc3dSmrg   print_generic_expr (file, base, TDF_SLIM);
2891debfc3dSmrg   fprintf (file, ", step ");
2901debfc3dSmrg   if (cst_and_fits_in_hwi (step))
2911debfc3dSmrg     fprintf (file, HOST_WIDE_INT_PRINT_DEC, int_cst_value (step));
2921debfc3dSmrg   else
293a2dc1f3fSmrg     print_generic_expr (file, step, TDF_SLIM);
2941debfc3dSmrg   fprintf (file, ")\n");
295a2dc1f3fSmrg   fprintf (file, "  delta " HOST_WIDE_INT_PRINT_DEC "\n", delta);
296a2dc1f3fSmrg   fprintf (file, "  %s\n\n", write_p ? "write" : "read");
2971debfc3dSmrg }
2981debfc3dSmrg 
2991debfc3dSmrg /* Dumps information about reference REF to FILE.  */
3001debfc3dSmrg 
3011debfc3dSmrg static void
3021debfc3dSmrg dump_mem_ref (FILE *file, struct mem_ref *ref)
3031debfc3dSmrg {
304a2dc1f3fSmrg   fprintf (file, "reference %u:%u (", ref->group->uid, ref->uid);
305a2dc1f3fSmrg   print_generic_expr (file, ref->mem, TDF_SLIM);
306a2dc1f3fSmrg   fprintf (file, ")\n");
3071debfc3dSmrg }
3081debfc3dSmrg 
3091debfc3dSmrg /* Finds a group with BASE and STEP in GROUPS, or creates one if it does not
3101debfc3dSmrg    exist.  */
3111debfc3dSmrg 
3121debfc3dSmrg static struct mem_ref_group *
3131debfc3dSmrg find_or_create_group (struct mem_ref_group **groups, tree base, tree step)
3141debfc3dSmrg {
315a2dc1f3fSmrg   /* Global count for setting struct mem_ref_group->uid.  */
316a2dc1f3fSmrg   static unsigned int last_mem_ref_group_uid = 0;
317a2dc1f3fSmrg 
3181debfc3dSmrg   struct mem_ref_group *group;
3191debfc3dSmrg 
3201debfc3dSmrg   for (; *groups; groups = &(*groups)->next)
3211debfc3dSmrg     {
3221debfc3dSmrg       if (operand_equal_p ((*groups)->step, step, 0)
3231debfc3dSmrg 	  && operand_equal_p ((*groups)->base, base, 0))
3241debfc3dSmrg 	return *groups;
3251debfc3dSmrg 
3261debfc3dSmrg       /* If step is an integer constant, keep the list of groups sorted
3271debfc3dSmrg          by decreasing step.  */
3281debfc3dSmrg       if (cst_and_fits_in_hwi ((*groups)->step) && cst_and_fits_in_hwi (step)
3291debfc3dSmrg 	  && int_cst_value ((*groups)->step) < int_cst_value (step))
3301debfc3dSmrg 	break;
3311debfc3dSmrg     }
3321debfc3dSmrg 
3331debfc3dSmrg   group = XNEW (struct mem_ref_group);
3341debfc3dSmrg   group->base = base;
3351debfc3dSmrg   group->step = step;
3361debfc3dSmrg   group->refs = NULL;
337a2dc1f3fSmrg   group->uid = ++last_mem_ref_group_uid;
3381debfc3dSmrg   group->next = *groups;
3391debfc3dSmrg   *groups = group;
3401debfc3dSmrg 
3411debfc3dSmrg   return group;
3421debfc3dSmrg }
3431debfc3dSmrg 
3441debfc3dSmrg /* Records a memory reference MEM in GROUP with offset DELTA and write status
3451debfc3dSmrg    WRITE_P.  The reference occurs in statement STMT.  */
3461debfc3dSmrg 
3471debfc3dSmrg static void
3481debfc3dSmrg record_ref (struct mem_ref_group *group, gimple *stmt, tree mem,
3491debfc3dSmrg 	    HOST_WIDE_INT delta, bool write_p)
3501debfc3dSmrg {
351a2dc1f3fSmrg   unsigned int last_mem_ref_uid = 0;
3521debfc3dSmrg   struct mem_ref **aref;
3531debfc3dSmrg 
3541debfc3dSmrg   /* Do not record the same address twice.  */
3551debfc3dSmrg   for (aref = &group->refs; *aref; aref = &(*aref)->next)
3561debfc3dSmrg     {
357a2dc1f3fSmrg       last_mem_ref_uid = (*aref)->uid;
358a2dc1f3fSmrg 
3591debfc3dSmrg       /* It need not be possible for a write reference to reuse the read
3601debfc3dSmrg 	 prefetch, or vice versa.  */
3611debfc3dSmrg       if (!WRITE_CAN_USE_READ_PREFETCH
3621debfc3dSmrg 	  && write_p
3631debfc3dSmrg 	  && !(*aref)->write_p)
3641debfc3dSmrg 	continue;
3651debfc3dSmrg       if (!READ_CAN_USE_WRITE_PREFETCH
3661debfc3dSmrg 	  && !write_p
3671debfc3dSmrg 	  && (*aref)->write_p)
3681debfc3dSmrg 	continue;
3691debfc3dSmrg 
3701debfc3dSmrg       if ((*aref)->delta == delta)
3711debfc3dSmrg 	return;
3721debfc3dSmrg     }
3731debfc3dSmrg 
3741debfc3dSmrg   (*aref) = XNEW (struct mem_ref);
3751debfc3dSmrg   (*aref)->stmt = stmt;
3761debfc3dSmrg   (*aref)->mem = mem;
3771debfc3dSmrg   (*aref)->delta = delta;
3781debfc3dSmrg   (*aref)->write_p = write_p;
3791debfc3dSmrg   (*aref)->prefetch_before = PREFETCH_ALL;
3801debfc3dSmrg   (*aref)->prefetch_mod = 1;
3811debfc3dSmrg   (*aref)->reuse_distance = 0;
3821debfc3dSmrg   (*aref)->issue_prefetch_p = false;
3831debfc3dSmrg   (*aref)->group = group;
3841debfc3dSmrg   (*aref)->next = NULL;
3851debfc3dSmrg   (*aref)->independent_p = false;
3861debfc3dSmrg   (*aref)->storent_p = false;
387a2dc1f3fSmrg   (*aref)->uid = last_mem_ref_uid + 1;
3881debfc3dSmrg 
3891debfc3dSmrg   if (dump_file && (dump_flags & TDF_DETAILS))
390a2dc1f3fSmrg     {
3911debfc3dSmrg       dump_mem_ref (dump_file, *aref);
392a2dc1f3fSmrg 
393a2dc1f3fSmrg       fprintf (dump_file, "  group %u ", group->uid);
394a2dc1f3fSmrg       dump_mem_details (dump_file, group->base, group->step, delta,
395a2dc1f3fSmrg 			write_p);
396a2dc1f3fSmrg     }
3971debfc3dSmrg }
3981debfc3dSmrg 
3991debfc3dSmrg /* Release memory references in GROUPS.  */
4001debfc3dSmrg 
4011debfc3dSmrg static void
4021debfc3dSmrg release_mem_refs (struct mem_ref_group *groups)
4031debfc3dSmrg {
4041debfc3dSmrg   struct mem_ref_group *next_g;
4051debfc3dSmrg   struct mem_ref *ref, *next_r;
4061debfc3dSmrg 
4071debfc3dSmrg   for (; groups; groups = next_g)
4081debfc3dSmrg     {
4091debfc3dSmrg       next_g = groups->next;
4101debfc3dSmrg       for (ref = groups->refs; ref; ref = next_r)
4111debfc3dSmrg 	{
4121debfc3dSmrg 	  next_r = ref->next;
4131debfc3dSmrg 	  free (ref);
4141debfc3dSmrg 	}
4151debfc3dSmrg       free (groups);
4161debfc3dSmrg     }
4171debfc3dSmrg }
4181debfc3dSmrg 
4191debfc3dSmrg /* A structure used to pass arguments to idx_analyze_ref.  */
4201debfc3dSmrg 
4211debfc3dSmrg struct ar_data
4221debfc3dSmrg {
423*8feb0f0bSmrg   class loop *loop;			/* Loop of the reference.  */
4241debfc3dSmrg   gimple *stmt;				/* Statement of the reference.  */
4251debfc3dSmrg   tree *step;				/* Step of the memory reference.  */
4261debfc3dSmrg   HOST_WIDE_INT *delta;			/* Offset of the memory reference.  */
4271debfc3dSmrg };
4281debfc3dSmrg 
4291debfc3dSmrg /* Analyzes a single INDEX of a memory reference to obtain information
4301debfc3dSmrg    described at analyze_ref.  Callback for for_each_index.  */
4311debfc3dSmrg 
4321debfc3dSmrg static bool
4331debfc3dSmrg idx_analyze_ref (tree base, tree *index, void *data)
4341debfc3dSmrg {
4351debfc3dSmrg   struct ar_data *ar_data = (struct ar_data *) data;
4361debfc3dSmrg   tree ibase, step, stepsize;
4371debfc3dSmrg   HOST_WIDE_INT idelta = 0, imult = 1;
4381debfc3dSmrg   affine_iv iv;
4391debfc3dSmrg 
4401debfc3dSmrg   if (!simple_iv (ar_data->loop, loop_containing_stmt (ar_data->stmt),
4411debfc3dSmrg 		  *index, &iv, true))
4421debfc3dSmrg     return false;
4431debfc3dSmrg   ibase = iv.base;
4441debfc3dSmrg   step = iv.step;
4451debfc3dSmrg 
4461debfc3dSmrg   if (TREE_CODE (ibase) == POINTER_PLUS_EXPR
4471debfc3dSmrg       && cst_and_fits_in_hwi (TREE_OPERAND (ibase, 1)))
4481debfc3dSmrg     {
4491debfc3dSmrg       idelta = int_cst_value (TREE_OPERAND (ibase, 1));
4501debfc3dSmrg       ibase = TREE_OPERAND (ibase, 0);
4511debfc3dSmrg     }
4521debfc3dSmrg   if (cst_and_fits_in_hwi (ibase))
4531debfc3dSmrg     {
4541debfc3dSmrg       idelta += int_cst_value (ibase);
4551debfc3dSmrg       ibase = build_int_cst (TREE_TYPE (ibase), 0);
4561debfc3dSmrg     }
4571debfc3dSmrg 
4581debfc3dSmrg   if (TREE_CODE (base) == ARRAY_REF)
4591debfc3dSmrg     {
4601debfc3dSmrg       stepsize = array_ref_element_size (base);
4611debfc3dSmrg       if (!cst_and_fits_in_hwi (stepsize))
4621debfc3dSmrg 	return false;
4631debfc3dSmrg       imult = int_cst_value (stepsize);
4641debfc3dSmrg       step = fold_build2 (MULT_EXPR, sizetype,
4651debfc3dSmrg 			  fold_convert (sizetype, step),
4661debfc3dSmrg 			  fold_convert (sizetype, stepsize));
4671debfc3dSmrg       idelta *= imult;
4681debfc3dSmrg     }
4691debfc3dSmrg 
4701debfc3dSmrg   if (*ar_data->step == NULL_TREE)
4711debfc3dSmrg     *ar_data->step = step;
4721debfc3dSmrg   else
4731debfc3dSmrg     *ar_data->step = fold_build2 (PLUS_EXPR, sizetype,
4741debfc3dSmrg 				  fold_convert (sizetype, *ar_data->step),
4751debfc3dSmrg 				  fold_convert (sizetype, step));
4761debfc3dSmrg   *ar_data->delta += idelta;
4771debfc3dSmrg   *index = ibase;
4781debfc3dSmrg 
4791debfc3dSmrg   return true;
4801debfc3dSmrg }
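
/* Example (for illustration only): for an access a[i + 3] to an array of
   4-byte elements, where i is a loop IV with step 1, simple_iv gives the
   index the affine form {base 3, step 1}.  The constant base contributes
   idelta = 3, and the ARRAY_REF scaling by the element size yields a step
   contribution of 1 * 4 = 4 and a delta contribution of 3 * 4 = 12.  */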
4811debfc3dSmrg 
4821debfc3dSmrg /* Tries to express REF_P in the form &BASE + STEP * iter + DELTA, where DELTA and
4831debfc3dSmrg    STEP are integer constants and iter is number of iterations of LOOP.  The
4841debfc3dSmrg    reference occurs in statement STMT.  Strips nonaddressable component
4851debfc3dSmrg    references from REF_P.  */
4861debfc3dSmrg 
4871debfc3dSmrg static bool
488*8feb0f0bSmrg analyze_ref (class loop *loop, tree *ref_p, tree *base,
4891debfc3dSmrg 	     tree *step, HOST_WIDE_INT *delta,
4901debfc3dSmrg 	     gimple *stmt)
4911debfc3dSmrg {
4921debfc3dSmrg   struct ar_data ar_data;
4931debfc3dSmrg   tree off;
4941debfc3dSmrg   HOST_WIDE_INT bit_offset;
4951debfc3dSmrg   tree ref = *ref_p;
4961debfc3dSmrg 
4971debfc3dSmrg   *step = NULL_TREE;
4981debfc3dSmrg   *delta = 0;
4991debfc3dSmrg 
5001debfc3dSmrg   /* First strip off the component references.  Ignore bitfields.
5011debfc3dSmrg      Also strip off the real and imaginary parts of a complex, so that
5021debfc3dSmrg      they can have the same base.  */
5031debfc3dSmrg   if (TREE_CODE (ref) == REALPART_EXPR
5041debfc3dSmrg       || TREE_CODE (ref) == IMAGPART_EXPR
5051debfc3dSmrg       || (TREE_CODE (ref) == COMPONENT_REF
5061debfc3dSmrg           && DECL_NONADDRESSABLE_P (TREE_OPERAND (ref, 1))))
5071debfc3dSmrg     {
5081debfc3dSmrg       if (TREE_CODE (ref) == IMAGPART_EXPR)
5091debfc3dSmrg         *delta += int_size_in_bytes (TREE_TYPE (ref));
5101debfc3dSmrg       ref = TREE_OPERAND (ref, 0);
5111debfc3dSmrg     }
5121debfc3dSmrg 
5131debfc3dSmrg   *ref_p = ref;
5141debfc3dSmrg 
5151debfc3dSmrg   for (; TREE_CODE (ref) == COMPONENT_REF; ref = TREE_OPERAND (ref, 0))
5161debfc3dSmrg     {
5171debfc3dSmrg       off = DECL_FIELD_BIT_OFFSET (TREE_OPERAND (ref, 1));
5181debfc3dSmrg       bit_offset = TREE_INT_CST_LOW (off);
5191debfc3dSmrg       gcc_assert (bit_offset % BITS_PER_UNIT == 0);
5201debfc3dSmrg 
5211debfc3dSmrg       *delta += bit_offset / BITS_PER_UNIT;
5221debfc3dSmrg     }
5231debfc3dSmrg 
5241debfc3dSmrg   *base = unshare_expr (ref);
5251debfc3dSmrg   ar_data.loop = loop;
5261debfc3dSmrg   ar_data.stmt = stmt;
5271debfc3dSmrg   ar_data.step = step;
5281debfc3dSmrg   ar_data.delta = delta;
5291debfc3dSmrg   return for_each_index (base, idx_analyze_ref, &ar_data);
5301debfc3dSmrg }
5311debfc3dSmrg 
5321debfc3dSmrg /* Record a memory reference REF to the list REFS.  The reference occurs in
5331debfc3dSmrg    LOOP in statement STMT and is a write if WRITE_P is true.  Returns true if the
5341debfc3dSmrg    reference was recorded, false otherwise.  */
5351debfc3dSmrg 
5361debfc3dSmrg static bool
537*8feb0f0bSmrg gather_memory_references_ref (class loop *loop, struct mem_ref_group **refs,
5381debfc3dSmrg 			      tree ref, bool write_p, gimple *stmt)
5391debfc3dSmrg {
5401debfc3dSmrg   tree base, step;
5411debfc3dSmrg   HOST_WIDE_INT delta;
5421debfc3dSmrg   struct mem_ref_group *agrp;
5431debfc3dSmrg 
5441debfc3dSmrg   if (get_base_address (ref) == NULL)
5451debfc3dSmrg     return false;
5461debfc3dSmrg 
5471debfc3dSmrg   if (!analyze_ref (loop, &ref, &base, &step, &delta, stmt))
5481debfc3dSmrg     return false;
5491debfc3dSmrg   /* If analyze_ref fails the default is a NULL_TREE.  We can stop here.  */
5501debfc3dSmrg   if (step == NULL_TREE)
5511debfc3dSmrg     return false;
5521debfc3dSmrg 
5531debfc3dSmrg   /* Stop if the address of BASE could not be taken.  */
5541debfc3dSmrg   if (may_be_nonaddressable_p (base))
5551debfc3dSmrg     return false;
5561debfc3dSmrg 
5571debfc3dSmrg   /* Limit non-constant step prefetching only to the innermost loops and
5581debfc3dSmrg      only when the step is loop invariant in the entire loop nest. */
5591debfc3dSmrg   if (!cst_and_fits_in_hwi (step))
5601debfc3dSmrg     {
5611debfc3dSmrg       if (loop->inner != NULL)
5621debfc3dSmrg         {
5631debfc3dSmrg           if (dump_file && (dump_flags & TDF_DETAILS))
5641debfc3dSmrg             {
5651debfc3dSmrg               fprintf (dump_file, "Memory expression %p\n", (void *) ref);
566a2dc1f3fSmrg 	      print_generic_expr (dump_file, ref, TDF_SLIM);
5671debfc3dSmrg 	      fprintf (dump_file, ":");
5681debfc3dSmrg               dump_mem_details (dump_file, base, step, delta, write_p);
5691debfc3dSmrg               fprintf (dump_file,
5701debfc3dSmrg                        "Ignoring %p, non-constant step prefetching is "
5711debfc3dSmrg                        "limited to innermost loops\n",
5721debfc3dSmrg                        (void *) ref);
5731debfc3dSmrg             }
5741debfc3dSmrg             return false;
5751debfc3dSmrg          }
5761debfc3dSmrg       else
5771debfc3dSmrg         {
5781debfc3dSmrg           if (!expr_invariant_in_loop_p (loop_outermost (loop), step))
5791debfc3dSmrg           {
5801debfc3dSmrg             if (dump_file && (dump_flags & TDF_DETAILS))
5811debfc3dSmrg               {
5821debfc3dSmrg                 fprintf (dump_file, "Memory expression %p\n", (void *) ref);
583a2dc1f3fSmrg 		print_generic_expr (dump_file, ref, TDF_SLIM);
5841debfc3dSmrg                 fprintf (dump_file, ":");
5851debfc3dSmrg                 dump_mem_details (dump_file, base, step, delta, write_p);
5861debfc3dSmrg                 fprintf (dump_file,
5871debfc3dSmrg                          "Not prefetching, ignoring %p due to "
5881debfc3dSmrg                          "loop variant step\n",
5891debfc3dSmrg                          (void *) ref);
5901debfc3dSmrg               }
5911debfc3dSmrg               return false;
5921debfc3dSmrg             }
5931debfc3dSmrg         }
5941debfc3dSmrg     }
5951debfc3dSmrg 
5961debfc3dSmrg   /* Now we know that REF = &BASE + STEP * iter + DELTA, where DELTA and STEP
5971debfc3dSmrg      are integer constants.  */
5981debfc3dSmrg   agrp = find_or_create_group (refs, base, step);
5991debfc3dSmrg   record_ref (agrp, stmt, ref, delta, write_p);
6001debfc3dSmrg 
6011debfc3dSmrg   return true;
6021debfc3dSmrg }
6031debfc3dSmrg 
6041debfc3dSmrg /* Record the suitable memory references in LOOP.  NO_OTHER_REFS is set to
6051debfc3dSmrg    true if there are no other memory references inside the loop.  */
6061debfc3dSmrg 
6071debfc3dSmrg static struct mem_ref_group *
608*8feb0f0bSmrg gather_memory_references (class loop *loop, bool *no_other_refs, unsigned *ref_count)
6091debfc3dSmrg {
6101debfc3dSmrg   basic_block *body = get_loop_body_in_dom_order (loop);
6111debfc3dSmrg   basic_block bb;
6121debfc3dSmrg   unsigned i;
6131debfc3dSmrg   gimple_stmt_iterator bsi;
6141debfc3dSmrg   gimple *stmt;
6151debfc3dSmrg   tree lhs, rhs;
6161debfc3dSmrg   struct mem_ref_group *refs = NULL;
6171debfc3dSmrg 
6181debfc3dSmrg   *no_other_refs = true;
6191debfc3dSmrg   *ref_count = 0;
6201debfc3dSmrg 
6211debfc3dSmrg   /* Scan the loop body in order, so that the former references precede the
6221debfc3dSmrg      later ones.  */
6231debfc3dSmrg   for (i = 0; i < loop->num_nodes; i++)
6241debfc3dSmrg     {
6251debfc3dSmrg       bb = body[i];
6261debfc3dSmrg       if (bb->loop_father != loop)
6271debfc3dSmrg 	continue;
6281debfc3dSmrg 
6291debfc3dSmrg       for (bsi = gsi_start_bb (bb); !gsi_end_p (bsi); gsi_next (&bsi))
6301debfc3dSmrg 	{
6311debfc3dSmrg 	  stmt = gsi_stmt (bsi);
6321debfc3dSmrg 
6331debfc3dSmrg 	  if (gimple_code (stmt) != GIMPLE_ASSIGN)
6341debfc3dSmrg 	    {
6351debfc3dSmrg 	      if (gimple_vuse (stmt)
6361debfc3dSmrg 		  || (is_gimple_call (stmt)
6371debfc3dSmrg 		      && !(gimple_call_flags (stmt) & ECF_CONST)))
6381debfc3dSmrg 		*no_other_refs = false;
6391debfc3dSmrg 	      continue;
6401debfc3dSmrg 	    }
6411debfc3dSmrg 
6421debfc3dSmrg 	  if (! gimple_vuse (stmt))
6431debfc3dSmrg 	    continue;
6441debfc3dSmrg 
6451debfc3dSmrg 	  lhs = gimple_assign_lhs (stmt);
6461debfc3dSmrg 	  rhs = gimple_assign_rhs1 (stmt);
6471debfc3dSmrg 
6481debfc3dSmrg 	  if (REFERENCE_CLASS_P (rhs))
6491debfc3dSmrg 	    {
6501debfc3dSmrg 	    *no_other_refs &= gather_memory_references_ref (loop, &refs,
6511debfc3dSmrg 							    rhs, false, stmt);
6521debfc3dSmrg 	    *ref_count += 1;
6531debfc3dSmrg 	    }
6541debfc3dSmrg 	  if (REFERENCE_CLASS_P (lhs))
6551debfc3dSmrg 	    {
6561debfc3dSmrg 	    *no_other_refs &= gather_memory_references_ref (loop, &refs,
6571debfc3dSmrg 							    lhs, true, stmt);
6581debfc3dSmrg 	    *ref_count += 1;
6591debfc3dSmrg 	    }
6601debfc3dSmrg 	}
6611debfc3dSmrg     }
6621debfc3dSmrg   free (body);
6631debfc3dSmrg 
6641debfc3dSmrg   return refs;
6651debfc3dSmrg }
6661debfc3dSmrg 
6671debfc3dSmrg /* Prune the prefetch candidate REF using the self-reuse.  */
6681debfc3dSmrg 
6691debfc3dSmrg static void
6701debfc3dSmrg prune_ref_by_self_reuse (struct mem_ref *ref)
6711debfc3dSmrg {
6721debfc3dSmrg   HOST_WIDE_INT step;
6731debfc3dSmrg   bool backward;
6741debfc3dSmrg 
6751debfc3dSmrg   /* If the step size is non-constant, we cannot calculate prefetch_mod.  */
6761debfc3dSmrg   if (!cst_and_fits_in_hwi (ref->group->step))
6771debfc3dSmrg     return;
6781debfc3dSmrg 
6791debfc3dSmrg   step = int_cst_value (ref->group->step);
6801debfc3dSmrg 
6811debfc3dSmrg   backward = step < 0;
6821debfc3dSmrg 
6831debfc3dSmrg   if (step == 0)
6841debfc3dSmrg     {
6851debfc3dSmrg       /* Prefetch references to invariant address just once.  */
6861debfc3dSmrg       ref->prefetch_before = 1;
6871debfc3dSmrg       return;
6881debfc3dSmrg     }
6891debfc3dSmrg 
6901debfc3dSmrg   if (backward)
6911debfc3dSmrg     step = -step;
6921debfc3dSmrg 
6931debfc3dSmrg   if (step > PREFETCH_BLOCK)
6941debfc3dSmrg     return;
6951debfc3dSmrg 
6961debfc3dSmrg   if ((backward && HAVE_BACKWARD_PREFETCH)
6971debfc3dSmrg       || (!backward && HAVE_FORWARD_PREFETCH))
6981debfc3dSmrg     {
6991debfc3dSmrg       ref->prefetch_before = 1;
7001debfc3dSmrg       return;
7011debfc3dSmrg     }
7021debfc3dSmrg 
7031debfc3dSmrg   ref->prefetch_mod = PREFETCH_BLOCK / step;
7041debfc3dSmrg }
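
/* For instance, with a 64-byte PREFETCH_BLOCK (the line size assumed in the
   overview example above) and a constant forward step of 16 bytes, and no
   hardware prefetcher, the reference moves to a new cache line only every
   64 / 16 = 4 iterations, so prefetch_mod becomes 4; this matches
   reference (3) in the overview at the top of the file.  */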
7051debfc3dSmrg 
7061debfc3dSmrg /* Divides X by BY, rounding down.  */
7071debfc3dSmrg 
7081debfc3dSmrg static HOST_WIDE_INT
7091debfc3dSmrg ddown (HOST_WIDE_INT x, unsigned HOST_WIDE_INT by)
7101debfc3dSmrg {
7111debfc3dSmrg   gcc_assert (by > 0);
7121debfc3dSmrg 
7131debfc3dSmrg   if (x >= 0)
7141debfc3dSmrg     return x / (HOST_WIDE_INT) by;
7151debfc3dSmrg   else
7161debfc3dSmrg     return (x + (HOST_WIDE_INT) by - 1) / (HOST_WIDE_INT) by;
7171debfc3dSmrg }
7181debfc3dSmrg 
7191debfc3dSmrg /* Given a CACHE_LINE_SIZE and two inductive memory references
7201debfc3dSmrg    with a common STEP greater than CACHE_LINE_SIZE and an address
7211debfc3dSmrg    difference DELTA, compute the probability that they will fall
7221debfc3dSmrg    in different cache lines.  Return true if the computed miss rate
7231debfc3dSmrg    is not greater than the ACCEPTABLE_MISS_RATE.  DISTINCT_ITERS is the
7241debfc3dSmrg    number of distinct iterations after which the pattern repeats itself.
7251debfc3dSmrg    ALIGN_UNIT is the unit of alignment in bytes.  */
7261debfc3dSmrg 
7271debfc3dSmrg static bool
7281debfc3dSmrg is_miss_rate_acceptable (unsigned HOST_WIDE_INT cache_line_size,
7291debfc3dSmrg 		   HOST_WIDE_INT step, HOST_WIDE_INT delta,
7301debfc3dSmrg 		   unsigned HOST_WIDE_INT distinct_iters,
7311debfc3dSmrg 		   int align_unit)
7321debfc3dSmrg {
7331debfc3dSmrg   unsigned align, iter;
7341debfc3dSmrg   int total_positions, miss_positions, max_allowed_miss_positions;
7351debfc3dSmrg   int address1, address2, cache_line1, cache_line2;
7361debfc3dSmrg 
7371debfc3dSmrg   /* It always misses if delta is greater than or equal to the cache
7381debfc3dSmrg      line size.  */
7391debfc3dSmrg   if (delta >= (HOST_WIDE_INT) cache_line_size)
7401debfc3dSmrg     return false;
7411debfc3dSmrg 
7421debfc3dSmrg   miss_positions = 0;
7431debfc3dSmrg   total_positions = (cache_line_size / align_unit) * distinct_iters;
7441debfc3dSmrg   max_allowed_miss_positions = (ACCEPTABLE_MISS_RATE * total_positions) / 1000;
7451debfc3dSmrg 
7461debfc3dSmrg   /* Iterate through all possible alignments of the first
7471debfc3dSmrg      memory reference within its cache line.  */
7481debfc3dSmrg   for (align = 0; align < cache_line_size; align += align_unit)
7491debfc3dSmrg 
7501debfc3dSmrg     /* Iterate through all distinct iterations.  */
7511debfc3dSmrg     for (iter = 0; iter < distinct_iters; iter++)
7521debfc3dSmrg       {
7531debfc3dSmrg 	address1 = align + step * iter;
7541debfc3dSmrg 	address2 = address1 + delta;
7551debfc3dSmrg 	cache_line1 = address1 / cache_line_size;
7561debfc3dSmrg 	cache_line2 = address2 / cache_line_size;
7571debfc3dSmrg 	if (cache_line1 != cache_line2)
7581debfc3dSmrg 	  {
7591debfc3dSmrg 	    miss_positions += 1;
7601debfc3dSmrg             if (miss_positions > max_allowed_miss_positions)
7611debfc3dSmrg 	      return false;
7621debfc3dSmrg           }
7631debfc3dSmrg       }
7641debfc3dSmrg   return true;
7651debfc3dSmrg }
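
/* A worked example (hypothetical values): cache_line_size = 64, step = 96,
   delta = 32, distinct_iters = 2 and align_unit = 8 give total_positions
   = (64 / 8) * 2 = 16 and max_allowed_miss_positions = 50 * 16 / 1000 = 0.
   Half of the 16 (alignment, iteration) pairs place the two addresses in
   different cache lines, so the very first miss already exceeds the limit
   and the function returns false.  */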
7661debfc3dSmrg 
7671debfc3dSmrg /* Prune the prefetch candidate REF using the reuse with BY.
7681debfc3dSmrg    If BY_IS_BEFORE is true, BY is before REF in the loop.  */
7691debfc3dSmrg 
7701debfc3dSmrg static void
7711debfc3dSmrg prune_ref_by_group_reuse (struct mem_ref *ref, struct mem_ref *by,
7721debfc3dSmrg 			  bool by_is_before)
7731debfc3dSmrg {
7741debfc3dSmrg   HOST_WIDE_INT step;
7751debfc3dSmrg   bool backward;
7761debfc3dSmrg   HOST_WIDE_INT delta_r = ref->delta, delta_b = by->delta;
7771debfc3dSmrg   HOST_WIDE_INT delta = delta_b - delta_r;
7781debfc3dSmrg   HOST_WIDE_INT hit_from;
7791debfc3dSmrg   unsigned HOST_WIDE_INT prefetch_before, prefetch_block;
7801debfc3dSmrg   HOST_WIDE_INT reduced_step;
7811debfc3dSmrg   unsigned HOST_WIDE_INT reduced_prefetch_block;
7821debfc3dSmrg   tree ref_type;
7831debfc3dSmrg   int align_unit;
7841debfc3dSmrg 
7851debfc3dSmrg   /* If the step is non-constant, we cannot calculate prefetch_before.  */
7861debfc3dSmrg   if (!cst_and_fits_in_hwi (ref->group->step))
7871debfc3dSmrg     return;
7891debfc3dSmrg 
7901debfc3dSmrg   step = int_cst_value (ref->group->step);
7911debfc3dSmrg 
7921debfc3dSmrg   backward = step < 0;
7931debfc3dSmrg 
7941debfc3dSmrg 
7951debfc3dSmrg   if (delta == 0)
7961debfc3dSmrg     {
7971debfc3dSmrg       /* If the references have the same address, only prefetch the
7981debfc3dSmrg 	 former.  */
7991debfc3dSmrg       if (by_is_before)
8001debfc3dSmrg 	ref->prefetch_before = 0;
8011debfc3dSmrg 
8021debfc3dSmrg       return;
8031debfc3dSmrg     }
8041debfc3dSmrg 
8051debfc3dSmrg   if (!step)
8061debfc3dSmrg     {
8071debfc3dSmrg       /* If the reference addresses are invariant and fall into the
8081debfc3dSmrg 	 same cache line, prefetch just the first one.  */
8091debfc3dSmrg       if (!by_is_before)
8101debfc3dSmrg 	return;
8111debfc3dSmrg 
8121debfc3dSmrg       if (ddown (ref->delta, PREFETCH_BLOCK)
8131debfc3dSmrg 	  != ddown (by->delta, PREFETCH_BLOCK))
8141debfc3dSmrg 	return;
8151debfc3dSmrg 
8161debfc3dSmrg       ref->prefetch_before = 0;
8171debfc3dSmrg       return;
8181debfc3dSmrg     }
8191debfc3dSmrg 
8201debfc3dSmrg   /* Only prune the reference that is behind in the array.  */
8211debfc3dSmrg   if (backward)
8221debfc3dSmrg     {
8231debfc3dSmrg       if (delta > 0)
8241debfc3dSmrg 	return;
8251debfc3dSmrg 
8261debfc3dSmrg       /* Transform the data so that we may assume that the accesses
8271debfc3dSmrg 	 are forward.  */
8281debfc3dSmrg       delta = - delta;
8291debfc3dSmrg       step = -step;
8301debfc3dSmrg       delta_r = PREFETCH_BLOCK - 1 - delta_r;
8311debfc3dSmrg       delta_b = PREFETCH_BLOCK - 1 - delta_b;
8321debfc3dSmrg     }
8331debfc3dSmrg   else
8341debfc3dSmrg     {
8351debfc3dSmrg       if (delta < 0)
8361debfc3dSmrg 	return;
8371debfc3dSmrg     }
8381debfc3dSmrg 
8391debfc3dSmrg   /* Check whether the two references are likely to hit the same cache
8401debfc3dSmrg      line, and how far apart the iterations in which this occurs are
8411debfc3dSmrg      from each other.  */
8421debfc3dSmrg 
8431debfc3dSmrg   if (step <= PREFETCH_BLOCK)
8441debfc3dSmrg     {
8451debfc3dSmrg       /* The accesses are sure to meet.  Let us check when.  */
8461debfc3dSmrg       hit_from = ddown (delta_b, PREFETCH_BLOCK) * PREFETCH_BLOCK;
8471debfc3dSmrg       prefetch_before = (hit_from - delta_r + step - 1) / step;
8481debfc3dSmrg 
8491debfc3dSmrg       /* Do not reduce prefetch_before if we meet beyond cache size.  */
8501debfc3dSmrg       if (prefetch_before > absu_hwi (L2_CACHE_SIZE_BYTES / step))
8511debfc3dSmrg         prefetch_before = PREFETCH_ALL;
8521debfc3dSmrg       if (prefetch_before < ref->prefetch_before)
8531debfc3dSmrg 	ref->prefetch_before = prefetch_before;
8541debfc3dSmrg 
8551debfc3dSmrg       return;
8561debfc3dSmrg     }
8571debfc3dSmrg 
8581debfc3dSmrg   /* A more complicated case with step > prefetch_block.  First reduce
8591debfc3dSmrg      the ratio between the step and the cache line size to its simplest
8601debfc3dSmrg      terms.  The resulting denominator will then represent the number of
8611debfc3dSmrg      distinct iterations after which each address will go back to its
8621debfc3dSmrg      initial location within the cache line.  This computation assumes
8631debfc3dSmrg      that PREFETCH_BLOCK is a power of two.  */
8641debfc3dSmrg   prefetch_block = PREFETCH_BLOCK;
8651debfc3dSmrg   reduced_prefetch_block = prefetch_block;
8661debfc3dSmrg   reduced_step = step;
8671debfc3dSmrg   while ((reduced_step & 1) == 0
8681debfc3dSmrg 	 && reduced_prefetch_block > 1)
8691debfc3dSmrg     {
8701debfc3dSmrg       reduced_step >>= 1;
8711debfc3dSmrg       reduced_prefetch_block >>= 1;
8721debfc3dSmrg     }
8731debfc3dSmrg 
8741debfc3dSmrg   prefetch_before = delta / step;
8751debfc3dSmrg   delta %= step;
8761debfc3dSmrg   ref_type = TREE_TYPE (ref->mem);
8771debfc3dSmrg   align_unit = TYPE_ALIGN (ref_type) / 8;
8781debfc3dSmrg   if (is_miss_rate_acceptable (prefetch_block, step, delta,
8791debfc3dSmrg 			       reduced_prefetch_block, align_unit))
8801debfc3dSmrg     {
8811debfc3dSmrg       /* Do not reduce prefetch_before if we meet beyond cache size.  */
8821debfc3dSmrg       if (prefetch_before > L2_CACHE_SIZE_BYTES / PREFETCH_BLOCK)
8831debfc3dSmrg         prefetch_before = PREFETCH_ALL;
8841debfc3dSmrg       if (prefetch_before < ref->prefetch_before)
8851debfc3dSmrg 	ref->prefetch_before = prefetch_before;
8861debfc3dSmrg 
8871debfc3dSmrg       return;
8881debfc3dSmrg     }
8891debfc3dSmrg 
8901debfc3dSmrg   /* Try also the following iteration.  */
8911debfc3dSmrg   prefetch_before++;
8921debfc3dSmrg   delta = step - delta;
8931debfc3dSmrg   if (is_miss_rate_acceptable (prefetch_block, step, delta,
8941debfc3dSmrg 			       reduced_prefetch_block, align_unit))
8951debfc3dSmrg     {
8961debfc3dSmrg       if (prefetch_before < ref->prefetch_before)
8971debfc3dSmrg 	ref->prefetch_before = prefetch_before;
8981debfc3dSmrg 
8991debfc3dSmrg       return;
9001debfc3dSmrg     }
9011debfc3dSmrg 
9021debfc3dSmrg   /* REF probably does not reuse BY.  */
9031debfc3dSmrg   return;
9041debfc3dSmrg }
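
/* Tying this back to the overview example: pruning reference (1) a[i]
   (delta 0) against (2) a[i + 64] (delta 64) in a group with step 1 takes
   the step <= PREFETCH_BLOCK path: hit_from = ddown (64, 64) * 64 = 64 and
   prefetch_before = (64 - 0 + 1 - 1) / 1 = 64, which is the PREFETCH_BEFORE
   64 quoted for reference (1) at the top of the file.  */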
9051debfc3dSmrg 
9061debfc3dSmrg /* Prune the prefetch candidate REF using the reuses with other references
9071debfc3dSmrg    in REFS.  */
9081debfc3dSmrg 
9091debfc3dSmrg static void
9101debfc3dSmrg prune_ref_by_reuse (struct mem_ref *ref, struct mem_ref *refs)
9111debfc3dSmrg {
9121debfc3dSmrg   struct mem_ref *prune_by;
9131debfc3dSmrg   bool before = true;
9141debfc3dSmrg 
9151debfc3dSmrg   prune_ref_by_self_reuse (ref);
9161debfc3dSmrg 
9171debfc3dSmrg   for (prune_by = refs; prune_by; prune_by = prune_by->next)
9181debfc3dSmrg     {
9191debfc3dSmrg       if (prune_by == ref)
9201debfc3dSmrg 	{
9211debfc3dSmrg 	  before = false;
9221debfc3dSmrg 	  continue;
9231debfc3dSmrg 	}
9241debfc3dSmrg 
9251debfc3dSmrg       if (!WRITE_CAN_USE_READ_PREFETCH
9261debfc3dSmrg 	  && ref->write_p
9271debfc3dSmrg 	  && !prune_by->write_p)
9281debfc3dSmrg 	continue;
9291debfc3dSmrg       if (!READ_CAN_USE_WRITE_PREFETCH
9301debfc3dSmrg 	  && !ref->write_p
9311debfc3dSmrg 	  && prune_by->write_p)
9321debfc3dSmrg 	continue;
9331debfc3dSmrg 
9341debfc3dSmrg       prune_ref_by_group_reuse (ref, prune_by, before);
9351debfc3dSmrg     }
9361debfc3dSmrg }
9371debfc3dSmrg 
9381debfc3dSmrg /* Prune the prefetch candidates in GROUP using the reuse analysis.  */
9391debfc3dSmrg 
9401debfc3dSmrg static void
9411debfc3dSmrg prune_group_by_reuse (struct mem_ref_group *group)
9421debfc3dSmrg {
9431debfc3dSmrg   struct mem_ref *ref_pruned;
9441debfc3dSmrg 
9451debfc3dSmrg   for (ref_pruned = group->refs; ref_pruned; ref_pruned = ref_pruned->next)
9461debfc3dSmrg     {
9471debfc3dSmrg       prune_ref_by_reuse (ref_pruned, group->refs);
9481debfc3dSmrg 
9491debfc3dSmrg       if (dump_file && (dump_flags & TDF_DETAILS))
9501debfc3dSmrg 	{
951a2dc1f3fSmrg 	  dump_mem_ref (dump_file, ref_pruned);
9521debfc3dSmrg 
9531debfc3dSmrg 	  if (ref_pruned->prefetch_before == PREFETCH_ALL
9541debfc3dSmrg 	      && ref_pruned->prefetch_mod == 1)
9551debfc3dSmrg 	    fprintf (dump_file, " no restrictions");
9561debfc3dSmrg 	  else if (ref_pruned->prefetch_before == 0)
9571debfc3dSmrg 	    fprintf (dump_file, " do not prefetch");
9581debfc3dSmrg 	  else if (ref_pruned->prefetch_before <= ref_pruned->prefetch_mod)
9591debfc3dSmrg 	    fprintf (dump_file, " prefetch once");
9601debfc3dSmrg 	  else
9611debfc3dSmrg 	    {
9621debfc3dSmrg 	      if (ref_pruned->prefetch_before != PREFETCH_ALL)
9631debfc3dSmrg 		{
9641debfc3dSmrg 		  fprintf (dump_file, " prefetch before ");
9651debfc3dSmrg 		  fprintf (dump_file, HOST_WIDE_INT_PRINT_DEC,
9661debfc3dSmrg 			   ref_pruned->prefetch_before);
9671debfc3dSmrg 		}
9681debfc3dSmrg 	      if (ref_pruned->prefetch_mod != 1)
9691debfc3dSmrg 		{
9701debfc3dSmrg 		  fprintf (dump_file, " prefetch mod ");
9711debfc3dSmrg 		  fprintf (dump_file, HOST_WIDE_INT_PRINT_DEC,
9721debfc3dSmrg 			   ref_pruned->prefetch_mod);
9731debfc3dSmrg 		}
9741debfc3dSmrg 	    }
9751debfc3dSmrg 	  fprintf (dump_file, "\n");
9761debfc3dSmrg 	}
9771debfc3dSmrg     }
9781debfc3dSmrg }
9791debfc3dSmrg 
9801debfc3dSmrg /* Prune the list of prefetch candidates GROUPS using the reuse analysis.  */
9811debfc3dSmrg 
9821debfc3dSmrg static void
9831debfc3dSmrg prune_by_reuse (struct mem_ref_group *groups)
9841debfc3dSmrg {
9851debfc3dSmrg   for (; groups; groups = groups->next)
9861debfc3dSmrg     prune_group_by_reuse (groups);
9871debfc3dSmrg }
9881debfc3dSmrg 
9891debfc3dSmrg /* Returns true if we should issue prefetch for REF.  */
9901debfc3dSmrg 
9911debfc3dSmrg static bool
9921debfc3dSmrg should_issue_prefetch_p (struct mem_ref *ref)
9931debfc3dSmrg {
994c0a68be4Smrg   /* Do we want to issue prefetches for non-constant strides?  */
995*8feb0f0bSmrg   if (!cst_and_fits_in_hwi (ref->group->step)
996*8feb0f0bSmrg       && param_prefetch_dynamic_strides == 0)
997c0a68be4Smrg     {
998c0a68be4Smrg       if (dump_file && (dump_flags & TDF_DETAILS))
999c0a68be4Smrg 	fprintf (dump_file,
1000c0a68be4Smrg 		 "Skipping non-constant step for reference %u:%u\n",
1001c0a68be4Smrg 		 ref->group->uid, ref->uid);
1002c0a68be4Smrg       return false;
1003c0a68be4Smrg     }
1004c0a68be4Smrg 
1005c0a68be4Smrg   /* Some processors may have a hardware prefetcher that may conflict with
1006c0a68be4Smrg      prefetch hints for a range of strides.  Make sure we don't issue
1007c0a68be4Smrg      prefetches for such cases if the stride is within this particular
1008c0a68be4Smrg      range.  */
1009c0a68be4Smrg   if (cst_and_fits_in_hwi (ref->group->step)
1010c0a68be4Smrg       && abs_hwi (int_cst_value (ref->group->step))
1011*8feb0f0bSmrg 	  < (HOST_WIDE_INT) param_prefetch_minimum_stride)
1012c0a68be4Smrg     {
1013c0a68be4Smrg       if (dump_file && (dump_flags & TDF_DETAILS))
1014c0a68be4Smrg 	fprintf (dump_file,
1015c0a68be4Smrg 		 "Step for reference %u:%u (" HOST_WIDE_INT_PRINT_DEC
1016c0a68be4Smrg 		 ") is less than the minimum required stride of %d\n",
1017c0a68be4Smrg 		 ref->group->uid, ref->uid, int_cst_value (ref->group->step),
1018*8feb0f0bSmrg 		 param_prefetch_minimum_stride);
1019c0a68be4Smrg       return false;
1020c0a68be4Smrg     }
1021c0a68be4Smrg 
10221debfc3dSmrg   /* For now do not issue prefetches that would only be useful for the
10231debfc3dSmrg      first few iterations.  */
10241debfc3dSmrg   if (ref->prefetch_before != PREFETCH_ALL)
10251debfc3dSmrg     {
10261debfc3dSmrg       if (dump_file && (dump_flags & TDF_DETAILS))
1027a2dc1f3fSmrg         fprintf (dump_file, "Ignoring reference %u:%u due to prefetch_before\n",
1028a2dc1f3fSmrg 		 ref->group->uid, ref->uid);
10291debfc3dSmrg       return false;
10301debfc3dSmrg     }
10311debfc3dSmrg 
10321debfc3dSmrg   /* Do not prefetch nontemporal stores.  */
10331debfc3dSmrg   if (ref->storent_p)
10341debfc3dSmrg     {
10351debfc3dSmrg       if (dump_file && (dump_flags & TDF_DETAILS))
1036a2dc1f3fSmrg         fprintf (dump_file, "Ignoring nontemporal store reference %u:%u\n", ref->group->uid, ref->uid);
10371debfc3dSmrg       return false;
10381debfc3dSmrg     }
10391debfc3dSmrg 
10401debfc3dSmrg   return true;
10411debfc3dSmrg }
10421debfc3dSmrg 
10431debfc3dSmrg /* Decide which of the prefetch candidates in GROUPS to prefetch.
10441debfc3dSmrg    AHEAD is the number of iterations to prefetch ahead (which corresponds
10451debfc3dSmrg    to the number of simultaneous instances of one prefetch running at a
10461debfc3dSmrg    time).  UNROLL_FACTOR is the factor by which the loop is going to be
10471debfc3dSmrg    unrolled.  Returns true if there is anything to prefetch.  */
10481debfc3dSmrg 
10491debfc3dSmrg static bool
10501debfc3dSmrg schedule_prefetches (struct mem_ref_group *groups, unsigned unroll_factor,
10511debfc3dSmrg 		     unsigned ahead)
10521debfc3dSmrg {
10531debfc3dSmrg   unsigned remaining_prefetch_slots, n_prefetches, prefetch_slots;
10541debfc3dSmrg   unsigned slots_per_prefetch;
10551debfc3dSmrg   struct mem_ref *ref;
10561debfc3dSmrg   bool any = false;
10571debfc3dSmrg 
1058*8feb0f0bSmrg   /* At most param_simultaneous_prefetches should be running
1059*8feb0f0bSmrg      at the same time.  */
1060*8feb0f0bSmrg   remaining_prefetch_slots = param_simultaneous_prefetches;
10611debfc3dSmrg 
10621debfc3dSmrg   /* The prefetch will run for AHEAD iterations of the original loop, i.e.,
10631debfc3dSmrg      AHEAD / UNROLL_FACTOR iterations of the unrolled loop.  In each iteration,
10641debfc3dSmrg      it will need a prefetch slot.  */
10651debfc3dSmrg   slots_per_prefetch = (ahead + unroll_factor / 2) / unroll_factor;
10661debfc3dSmrg   if (dump_file && (dump_flags & TDF_DETAILS))
10671debfc3dSmrg     fprintf (dump_file, "Each prefetch instruction takes %u prefetch slots.\n",
10681debfc3dSmrg 	     slots_per_prefetch);
10691debfc3dSmrg 
10701debfc3dSmrg   /* For now we just take memory references one by one and issue
10711debfc3dSmrg      prefetches for as many as possible.  The groups are sorted
10721debfc3dSmrg      starting with the largest step, since the references with
10731debfc3dSmrg      large step are more likely to cause many cache misses.  */
10741debfc3dSmrg 
10751debfc3dSmrg   for (; groups; groups = groups->next)
10761debfc3dSmrg     for (ref = groups->refs; ref; ref = ref->next)
10771debfc3dSmrg       {
10781debfc3dSmrg 	if (!should_issue_prefetch_p (ref))
10791debfc3dSmrg 	  continue;
10801debfc3dSmrg 
10811debfc3dSmrg         /* The loop is far from being sufficiently unrolled for this
10821debfc3dSmrg            prefetch.  Do not generate the prefetch to avoid many redundant
10831debfc3dSmrg            prefetches.  */
10841debfc3dSmrg         if (ref->prefetch_mod / unroll_factor > PREFETCH_MOD_TO_UNROLL_FACTOR_RATIO)
10851debfc3dSmrg           continue;
10861debfc3dSmrg 
10871debfc3dSmrg 	/* If we need to prefetch the reference each PREFETCH_MOD iterations,
10881debfc3dSmrg 	   and we unroll the loop UNROLL_FACTOR times, we need to insert
10891debfc3dSmrg 	   ceil (UNROLL_FACTOR / PREFETCH_MOD) instructions in each
10901debfc3dSmrg 	   iteration.  */
10911debfc3dSmrg 	n_prefetches = ((unroll_factor + ref->prefetch_mod - 1)
10921debfc3dSmrg 			/ ref->prefetch_mod);
10931debfc3dSmrg 	prefetch_slots = n_prefetches * slots_per_prefetch;
10941debfc3dSmrg 
10951debfc3dSmrg 	/* If more than half of the prefetches would be lost anyway, do not
10961debfc3dSmrg 	   issue the prefetch.  */
10971debfc3dSmrg 	if (2 * remaining_prefetch_slots < prefetch_slots)
10981debfc3dSmrg 	  continue;
10991debfc3dSmrg 
1100a2dc1f3fSmrg 	/* Stop prefetching if debug counter is activated.  */
1101a2dc1f3fSmrg 	if (!dbg_cnt (prefetch))
1102a2dc1f3fSmrg 	  continue;
1103a2dc1f3fSmrg 
11041debfc3dSmrg 	ref->issue_prefetch_p = true;
1105a2dc1f3fSmrg 	if (dump_file && (dump_flags & TDF_DETAILS))
1106a2dc1f3fSmrg 	  fprintf (dump_file, "Decided to issue prefetch for reference %u:%u\n",
1107a2dc1f3fSmrg 		   ref->group->uid, ref->uid);
11081debfc3dSmrg 
11091debfc3dSmrg 	if (remaining_prefetch_slots <= prefetch_slots)
11101debfc3dSmrg 	  return true;
11111debfc3dSmrg 	remaining_prefetch_slots -= prefetch_slots;
11121debfc3dSmrg 	any = true;
11131debfc3dSmrg       }
11141debfc3dSmrg 
11151debfc3dSmrg   return any;
11161debfc3dSmrg }
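
/* A small scheduling example (the parameter value is hypothetical): with
   param_simultaneous_prefetches = 6, ahead = 8 and unroll_factor = 4, each
   prefetch instruction occupies (8 + 2) / 4 = 2 slots.  A reference with
   prefetch_mod 1 needs ceil (4 / 1) = 4 prefetches, i.e. 8 slots; since
   2 * 6 >= 8 it is scheduled, and because 6 <= 8 it also uses up all the
   remaining slots, so no further references are considered.  */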
11171debfc3dSmrg 
11181debfc3dSmrg /* Return TRUE if no prefetch is going to be generated in the given
11191debfc3dSmrg    GROUPS.  */
11201debfc3dSmrg 
11211debfc3dSmrg static bool
11221debfc3dSmrg nothing_to_prefetch_p (struct mem_ref_group *groups)
11231debfc3dSmrg {
11241debfc3dSmrg   struct mem_ref *ref;
11251debfc3dSmrg 
11261debfc3dSmrg   for (; groups; groups = groups->next)
11271debfc3dSmrg     for (ref = groups->refs; ref; ref = ref->next)
11281debfc3dSmrg       if (should_issue_prefetch_p (ref))
11291debfc3dSmrg 	return false;
11301debfc3dSmrg 
11311debfc3dSmrg   return true;
11321debfc3dSmrg }
11331debfc3dSmrg 
11341debfc3dSmrg /* Estimate the number of prefetches in the given GROUPS.
11351debfc3dSmrg    UNROLL_FACTOR is the factor by which LOOP was unrolled.  */
11361debfc3dSmrg 
11371debfc3dSmrg static int
11381debfc3dSmrg estimate_prefetch_count (struct mem_ref_group *groups, unsigned unroll_factor)
11391debfc3dSmrg {
11401debfc3dSmrg   struct mem_ref *ref;
11411debfc3dSmrg   unsigned n_prefetches;
11421debfc3dSmrg   int prefetch_count = 0;
11431debfc3dSmrg 
11441debfc3dSmrg   for (; groups; groups = groups->next)
11451debfc3dSmrg     for (ref = groups->refs; ref; ref = ref->next)
11461debfc3dSmrg       if (should_issue_prefetch_p (ref))
11471debfc3dSmrg 	{
11481debfc3dSmrg 	  n_prefetches = ((unroll_factor + ref->prefetch_mod - 1)
11491debfc3dSmrg 			  / ref->prefetch_mod);
11501debfc3dSmrg 	  prefetch_count += n_prefetches;
11511debfc3dSmrg 	}
11521debfc3dSmrg 
11531debfc3dSmrg   return prefetch_count;
11541debfc3dSmrg }
11551debfc3dSmrg 
11561debfc3dSmrg /* Issue prefetches for the reference REF into loop as decided before.
11571debfc3dSmrg    AHEAD is the number of iterations to prefetch ahead.  UNROLL_FACTOR
11581debfc3dSmrg    is the factor by which LOOP was unrolled.  */
11591debfc3dSmrg 
11601debfc3dSmrg static void
11611debfc3dSmrg issue_prefetch_ref (struct mem_ref *ref, unsigned unroll_factor, unsigned ahead)
11621debfc3dSmrg {
11631debfc3dSmrg   HOST_WIDE_INT delta;
11641debfc3dSmrg   tree addr, addr_base, write_p, local, forward;
11651debfc3dSmrg   gcall *prefetch;
11661debfc3dSmrg   gimple_stmt_iterator bsi;
11671debfc3dSmrg   unsigned n_prefetches, ap;
11681debfc3dSmrg   bool nontemporal = ref->reuse_distance >= L2_CACHE_SIZE_BYTES;
11691debfc3dSmrg 
11701debfc3dSmrg   if (dump_file && (dump_flags & TDF_DETAILS))
1171a2dc1f3fSmrg     fprintf (dump_file, "Issued%s prefetch for reference %u:%u.\n",
11721debfc3dSmrg 	     nontemporal ? " nontemporal" : "",
1173a2dc1f3fSmrg 	     ref->group->uid, ref->uid);
11741debfc3dSmrg 
11751debfc3dSmrg   bsi = gsi_for_stmt (ref->stmt);
11761debfc3dSmrg 
11771debfc3dSmrg   n_prefetches = ((unroll_factor + ref->prefetch_mod - 1)
11781debfc3dSmrg 		  / ref->prefetch_mod);
11791debfc3dSmrg   addr_base = build_fold_addr_expr_with_type (ref->mem, ptr_type_node);
11801debfc3dSmrg   addr_base = force_gimple_operand_gsi (&bsi, unshare_expr (addr_base),
11811debfc3dSmrg 					true, NULL, true, GSI_SAME_STMT);
11821debfc3dSmrg   write_p = ref->write_p ? integer_one_node : integer_zero_node;
11831debfc3dSmrg   local = nontemporal ? integer_zero_node : integer_three_node;
11841debfc3dSmrg 
11851debfc3dSmrg   for (ap = 0; ap < n_prefetches; ap++)
11861debfc3dSmrg     {
11871debfc3dSmrg       if (cst_and_fits_in_hwi (ref->group->step))
11881debfc3dSmrg         {
11891debfc3dSmrg           /* Determine the address to prefetch.  */
11901debfc3dSmrg           delta = (ahead + ap * ref->prefetch_mod) *
11911debfc3dSmrg 		   int_cst_value (ref->group->step);
11921debfc3dSmrg           addr = fold_build_pointer_plus_hwi (addr_base, delta);
1193a2dc1f3fSmrg           addr = force_gimple_operand_gsi (&bsi, unshare_expr (addr), true,
1194a2dc1f3fSmrg 					   NULL, true, GSI_SAME_STMT);
11951debfc3dSmrg         }
11961debfc3dSmrg       else
11971debfc3dSmrg         {
11981debfc3dSmrg           /* The step size is non-constant but loop-invariant.  We use the
11991debfc3dSmrg              heuristic of simply prefetching AHEAD iterations ahead.  */
12001debfc3dSmrg           forward = fold_build2 (MULT_EXPR, sizetype,
12011debfc3dSmrg                                  fold_convert (sizetype, ref->group->step),
12021debfc3dSmrg                                  fold_convert (sizetype, size_int (ahead)));
12031debfc3dSmrg           addr = fold_build_pointer_plus (addr_base, forward);
12041debfc3dSmrg           addr = force_gimple_operand_gsi (&bsi, unshare_expr (addr), true,
12051debfc3dSmrg 					   NULL, true, GSI_SAME_STMT);
12061debfc3dSmrg       }
12071debfc3dSmrg 
12081debfc3dSmrg       if (addr_base != addr
12091debfc3dSmrg 	  && TREE_CODE (addr_base) == SSA_NAME
12101debfc3dSmrg 	  && TREE_CODE (addr) == SSA_NAME)
12111debfc3dSmrg 	{
12121debfc3dSmrg 	  duplicate_ssa_name_ptr_info (addr, SSA_NAME_PTR_INFO (addr_base));
12131debfc3dSmrg 	  /* As this isn't a plain copy we have to reset alignment
12141debfc3dSmrg 	     information.  */
12151debfc3dSmrg 	  if (SSA_NAME_PTR_INFO (addr))
12161debfc3dSmrg 	    mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr));
12171debfc3dSmrg 	}
12181debfc3dSmrg 
12191debfc3dSmrg       /* Create the prefetch instruction.  */
12201debfc3dSmrg       prefetch = gimple_build_call (builtin_decl_explicit (BUILT_IN_PREFETCH),
12211debfc3dSmrg 				    3, addr, write_p, local);
12221debfc3dSmrg       gsi_insert_before (&bsi, prefetch, GSI_SAME_STMT);
12231debfc3dSmrg     }
12241debfc3dSmrg }
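/* A rough source-level sketch of the code emitted above (the pass really
   works on GIMPLE; `a', `i' and the constants are hypothetical).  For a
   reference a[i] into a char array with constant byte step STEP, unroll
   factor 4 and prefetch_mod 2, two prefetches result per unrolled body:

     __builtin_prefetch (&a[i] + (ahead + 0 * 2) * STEP, write_p, locality);
     __builtin_prefetch (&a[i] + (ahead + 1 * 2) * STEP, write_p, locality);

   where locality is 0 for nontemporal references and 3 otherwise.  */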
12251debfc3dSmrg 
12261debfc3dSmrg /* Issue prefetches for the references in GROUPS into loop as decided before.
12271debfc3dSmrg    AHEAD is the number of iterations to prefetch ahead.  UNROLL_FACTOR is the
12281debfc3dSmrg    factor by which LOOP was unrolled.  */
12291debfc3dSmrg 
12301debfc3dSmrg static void
12311debfc3dSmrg issue_prefetches (struct mem_ref_group *groups,
12321debfc3dSmrg 		  unsigned unroll_factor, unsigned ahead)
12331debfc3dSmrg {
12341debfc3dSmrg   struct mem_ref *ref;
12351debfc3dSmrg 
12361debfc3dSmrg   for (; groups; groups = groups->next)
12371debfc3dSmrg     for (ref = groups->refs; ref; ref = ref->next)
12381debfc3dSmrg       if (ref->issue_prefetch_p)
12391debfc3dSmrg 	issue_prefetch_ref (ref, unroll_factor, ahead);
12401debfc3dSmrg }
12411debfc3dSmrg 
12421debfc3dSmrg /* Returns true if REF is a memory write for which a nontemporal store insn
12431debfc3dSmrg    can be used.  */
12441debfc3dSmrg 
12451debfc3dSmrg static bool
12461debfc3dSmrg nontemporal_store_p (struct mem_ref *ref)
12471debfc3dSmrg {
12481debfc3dSmrg   machine_mode mode;
12491debfc3dSmrg   enum insn_code code;
12501debfc3dSmrg 
12511debfc3dSmrg   /* REF must be a write that is not reused.  We require it to be independent
12521debfc3dSmrg      of all other memory references in the loop, as the nontemporal stores may
12531debfc3dSmrg      be reordered with respect to other memory references.  */
12541debfc3dSmrg   if (!ref->write_p
12551debfc3dSmrg       || !ref->independent_p
12561debfc3dSmrg       || ref->reuse_distance < L2_CACHE_SIZE_BYTES)
12571debfc3dSmrg     return false;
12581debfc3dSmrg 
12591debfc3dSmrg   /* Check that we have the storent instruction for the mode.  */
12601debfc3dSmrg   mode = TYPE_MODE (TREE_TYPE (ref->mem));
12611debfc3dSmrg   if (mode == BLKmode)
12621debfc3dSmrg     return false;
12631debfc3dSmrg 
12641debfc3dSmrg   code = optab_handler (storent_optab, mode);
12651debfc3dSmrg   return code != CODE_FOR_nothing;
12661debfc3dSmrg }
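/* For illustration: a typical candidate is a store stream that is written
   once and not read again within the loop nest, e.g. (hypothetical)

     for (i = 0; i < n; i++)
       big[i] = 0;

   provided big[] does not fit in L2 (reuse_distance >= L2 size), the store
   is independent of the other references in the loop, and the target
   provides a storent pattern for the mode of big[i].  */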
12671debfc3dSmrg 
12681debfc3dSmrg /* If REF is a nontemporal store, we mark the corresponding modify statement
12691debfc3dSmrg    and return true.  Otherwise, we return false.  */
12701debfc3dSmrg 
12711debfc3dSmrg static bool
mark_nontemporal_store(struct mem_ref * ref)12721debfc3dSmrg mark_nontemporal_store (struct mem_ref *ref)
12731debfc3dSmrg {
12741debfc3dSmrg   if (!nontemporal_store_p (ref))
12751debfc3dSmrg     return false;
12761debfc3dSmrg 
12771debfc3dSmrg   if (dump_file && (dump_flags & TDF_DETAILS))
1278a2dc1f3fSmrg     fprintf (dump_file, "Marked reference %u:%u as a nontemporal store.\n",
1279a2dc1f3fSmrg 	     ref->group->uid, ref->uid);
12801debfc3dSmrg 
12811debfc3dSmrg   gimple_assign_set_nontemporal_move (ref->stmt, true);
12821debfc3dSmrg   ref->storent_p = true;
12831debfc3dSmrg 
12841debfc3dSmrg   return true;
12851debfc3dSmrg }
12861debfc3dSmrg 
12871debfc3dSmrg /* Issue a memory fence instruction after LOOP.  */
12881debfc3dSmrg 
12891debfc3dSmrg static void
1290*8feb0f0bSmrg emit_mfence_after_loop (class loop *loop)
12911debfc3dSmrg {
12921debfc3dSmrg   vec<edge> exits = get_loop_exit_edges (loop);
12931debfc3dSmrg   edge exit;
12941debfc3dSmrg   gcall *call;
12951debfc3dSmrg   gimple_stmt_iterator bsi;
12961debfc3dSmrg   unsigned i;
12971debfc3dSmrg 
12981debfc3dSmrg   FOR_EACH_VEC_ELT (exits, i, exit)
12991debfc3dSmrg     {
13001debfc3dSmrg       call = gimple_build_call (FENCE_FOLLOWING_MOVNT, 0);
13011debfc3dSmrg 
13021debfc3dSmrg       if (!single_pred_p (exit->dest)
13031debfc3dSmrg 	  /* If possible, we prefer not to insert the fence on other paths
13041debfc3dSmrg 	     in cfg.  */
13051debfc3dSmrg 	  && !(exit->flags & EDGE_ABNORMAL))
13061debfc3dSmrg 	split_loop_exit_edge (exit);
13071debfc3dSmrg       bsi = gsi_after_labels (exit->dest);
13081debfc3dSmrg 
13091debfc3dSmrg       gsi_insert_before (&bsi, call, GSI_NEW_STMT);
13101debfc3dSmrg     }
13111debfc3dSmrg 
13121debfc3dSmrg   exits.release ();
13131debfc3dSmrg   update_ssa (TODO_update_ssa_only_virtuals);
13141debfc3dSmrg }
13151debfc3dSmrg 
13161debfc3dSmrg /* Returns true if we can use storent in loop, false otherwise.  */
13171debfc3dSmrg 
13181debfc3dSmrg static bool
1319*8feb0f0bSmrg may_use_storent_in_loop_p (class loop *loop)
13201debfc3dSmrg {
13211debfc3dSmrg   bool ret = true;
13221debfc3dSmrg 
13231debfc3dSmrg   if (loop->inner != NULL)
13241debfc3dSmrg     return false;
13251debfc3dSmrg 
13261debfc3dSmrg   /* If we must issue an mfence insn after using storent, check that there
13271debfc3dSmrg      is a suitable place for it at each of the loop exits.  */
13281debfc3dSmrg   if (FENCE_FOLLOWING_MOVNT != NULL_TREE)
13291debfc3dSmrg     {
13301debfc3dSmrg       vec<edge> exits = get_loop_exit_edges (loop);
13311debfc3dSmrg       unsigned i;
13321debfc3dSmrg       edge exit;
13331debfc3dSmrg 
13341debfc3dSmrg       FOR_EACH_VEC_ELT (exits, i, exit)
13351debfc3dSmrg 	if ((exit->flags & EDGE_ABNORMAL)
13361debfc3dSmrg 	    && exit->dest == EXIT_BLOCK_PTR_FOR_FN (cfun))
13371debfc3dSmrg 	  ret = false;
13381debfc3dSmrg 
13391debfc3dSmrg       exits.release ();
13401debfc3dSmrg     }
13411debfc3dSmrg 
13421debfc3dSmrg   return ret;
13431debfc3dSmrg }
13441debfc3dSmrg 
13451debfc3dSmrg /* Marks nontemporal stores in LOOP.  GROUPS contains the description of memory
13461debfc3dSmrg    references in the loop.  */
13471debfc3dSmrg 
13481debfc3dSmrg static void
1349*8feb0f0bSmrg mark_nontemporal_stores (class loop *loop, struct mem_ref_group *groups)
13501debfc3dSmrg {
13511debfc3dSmrg   struct mem_ref *ref;
13521debfc3dSmrg   bool any = false;
13531debfc3dSmrg 
13541debfc3dSmrg   if (!may_use_storent_in_loop_p (loop))
13551debfc3dSmrg     return;
13561debfc3dSmrg 
13571debfc3dSmrg   for (; groups; groups = groups->next)
13581debfc3dSmrg     for (ref = groups->refs; ref; ref = ref->next)
13591debfc3dSmrg       any |= mark_nontemporal_store (ref);
13601debfc3dSmrg 
13611debfc3dSmrg   if (any && FENCE_FOLLOWING_MOVNT != NULL_TREE)
13621debfc3dSmrg     emit_mfence_after_loop (loop);
13631debfc3dSmrg }
13641debfc3dSmrg 
13651debfc3dSmrg /* Determines whether we can profitably unroll LOOP FACTOR times, and if
13661debfc3dSmrg    this is the case, fill in DESC with the description of the number of
13671debfc3dSmrg    iterations.  */
13681debfc3dSmrg 
13691debfc3dSmrg static bool
1370*8feb0f0bSmrg should_unroll_loop_p (class loop *loop, class tree_niter_desc *desc,
13711debfc3dSmrg 		      unsigned factor)
13721debfc3dSmrg {
13731debfc3dSmrg   if (!can_unroll_loop_p (loop, factor, desc))
13741debfc3dSmrg     return false;
13751debfc3dSmrg 
13761debfc3dSmrg   /* We only consider loops without control flow for unrolling.  This is not
13771debfc3dSmrg      a hard restriction -- tree_unroll_loop works with arbitrary loops
13781debfc3dSmrg      as well; but the unrolling/prefetching is usually more profitable for
13791debfc3dSmrg      loops consisting of a single basic block, and we want to limit the
13801debfc3dSmrg      code growth.  */
13811debfc3dSmrg   if (loop->num_nodes > 2)
13821debfc3dSmrg     return false;
13831debfc3dSmrg 
13841debfc3dSmrg   return true;
13851debfc3dSmrg }
13861debfc3dSmrg 
13871debfc3dSmrg /* Determine the factor by which to unroll LOOP, from the information
13881debfc3dSmrg    contained in the list of memory references REFS.  The description of the
1389a2dc1f3fSmrg    number of iterations of LOOP is stored to DESC.  NINSNS is the number of
13901debfc3dSmrg    insns of the LOOP.  EST_NITER is the estimated number of iterations of
13911debfc3dSmrg    the loop, or -1 if no estimate is available.  */
13921debfc3dSmrg 
13931debfc3dSmrg static unsigned
1394*8feb0f0bSmrg determine_unroll_factor (class loop *loop, struct mem_ref_group *refs,
1395*8feb0f0bSmrg 			 unsigned ninsns, class tree_niter_desc *desc,
13961debfc3dSmrg 			 HOST_WIDE_INT est_niter)
13971debfc3dSmrg {
13981debfc3dSmrg   unsigned upper_bound;
13991debfc3dSmrg   unsigned nfactor, factor, mod_constraint;
14001debfc3dSmrg   struct mem_ref_group *agp;
14011debfc3dSmrg   struct mem_ref *ref;
14021debfc3dSmrg 
14031debfc3dSmrg   /* First check whether the loop is not too large to unroll.  We ignore
14041debfc3dSmrg      PARAM_MAX_UNROLL_TIMES, because for small loops, it prevented us
14051debfc3dSmrg      from unrolling them enough to make exactly one cache line covered by each
14061debfc3dSmrg      iteration.  Also, the goal of PARAM_MAX_UNROLL_TIMES is to prevent
14071debfc3dSmrg      us from unrolling the loops too many times in cases where we only expect
14081debfc3dSmrg      gains from better scheduling and decreasing loop overhead, which is not
14091debfc3dSmrg      the case here.  */
1410*8feb0f0bSmrg   upper_bound = param_max_unrolled_insns / ninsns;
14111debfc3dSmrg 
14121debfc3dSmrg   /* If we unrolled the loop more times than it iterates, the unrolled version
14131debfc3dSmrg      of the loop would never be entered.  */
14141debfc3dSmrg   if (est_niter >= 0 && est_niter < (HOST_WIDE_INT) upper_bound)
14151debfc3dSmrg     upper_bound = est_niter;
14161debfc3dSmrg 
14171debfc3dSmrg   if (upper_bound <= 1)
14181debfc3dSmrg     return 1;
14191debfc3dSmrg 
14201debfc3dSmrg   /* Choose the factor so that we may prefetch each cache line just once,
14211debfc3dSmrg      but bound the unrolling by UPPER_BOUND.  */
14221debfc3dSmrg   factor = 1;
14231debfc3dSmrg   for (agp = refs; agp; agp = agp->next)
14241debfc3dSmrg     for (ref = agp->refs; ref; ref = ref->next)
14251debfc3dSmrg       if (should_issue_prefetch_p (ref))
14261debfc3dSmrg 	{
14271debfc3dSmrg 	  mod_constraint = ref->prefetch_mod;
14281debfc3dSmrg 	  nfactor = least_common_multiple (mod_constraint, factor);
14291debfc3dSmrg 	  if (nfactor <= upper_bound)
14301debfc3dSmrg 	    factor = nfactor;
14311debfc3dSmrg 	}
14321debfc3dSmrg 
14331debfc3dSmrg   if (!should_unroll_loop_p (loop, desc, factor))
14341debfc3dSmrg     return 1;
14351debfc3dSmrg 
14361debfc3dSmrg   return factor;
14371debfc3dSmrg }
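/* For illustration with hypothetical numbers: for ninsns = 10 and
   param_max_unrolled_insns = 200 the upper bound is 20; if the references
   selected for prefetching have prefetch_mod 4 and 6, their least common
   multiple 12 is within the bound, so factor 12 is chosen (subject to the
   estimated trip count and to should_unroll_loop_p).  */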
14381debfc3dSmrg 
14391debfc3dSmrg /* Returns the total volume of the memory references REFS, taking into account
14401debfc3dSmrg    reuses in the innermost loop and cache line size.  TODO -- we should also
14411debfc3dSmrg    take into account reuses across the iterations of the loops in the loop
14421debfc3dSmrg    nest.  */
14431debfc3dSmrg 
14441debfc3dSmrg static unsigned
14451debfc3dSmrg volume_of_references (struct mem_ref_group *refs)
14461debfc3dSmrg {
14471debfc3dSmrg   unsigned volume = 0;
14481debfc3dSmrg   struct mem_ref_group *gr;
14491debfc3dSmrg   struct mem_ref *ref;
14501debfc3dSmrg 
14511debfc3dSmrg   for (gr = refs; gr; gr = gr->next)
14521debfc3dSmrg     for (ref = gr->refs; ref; ref = ref->next)
14531debfc3dSmrg       {
14541debfc3dSmrg 	/* Almost always reuses another value?  */
14551debfc3dSmrg 	if (ref->prefetch_before != PREFETCH_ALL)
14561debfc3dSmrg 	  continue;
14571debfc3dSmrg 
14581debfc3dSmrg 	/* If several iterations access the same cache line, use the size of
14591debfc3dSmrg 	   the line divided by this number.  Otherwise, a cache line is
14601debfc3dSmrg 	   accessed in each iteration.  TODO -- in the latter case, we should
14611debfc3dSmrg 	   take the size of the reference into account, rounding it up to a
14621debfc3dSmrg 	   multiple of the cache line size.  */
1463*8feb0f0bSmrg 	volume += param_l1_cache_line_size / ref->prefetch_mod;
14641debfc3dSmrg       }
14651debfc3dSmrg   return volume;
14661debfc3dSmrg }
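/* For illustration, assuming a 64-byte cache line: a reference with
   prefetch_mod = 1 contributes 64 bytes per iteration, one with
   prefetch_mod = 4 (the same line shared by four iterations) contributes
   16 bytes, and references with prefetch_before != PREFETCH_ALL contribute
   nothing.  */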
14671debfc3dSmrg 
14681debfc3dSmrg /* Returns the volume of memory references accessed across VEC iterations of
14691debfc3dSmrg    loops, whose sizes are described in the LOOP_SIZES array.  N is the number
14701debfc3dSmrg    of the loops in the nest (length of VEC and LOOP_SIZES vectors).  */
14711debfc3dSmrg 
14721debfc3dSmrg static unsigned
14731debfc3dSmrg volume_of_dist_vector (lambda_vector vec, unsigned *loop_sizes, unsigned n)
14741debfc3dSmrg {
14751debfc3dSmrg   unsigned i;
14761debfc3dSmrg 
14771debfc3dSmrg   for (i = 0; i < n; i++)
14781debfc3dSmrg     if (vec[i] != 0)
14791debfc3dSmrg       break;
14801debfc3dSmrg 
14811debfc3dSmrg   if (i == n)
14821debfc3dSmrg     return 0;
14831debfc3dSmrg 
14841debfc3dSmrg   gcc_assert (vec[i] > 0);
14851debfc3dSmrg 
14861debfc3dSmrg   /* We ignore the parts of the distance vector in subloops, since usually
14871debfc3dSmrg      the numbers of iterations are much smaller.  */
14881debfc3dSmrg   return loop_sizes[i] * vec[i];
14891debfc3dSmrg }
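/* For illustration (sizes hypothetical): in a three-deep nest with
   loop_sizes = { 4096, 256, 64 } and distance vector (0, 2, 1), the first
   nonzero entry belongs to the middle loop, so the reuse spans
   2 * 256 = 512 bytes; the innermost component of the vector is ignored.  */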
14901debfc3dSmrg 
14911debfc3dSmrg /* Add the steps of ACCESS_FN multiplied by STRIDE to the array STRIDES
14921debfc3dSmrg    at the position corresponding to the loop of the step.  N is the depth
14931debfc3dSmrg    of the considered loop nest, and LOOP is its innermost loop.  */
14941debfc3dSmrg 
14951debfc3dSmrg static void
14961debfc3dSmrg add_subscript_strides (tree access_fn, unsigned stride,
1497*8feb0f0bSmrg 		       HOST_WIDE_INT *strides, unsigned n, class loop *loop)
14981debfc3dSmrg {
1499*8feb0f0bSmrg   class loop *aloop;
15001debfc3dSmrg   tree step;
15011debfc3dSmrg   HOST_WIDE_INT astep;
15021debfc3dSmrg   unsigned min_depth = loop_depth (loop) - n;
15031debfc3dSmrg 
15041debfc3dSmrg   while (TREE_CODE (access_fn) == POLYNOMIAL_CHREC)
15051debfc3dSmrg     {
15061debfc3dSmrg       aloop = get_chrec_loop (access_fn);
15071debfc3dSmrg       step = CHREC_RIGHT (access_fn);
15081debfc3dSmrg       access_fn = CHREC_LEFT (access_fn);
15091debfc3dSmrg 
15101debfc3dSmrg       if ((unsigned) loop_depth (aloop) <= min_depth)
15111debfc3dSmrg 	continue;
15121debfc3dSmrg 
15131debfc3dSmrg       if (tree_fits_shwi_p (step))
15141debfc3dSmrg 	astep = tree_to_shwi (step);
15151debfc3dSmrg       else
1516*8feb0f0bSmrg 	astep = param_l1_cache_line_size;
15171debfc3dSmrg 
15181debfc3dSmrg       strides[n - 1 - loop_depth (loop) + loop_depth (aloop)] += astep * stride;
15191debfc3dSmrg 
15201debfc3dSmrg     }
15211debfc3dSmrg }
15221debfc3dSmrg 
15231debfc3dSmrg /* Returns the volume of memory references accessed between two consecutive
15241debfc3dSmrg    self-reuses of the reference DR.  We consider the subscripts of DR in N
15251debfc3dSmrg    loops, and LOOP_SIZES contains the volumes of accesses in each of the
15261debfc3dSmrg    loops.  LOOP is the innermost loop of the current loop nest.  */
15271debfc3dSmrg 
15281debfc3dSmrg static unsigned
15291debfc3dSmrg self_reuse_distance (data_reference_p dr, unsigned *loop_sizes, unsigned n,
1530*8feb0f0bSmrg 		     class loop *loop)
15311debfc3dSmrg {
15321debfc3dSmrg   tree stride, access_fn;
15331debfc3dSmrg   HOST_WIDE_INT *strides, astride;
15341debfc3dSmrg   vec<tree> access_fns;
15351debfc3dSmrg   tree ref = DR_REF (dr);
15361debfc3dSmrg   unsigned i, ret = ~0u;
15371debfc3dSmrg 
15381debfc3dSmrg   /* In the following example:
15391debfc3dSmrg 
15401debfc3dSmrg      for (i = 0; i < N; i++)
15411debfc3dSmrg        for (j = 0; j < N; j++)
15421debfc3dSmrg          use (a[j][i]);
15431debfc3dSmrg      the same cache line is accessed every N steps (except if the change from
15441debfc3dSmrg      i to i + 1 crosses the boundary of the cache line).  Thus, for self-reuse,
15451debfc3dSmrg      we cannot rely purely on the results of the data dependence analysis.
15461debfc3dSmrg 
15471debfc3dSmrg      Instead, we compute the stride of the reference in each loop, and consider
15481debfc3dSmrg      the innermost loop in which the stride is less than the cache size.  */
15491debfc3dSmrg 
15501debfc3dSmrg   strides = XCNEWVEC (HOST_WIDE_INT, n);
15511debfc3dSmrg   access_fns = DR_ACCESS_FNS (dr);
15521debfc3dSmrg 
15531debfc3dSmrg   FOR_EACH_VEC_ELT (access_fns, i, access_fn)
15541debfc3dSmrg     {
15551debfc3dSmrg       /* Keep track of the reference corresponding to the subscript, so that we
15561debfc3dSmrg 	 know its stride.  */
15571debfc3dSmrg       while (handled_component_p (ref) && TREE_CODE (ref) != ARRAY_REF)
15581debfc3dSmrg 	ref = TREE_OPERAND (ref, 0);
15591debfc3dSmrg 
15601debfc3dSmrg       if (TREE_CODE (ref) == ARRAY_REF)
15611debfc3dSmrg 	{
15621debfc3dSmrg 	  stride = TYPE_SIZE_UNIT (TREE_TYPE (ref));
15631debfc3dSmrg 	  if (tree_fits_uhwi_p (stride))
15641debfc3dSmrg 	    astride = tree_to_uhwi (stride);
15651debfc3dSmrg 	  else
1566*8feb0f0bSmrg 	    astride = param_l1_cache_line_size;
15671debfc3dSmrg 
15681debfc3dSmrg 	  ref = TREE_OPERAND (ref, 0);
15691debfc3dSmrg 	}
15701debfc3dSmrg       else
15711debfc3dSmrg 	astride = 1;
15721debfc3dSmrg 
15731debfc3dSmrg       add_subscript_strides (access_fn, astride, strides, n, loop);
15741debfc3dSmrg     }
15751debfc3dSmrg 
15761debfc3dSmrg   for (i = n; i-- > 0; )
15771debfc3dSmrg     {
15781debfc3dSmrg       unsigned HOST_WIDE_INT s;
15791debfc3dSmrg 
15801debfc3dSmrg       s = strides[i] < 0 ?  -strides[i] : strides[i];
15811debfc3dSmrg 
1582*8feb0f0bSmrg       if (s < (unsigned) param_l1_cache_line_size
15831debfc3dSmrg 	  && (loop_sizes[i]
15841debfc3dSmrg 	      > (unsigned) (L1_CACHE_SIZE_BYTES / NONTEMPORAL_FRACTION)))
15851debfc3dSmrg 	{
15861debfc3dSmrg 	  ret = loop_sizes[i];
15871debfc3dSmrg 	  break;
15881debfc3dSmrg 	}
15891debfc3dSmrg     }
15901debfc3dSmrg 
15911debfc3dSmrg   free (strides);
15921debfc3dSmrg   return ret;
15931debfc3dSmrg }
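/* For illustration, continuing the a[j][i] example in the comment above
   (shapes hypothetical): the stride with respect to the inner j loop is a
   whole row, N * sizeof (a[0][0]), usually larger than a cache line, so
   that loop gives no self-reuse; the stride with respect to the outer i
   loop is sizeof (a[0][0]), smaller than a line, so the reuse distance
   becomes the data volume accessed by one complete execution of the inner
   loop.  */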
15941debfc3dSmrg 
15951debfc3dSmrg /* Determines the distance until the first reuse of each reference in REFS
15961debfc3dSmrg    in the loop nest of LOOP.  NO_OTHER_REFS is true if there are no other
15971debfc3dSmrg    memory references in the loop.  Return false if the analysis fails.  */
15981debfc3dSmrg 
15991debfc3dSmrg static bool
1600*8feb0f0bSmrg determine_loop_nest_reuse (class loop *loop, struct mem_ref_group *refs,
16011debfc3dSmrg 			   bool no_other_refs)
16021debfc3dSmrg {
1603*8feb0f0bSmrg   class loop *nest, *aloop;
16041debfc3dSmrg   vec<data_reference_p> datarefs = vNULL;
16051debfc3dSmrg   vec<ddr_p> dependences = vNULL;
16061debfc3dSmrg   struct mem_ref_group *gr;
16071debfc3dSmrg   struct mem_ref *ref, *refb;
16081debfc3dSmrg   auto_vec<loop_p> vloops;
16091debfc3dSmrg   unsigned *loop_data_size;
16101debfc3dSmrg   unsigned i, j, n;
16111debfc3dSmrg   unsigned volume, dist, adist;
16121debfc3dSmrg   HOST_WIDE_INT vol;
16131debfc3dSmrg   data_reference_p dr;
16141debfc3dSmrg   ddr_p dep;
16151debfc3dSmrg 
16161debfc3dSmrg   if (loop->inner)
16171debfc3dSmrg     return true;
16181debfc3dSmrg 
16191debfc3dSmrg   /* Find the outermost loop of the loop nest of loop (we require that
16201debfc3dSmrg      there are no sibling loops inside the nest).  */
16211debfc3dSmrg   nest = loop;
16221debfc3dSmrg   while (1)
16231debfc3dSmrg     {
16241debfc3dSmrg       aloop = loop_outer (nest);
16251debfc3dSmrg 
16261debfc3dSmrg       if (aloop == current_loops->tree_root
16271debfc3dSmrg 	  || aloop->inner->next)
16281debfc3dSmrg 	break;
16291debfc3dSmrg 
16301debfc3dSmrg       nest = aloop;
16311debfc3dSmrg     }
16321debfc3dSmrg 
16331debfc3dSmrg   /* For each loop, determine the amount of data accessed in each iteration.
16341debfc3dSmrg      We use this to estimate whether the reference is evicted from the
16351debfc3dSmrg      cache before its reuse.  */
16361debfc3dSmrg   find_loop_nest (nest, &vloops);
16371debfc3dSmrg   n = vloops.length ();
16381debfc3dSmrg   loop_data_size = XNEWVEC (unsigned, n);
16391debfc3dSmrg   volume = volume_of_references (refs);
16401debfc3dSmrg   i = n;
16411debfc3dSmrg   while (i-- != 0)
16421debfc3dSmrg     {
16431debfc3dSmrg       loop_data_size[i] = volume;
16441debfc3dSmrg       /* Bound the volume by the L2 cache size, since above this bound,
16451debfc3dSmrg 	 all dependence distances are equivalent.  */
16461debfc3dSmrg       if (volume > L2_CACHE_SIZE_BYTES)
16471debfc3dSmrg 	continue;
16481debfc3dSmrg 
16491debfc3dSmrg       aloop = vloops[i];
16501debfc3dSmrg       vol = estimated_stmt_executions_int (aloop);
16511debfc3dSmrg       if (vol == -1)
16521debfc3dSmrg 	vol = expected_loop_iterations (aloop);
16531debfc3dSmrg       volume *= vol;
16541debfc3dSmrg     }
16551debfc3dSmrg 
16561debfc3dSmrg   /* Prepare the references in the form suitable for data dependence
16571debfc3dSmrg      analysis.  We ignore unanalyzable data references (the results
16581debfc3dSmrg      are used just as a heuristics to estimate temporality of the
16591debfc3dSmrg      references, hence we do not need to worry about correctness).  */
16601debfc3dSmrg   for (gr = refs; gr; gr = gr->next)
16611debfc3dSmrg     for (ref = gr->refs; ref; ref = ref->next)
16621debfc3dSmrg       {
1663a2dc1f3fSmrg 	dr = create_data_ref (loop_preheader_edge (nest),
1664a2dc1f3fSmrg 			      loop_containing_stmt (ref->stmt),
1665a2dc1f3fSmrg 			      ref->mem, ref->stmt, !ref->write_p, false);
16661debfc3dSmrg 
16671debfc3dSmrg 	if (dr)
16681debfc3dSmrg 	  {
16691debfc3dSmrg 	    ref->reuse_distance = volume;
16701debfc3dSmrg 	    dr->aux = ref;
16711debfc3dSmrg 	    datarefs.safe_push (dr);
16721debfc3dSmrg 	  }
16731debfc3dSmrg 	else
16741debfc3dSmrg 	  no_other_refs = false;
16751debfc3dSmrg       }
16761debfc3dSmrg 
16771debfc3dSmrg   FOR_EACH_VEC_ELT (datarefs, i, dr)
16781debfc3dSmrg     {
16791debfc3dSmrg       dist = self_reuse_distance (dr, loop_data_size, n, loop);
16801debfc3dSmrg       ref = (struct mem_ref *) dr->aux;
16811debfc3dSmrg       if (ref->reuse_distance > dist)
16821debfc3dSmrg 	ref->reuse_distance = dist;
16831debfc3dSmrg 
16841debfc3dSmrg       if (no_other_refs)
16851debfc3dSmrg 	ref->independent_p = true;
16861debfc3dSmrg     }
16871debfc3dSmrg 
16881debfc3dSmrg   if (!compute_all_dependences (datarefs, &dependences, vloops, true))
16891debfc3dSmrg     return false;
16901debfc3dSmrg 
16911debfc3dSmrg   FOR_EACH_VEC_ELT (dependences, i, dep)
16921debfc3dSmrg     {
16931debfc3dSmrg       if (DDR_ARE_DEPENDENT (dep) == chrec_known)
16941debfc3dSmrg 	continue;
16951debfc3dSmrg 
16961debfc3dSmrg       ref = (struct mem_ref *) DDR_A (dep)->aux;
16971debfc3dSmrg       refb = (struct mem_ref *) DDR_B (dep)->aux;
16981debfc3dSmrg 
16991debfc3dSmrg       if (DDR_ARE_DEPENDENT (dep) == chrec_dont_know
1700a2dc1f3fSmrg 	  || DDR_COULD_BE_INDEPENDENT_P (dep)
17011debfc3dSmrg 	  || DDR_NUM_DIST_VECTS (dep) == 0)
17021debfc3dSmrg 	{
17031debfc3dSmrg 	  /* If the dependence cannot be analyzed, assume that there might be
17041debfc3dSmrg 	     a reuse.  */
17051debfc3dSmrg 	  dist = 0;
17061debfc3dSmrg 
17071debfc3dSmrg 	  ref->independent_p = false;
17081debfc3dSmrg 	  refb->independent_p = false;
17091debfc3dSmrg 	}
17101debfc3dSmrg       else
17111debfc3dSmrg 	{
17121debfc3dSmrg 	  /* The distance vectors are normalized to be always lexicographically
17131debfc3dSmrg 	     positive, hence we cannot tell just from them whether DDR_A comes
17141debfc3dSmrg 	     before DDR_B or vice versa.  However, it is not important,
17151debfc3dSmrg 	     anyway -- if DDR_A is close to DDR_B, then it is either reused in
17161debfc3dSmrg 	     DDR_B (and it is not nontemporal), or it reuses the value of DDR_B
17171debfc3dSmrg 	     in cache (and marking it as nontemporal would not affect
17181debfc3dSmrg 	     anything).  */
17191debfc3dSmrg 
17201debfc3dSmrg 	  dist = volume;
17211debfc3dSmrg 	  for (j = 0; j < DDR_NUM_DIST_VECTS (dep); j++)
17221debfc3dSmrg 	    {
17231debfc3dSmrg 	      adist = volume_of_dist_vector (DDR_DIST_VECT (dep, j),
17241debfc3dSmrg 					     loop_data_size, n);
17251debfc3dSmrg 
17261debfc3dSmrg 	      /* If this is a dependence in the innermost loop (i.e., the
17271debfc3dSmrg 		 distances in all superloops are zero) and it is not
17281debfc3dSmrg 		 the trivial self-dependence with distance zero, record that
17291debfc3dSmrg 		 the references are not completely independent.  */
17301debfc3dSmrg 	      if (lambda_vector_zerop (DDR_DIST_VECT (dep, j), n - 1)
17311debfc3dSmrg 		  && (ref != refb
17321debfc3dSmrg 		      || DDR_DIST_VECT (dep, j)[n-1] != 0))
17331debfc3dSmrg 		{
17341debfc3dSmrg 		  ref->independent_p = false;
17351debfc3dSmrg 		  refb->independent_p = false;
17361debfc3dSmrg 		}
17371debfc3dSmrg 
17381debfc3dSmrg 	      /* Ignore accesses closer than
17391debfc3dSmrg 		 L1_CACHE_SIZE_BYTES / NONTEMPORAL_FRACTION,
17401debfc3dSmrg 	      	 so that we use nontemporal prefetches e.g. if single memory
17411debfc3dSmrg 		 location is accessed several times in a single iteration of
17421debfc3dSmrg 		 the loop.  */
17431debfc3dSmrg 	      if (adist < L1_CACHE_SIZE_BYTES / NONTEMPORAL_FRACTION)
17441debfc3dSmrg 		continue;
17451debfc3dSmrg 
17461debfc3dSmrg 	      if (adist < dist)
17471debfc3dSmrg 		dist = adist;
17481debfc3dSmrg 	    }
17491debfc3dSmrg 	}
17501debfc3dSmrg 
17511debfc3dSmrg       if (ref->reuse_distance > dist)
17521debfc3dSmrg 	ref->reuse_distance = dist;
17531debfc3dSmrg       if (refb->reuse_distance > dist)
17541debfc3dSmrg 	refb->reuse_distance = dist;
17551debfc3dSmrg     }
17561debfc3dSmrg 
17571debfc3dSmrg   free_dependence_relations (dependences);
17581debfc3dSmrg   free_data_refs (datarefs);
17591debfc3dSmrg   free (loop_data_size);
17601debfc3dSmrg 
17611debfc3dSmrg   if (dump_file && (dump_flags & TDF_DETAILS))
17621debfc3dSmrg     {
17631debfc3dSmrg       fprintf (dump_file, "Reuse distances:\n");
17641debfc3dSmrg       for (gr = refs; gr; gr = gr->next)
17651debfc3dSmrg 	for (ref = gr->refs; ref; ref = ref->next)
1766a2dc1f3fSmrg 	  fprintf (dump_file, " reference %u:%u distance %u\n",
1767a2dc1f3fSmrg 		   ref->group->uid, ref->uid, ref->reuse_distance);
17681debfc3dSmrg     }
17691debfc3dSmrg 
17701debfc3dSmrg   return true;
17711debfc3dSmrg }
17721debfc3dSmrg 
17731debfc3dSmrg /* Determine whether or not the trip count to ahead ratio is too small based
17741debfc3dSmrg    on profitability considerations.
17751debfc3dSmrg    AHEAD: the iteration ahead distance,
17761debfc3dSmrg    EST_NITER: the estimated trip count.  */
17771debfc3dSmrg 
17781debfc3dSmrg static bool
17791debfc3dSmrg trip_count_to_ahead_ratio_too_small_p (unsigned ahead, HOST_WIDE_INT est_niter)
17801debfc3dSmrg {
17811debfc3dSmrg   /* Assume trip count to ahead ratio is big enough if the trip count could not
17821debfc3dSmrg      be estimated at compile time.  */
17831debfc3dSmrg   if (est_niter < 0)
17841debfc3dSmrg     return false;
17851debfc3dSmrg 
17861debfc3dSmrg   if (est_niter < (HOST_WIDE_INT) (TRIP_COUNT_TO_AHEAD_RATIO * ahead))
17871debfc3dSmrg     {
17881debfc3dSmrg       if (dump_file && (dump_flags & TDF_DETAILS))
17891debfc3dSmrg 	fprintf (dump_file,
17901debfc3dSmrg 		 "Not prefetching -- loop estimated to roll only %d times\n",
17911debfc3dSmrg 		 (int) est_niter);
17921debfc3dSmrg       return true;
17931debfc3dSmrg     }
17941debfc3dSmrg 
17951debfc3dSmrg   return false;
17961debfc3dSmrg }
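/* For illustration (ratio value hypothetical): with a prefetch distance of
   ahead = 5 iterations and TRIP_COUNT_TO_AHEAD_RATIO equal to 4, an
   estimated trip count below 20 makes the pass give up, since most of the
   issued prefetches would target data past the end of the loop.  */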
17971debfc3dSmrg 
17981debfc3dSmrg /* Determine whether or not the number of memory references in the loop is
17991debfc3dSmrg    reasonable based on profitability and compilation time considerations.
18001debfc3dSmrg    NINSNS: estimated number of instructions in the loop,
18011debfc3dSmrg    MEM_REF_COUNT: total number of memory references in the loop.  */
18021debfc3dSmrg 
18031debfc3dSmrg static bool
18041debfc3dSmrg mem_ref_count_reasonable_p (unsigned ninsns, unsigned mem_ref_count)
18051debfc3dSmrg {
18061debfc3dSmrg   int insn_to_mem_ratio;
18071debfc3dSmrg 
18081debfc3dSmrg   if (mem_ref_count == 0)
18091debfc3dSmrg     return false;
18101debfc3dSmrg 
18111debfc3dSmrg   /* Miss rate computation (is_miss_rate_acceptable) and dependence analysis
18121debfc3dSmrg      (compute_all_dependences) have high costs based on quadratic complexity.
18131debfc3dSmrg      To avoid huge compilation time, we give up prefetching if mem_ref_count
18141debfc3dSmrg      is too large.  */
18151debfc3dSmrg   if (mem_ref_count > PREFETCH_MAX_MEM_REFS_PER_LOOP)
18161debfc3dSmrg     return false;
18171debfc3dSmrg 
18181debfc3dSmrg   /* Prefetching improves performance by overlapping cache missing
18191debfc3dSmrg      memory accesses with CPU operations.  If the loop does not have
18201debfc3dSmrg      enough CPU operations to overlap with memory operations, prefetching
18211debfc3dSmrg      won't give a significant benefit.  One approximate way of checking
18221debfc3dSmrg      this is to require the ratio of instructions to memory references to
18231debfc3dSmrg      be above a certain limit.  This approximation works well in practice.
18241debfc3dSmrg      TODO: Implement a more precise computation by estimating the time
18251debfc3dSmrg      for each CPU or memory op in the loop. Time estimates for memory ops
18261debfc3dSmrg      should account for cache misses.  */
18271debfc3dSmrg   insn_to_mem_ratio = ninsns / mem_ref_count;
18281debfc3dSmrg 
1829*8feb0f0bSmrg   if (insn_to_mem_ratio < param_prefetch_min_insn_to_mem_ratio)
18301debfc3dSmrg     {
18311debfc3dSmrg       if (dump_file && (dump_flags & TDF_DETAILS))
18321debfc3dSmrg         fprintf (dump_file,
18331debfc3dSmrg 		 "Not prefetching -- instruction to memory reference ratio (%d) too small\n",
18341debfc3dSmrg 		 insn_to_mem_ratio);
18351debfc3dSmrg       return false;
18361debfc3dSmrg     }
18371debfc3dSmrg 
18381debfc3dSmrg   return true;
18391debfc3dSmrg }
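/* For illustration with hypothetical numbers: a loop with ninsns = 30 and
   mem_ref_count = 10 has an instruction-to-memory ratio of 3; whether that
   is accepted depends on the target's value of
   param_prefetch_min_insn_to_mem_ratio.  */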
18401debfc3dSmrg 
18411debfc3dSmrg /* Determine whether or not the instruction to prefetch ratio in the loop is
18421debfc3dSmrg    too small based on profitability considerations.
18431debfc3dSmrg    NINSNS: estimated number of instructions in the loop,
18441debfc3dSmrg    PREFETCH_COUNT: an estimate of the number of prefetches,
18451debfc3dSmrg    UNROLL_FACTOR:  the factor to unroll the loop if prefetching.  */
18461debfc3dSmrg 
18471debfc3dSmrg static bool
18481debfc3dSmrg insn_to_prefetch_ratio_too_small_p (unsigned ninsns, unsigned prefetch_count,
18491debfc3dSmrg                                      unsigned unroll_factor)
18501debfc3dSmrg {
18511debfc3dSmrg   int insn_to_prefetch_ratio;
18521debfc3dSmrg 
18531debfc3dSmrg   /* Prefetching most likely causes performance degradation when the instruction
18541debfc3dSmrg      to prefetch ratio is too small.  Too many prefetch instructions in a loop
18551debfc3dSmrg      may reduce the I-cache performance.
18561debfc3dSmrg      (unroll_factor * ninsns) is used to estimate the number of instructions in
18571debfc3dSmrg      the unrolled loop.  This implementation is a bit simplistic -- the number
18581debfc3dSmrg      of issued prefetch instructions is also affected by unrolling.  So,
18591debfc3dSmrg      prefetch_mod and the unroll factor should be taken into account when
18601debfc3dSmrg      determining prefetch_count.  Also, the number of insns of the unrolled
18611debfc3dSmrg      loop will usually be significantly smaller than the number of insns of the
18621debfc3dSmrg      original loop * unroll_factor (at least the induction variable increases
18631debfc3dSmrg      and the exit branches will get eliminated), so it might be better to use
18641debfc3dSmrg      tree_estimate_loop_size + estimated_unrolled_size.  */
18651debfc3dSmrg   insn_to_prefetch_ratio = (unroll_factor * ninsns) / prefetch_count;
1866*8feb0f0bSmrg   if (insn_to_prefetch_ratio < param_min_insn_to_prefetch_ratio)
18671debfc3dSmrg     {
18681debfc3dSmrg       if (dump_file && (dump_flags & TDF_DETAILS))
18691debfc3dSmrg         fprintf (dump_file,
18701debfc3dSmrg 		 "Not prefetching -- instruction to prefetch ratio (%d) too small\n",
18711debfc3dSmrg 		 insn_to_prefetch_ratio);
18721debfc3dSmrg       return true;
18731debfc3dSmrg     }
18741debfc3dSmrg 
18751debfc3dSmrg   return false;
18761debfc3dSmrg }
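/* For illustration with hypothetical numbers: for ninsns = 20,
   unroll_factor = 4 and prefetch_count = 8, the estimated ratio is
   (4 * 20) / 8 = 10 instructions per prefetch, which is then compared
   against param_min_insn_to_prefetch_ratio.  */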
18771debfc3dSmrg 
18781debfc3dSmrg 
18791debfc3dSmrg /* Issue prefetch instructions for array references in LOOP.  Returns
18801debfc3dSmrg    true if the LOOP was unrolled.  */
18811debfc3dSmrg 
18821debfc3dSmrg static bool
1883*8feb0f0bSmrg loop_prefetch_arrays (class loop *loop)
18841debfc3dSmrg {
18851debfc3dSmrg   struct mem_ref_group *refs;
18861debfc3dSmrg   unsigned ahead, ninsns, time, unroll_factor;
18871debfc3dSmrg   HOST_WIDE_INT est_niter;
1888*8feb0f0bSmrg   class tree_niter_desc desc;
18891debfc3dSmrg   bool unrolled = false, no_other_refs;
18901debfc3dSmrg   unsigned prefetch_count;
18911debfc3dSmrg   unsigned mem_ref_count;
18921debfc3dSmrg 
18931debfc3dSmrg   if (optimize_loop_nest_for_size_p (loop))
18941debfc3dSmrg     {
18951debfc3dSmrg       if (dump_file && (dump_flags & TDF_DETAILS))
18961debfc3dSmrg 	fprintf (dump_file, "  ignored (cold area)\n");
18971debfc3dSmrg       return false;
18981debfc3dSmrg     }
18991debfc3dSmrg 
19001debfc3dSmrg   /* FIXME: the time should be weighted by the probabilities of the blocks in
19011debfc3dSmrg      the loop body.  */
19021debfc3dSmrg   time = tree_num_loop_insns (loop, &eni_time_weights);
19031debfc3dSmrg   if (time == 0)
19041debfc3dSmrg     return false;
19051debfc3dSmrg 
1906*8feb0f0bSmrg   ahead = (param_prefetch_latency + time - 1) / time;
19071debfc3dSmrg   est_niter = estimated_stmt_executions_int (loop);
19081debfc3dSmrg   if (est_niter == -1)
19091debfc3dSmrg     est_niter = likely_max_stmt_executions_int (loop);
19101debfc3dSmrg 
19111debfc3dSmrg   /* Prefetching is not likely to be profitable if the trip count to ahead
19121debfc3dSmrg      ratio is too small.  */
19131debfc3dSmrg   if (trip_count_to_ahead_ratio_too_small_p (ahead, est_niter))
19141debfc3dSmrg     return false;
19151debfc3dSmrg 
19161debfc3dSmrg   ninsns = tree_num_loop_insns (loop, &eni_size_weights);
19171debfc3dSmrg 
19181debfc3dSmrg   /* Step 1: gather the memory references.  */
19191debfc3dSmrg   refs = gather_memory_references (loop, &no_other_refs, &mem_ref_count);
19201debfc3dSmrg 
19211debfc3dSmrg   /* Give up prefetching if the number of memory references in the
19221debfc3dSmrg      loop is not reasonable based on profitability and compilation time
19231debfc3dSmrg      considerations.  */
19241debfc3dSmrg   if (!mem_ref_count_reasonable_p (ninsns, mem_ref_count))
19251debfc3dSmrg     goto fail;
19261debfc3dSmrg 
19271debfc3dSmrg   /* Step 2: estimate the reuse effects.  */
19281debfc3dSmrg   prune_by_reuse (refs);
19291debfc3dSmrg 
19301debfc3dSmrg   if (nothing_to_prefetch_p (refs))
19311debfc3dSmrg     goto fail;
19321debfc3dSmrg 
19331debfc3dSmrg   if (!determine_loop_nest_reuse (loop, refs, no_other_refs))
19341debfc3dSmrg     goto fail;
19351debfc3dSmrg 
19361debfc3dSmrg   /* Step 3: determine unroll factor.  */
19371debfc3dSmrg   unroll_factor = determine_unroll_factor (loop, refs, ninsns, &desc,
19381debfc3dSmrg 					   est_niter);
19391debfc3dSmrg 
19401debfc3dSmrg   /* Estimate prefetch count for the unrolled loop.  */
19411debfc3dSmrg   prefetch_count = estimate_prefetch_count (refs, unroll_factor);
19421debfc3dSmrg   if (prefetch_count == 0)
19431debfc3dSmrg     goto fail;
19441debfc3dSmrg 
19451debfc3dSmrg   if (dump_file && (dump_flags & TDF_DETAILS))
19461debfc3dSmrg     fprintf (dump_file, "Ahead %d, unroll factor %d, trip count "
19471debfc3dSmrg 	     HOST_WIDE_INT_PRINT_DEC "\n"
19481debfc3dSmrg 	     "insn count %d, mem ref count %d, prefetch count %d\n",
19491debfc3dSmrg 	     ahead, unroll_factor, est_niter,
19501debfc3dSmrg 	     ninsns, mem_ref_count, prefetch_count);
19511debfc3dSmrg 
19521debfc3dSmrg   /* Prefetching is not likely to be profitable if the instruction to prefetch
19531debfc3dSmrg      ratio is too small.  */
19541debfc3dSmrg   if (insn_to_prefetch_ratio_too_small_p (ninsns, prefetch_count,
19551debfc3dSmrg 					  unroll_factor))
19561debfc3dSmrg     goto fail;
19571debfc3dSmrg 
19581debfc3dSmrg   mark_nontemporal_stores (loop, refs);
19591debfc3dSmrg 
19601debfc3dSmrg   /* Step 4: what to prefetch?  */
19611debfc3dSmrg   if (!schedule_prefetches (refs, unroll_factor, ahead))
19621debfc3dSmrg     goto fail;
19631debfc3dSmrg 
19641debfc3dSmrg   /* Step 5: unroll the loop.  TODO -- peeling of first and last few
19651debfc3dSmrg      iterations so that we do not issue superfluous prefetches.  */
19661debfc3dSmrg   if (unroll_factor != 1)
19671debfc3dSmrg     {
19681debfc3dSmrg       tree_unroll_loop (loop, unroll_factor,
19691debfc3dSmrg 			single_dom_exit (loop), &desc);
19701debfc3dSmrg       unrolled = true;
19711debfc3dSmrg     }
19721debfc3dSmrg 
19731debfc3dSmrg   /* Step 6: issue the prefetches.  */
19741debfc3dSmrg   issue_prefetches (refs, unroll_factor, ahead);
19751debfc3dSmrg 
19761debfc3dSmrg fail:
19771debfc3dSmrg   release_mem_refs (refs);
19781debfc3dSmrg   return unrolled;
19791debfc3dSmrg }
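/* For illustration of the AHEAD computation above (numbers hypothetical):
   if param_prefetch_latency is 200 and one loop iteration is estimated at
   time = 30, then ahead = (200 + 30 - 1) / 30 = 7, i.e. data is prefetched
   about seven iterations before its use.  */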
19801debfc3dSmrg 
19811debfc3dSmrg /* Issue prefetch instructions for array references in loops.  */
19821debfc3dSmrg 
19831debfc3dSmrg unsigned int
19841debfc3dSmrg tree_ssa_prefetch_arrays (void)
19851debfc3dSmrg {
1986*8feb0f0bSmrg   class loop *loop;
19871debfc3dSmrg   bool unrolled = false;
19881debfc3dSmrg   int todo_flags = 0;
19891debfc3dSmrg 
19901debfc3dSmrg   if (!targetm.have_prefetch ()
19911debfc3dSmrg       /* It is possible to ask the compiler for, say, -mtune=i486 -march=pentium4.
19921debfc3dSmrg 	 -mtune=i486 causes us to have PREFETCH_BLOCK 0, since this is part
19931debfc3dSmrg 	 of processor costs and i486 does not have prefetch, but
19941debfc3dSmrg 	 -march=pentium4 causes targetm.have_prefetch to be true.  Ugh.  */
19951debfc3dSmrg       || PREFETCH_BLOCK == 0)
19961debfc3dSmrg     return 0;
19971debfc3dSmrg 
19981debfc3dSmrg   if (dump_file && (dump_flags & TDF_DETAILS))
19991debfc3dSmrg     {
20001debfc3dSmrg       fprintf (dump_file, "Prefetching parameters:\n");
20011debfc3dSmrg       fprintf (dump_file, "    simultaneous prefetches: %d\n",
2002*8feb0f0bSmrg 	       param_simultaneous_prefetches);
2003*8feb0f0bSmrg       fprintf (dump_file, "    prefetch latency: %d\n", param_prefetch_latency);
20041debfc3dSmrg       fprintf (dump_file, "    prefetch block size: %d\n", PREFETCH_BLOCK);
20051debfc3dSmrg       fprintf (dump_file, "    L1 cache size: %d lines, %d kB\n",
2006*8feb0f0bSmrg 	       L1_CACHE_SIZE_BYTES / param_l1_cache_line_size,
2007*8feb0f0bSmrg 	       param_l1_cache_size);
2008*8feb0f0bSmrg       fprintf (dump_file, "    L1 cache line size: %d\n",
2009*8feb0f0bSmrg 	       param_l1_cache_line_size);
2010*8feb0f0bSmrg       fprintf (dump_file, "    L2 cache size: %d kB\n", param_l2_cache_size);
20111debfc3dSmrg       fprintf (dump_file, "    min insn-to-prefetch ratio: %d \n",
2012*8feb0f0bSmrg 	       param_min_insn_to_prefetch_ratio);
20131debfc3dSmrg       fprintf (dump_file, "    min insn-to-mem ratio: %d \n",
2014*8feb0f0bSmrg 	       param_prefetch_min_insn_to_mem_ratio);
20151debfc3dSmrg       fprintf (dump_file, "\n");
20161debfc3dSmrg     }
20171debfc3dSmrg 
20181debfc3dSmrg   initialize_original_copy_tables ();
20191debfc3dSmrg 
20201debfc3dSmrg   if (!builtin_decl_explicit_p (BUILT_IN_PREFETCH))
20211debfc3dSmrg     {
20221debfc3dSmrg       tree type = build_function_type_list (void_type_node,
20231debfc3dSmrg 					    const_ptr_type_node, NULL_TREE);
20241debfc3dSmrg       tree decl = add_builtin_function ("__builtin_prefetch", type,
20251debfc3dSmrg 					BUILT_IN_PREFETCH, BUILT_IN_NORMAL,
20261debfc3dSmrg 					NULL, NULL_TREE);
20271debfc3dSmrg       DECL_IS_NOVOPS (decl) = true;
20281debfc3dSmrg       set_builtin_decl (BUILT_IN_PREFETCH, decl, false);
20291debfc3dSmrg     }
20301debfc3dSmrg 
20311debfc3dSmrg   FOR_EACH_LOOP (loop, LI_FROM_INNERMOST)
20321debfc3dSmrg     {
20331debfc3dSmrg       if (dump_file && (dump_flags & TDF_DETAILS))
20341debfc3dSmrg 	fprintf (dump_file, "Processing loop %d:\n", loop->num);
20351debfc3dSmrg 
20361debfc3dSmrg       unrolled |= loop_prefetch_arrays (loop);
20371debfc3dSmrg 
20381debfc3dSmrg       if (dump_file && (dump_flags & TDF_DETAILS))
20391debfc3dSmrg 	fprintf (dump_file, "\n\n");
20401debfc3dSmrg     }
20411debfc3dSmrg 
20421debfc3dSmrg   if (unrolled)
20431debfc3dSmrg     {
20441debfc3dSmrg       scev_reset ();
20451debfc3dSmrg       todo_flags |= TODO_cleanup_cfg;
20461debfc3dSmrg     }
20471debfc3dSmrg 
20481debfc3dSmrg   free_original_copy_tables ();
20491debfc3dSmrg   return todo_flags;
20501debfc3dSmrg }
20511debfc3dSmrg 
20521debfc3dSmrg /* Prefetching.  */
20531debfc3dSmrg 
20541debfc3dSmrg namespace {
20551debfc3dSmrg 
20561debfc3dSmrg const pass_data pass_data_loop_prefetch =
20571debfc3dSmrg {
20581debfc3dSmrg   GIMPLE_PASS, /* type */
20591debfc3dSmrg   "aprefetch", /* name */
20601debfc3dSmrg   OPTGROUP_LOOP, /* optinfo_flags */
20611debfc3dSmrg   TV_TREE_PREFETCH, /* tv_id */
20621debfc3dSmrg   ( PROP_cfg | PROP_ssa ), /* properties_required */
20631debfc3dSmrg   0, /* properties_provided */
20641debfc3dSmrg   0, /* properties_destroyed */
20651debfc3dSmrg   0, /* todo_flags_start */
20661debfc3dSmrg   0, /* todo_flags_finish */
20671debfc3dSmrg };
20681debfc3dSmrg 
20691debfc3dSmrg class pass_loop_prefetch : public gimple_opt_pass
20701debfc3dSmrg {
20711debfc3dSmrg public:
20721debfc3dSmrg   pass_loop_prefetch (gcc::context *ctxt)
20731debfc3dSmrg     : gimple_opt_pass (pass_data_loop_prefetch, ctxt)
20741debfc3dSmrg   {}
20751debfc3dSmrg 
20761debfc3dSmrg   /* opt_pass methods: */
20771debfc3dSmrg   virtual bool gate (function *) { return flag_prefetch_loop_arrays > 0; }
20781debfc3dSmrg   virtual unsigned int execute (function *);
20791debfc3dSmrg 
20801debfc3dSmrg }; // class pass_loop_prefetch
20811debfc3dSmrg 
20821debfc3dSmrg unsigned int
20831debfc3dSmrg pass_loop_prefetch::execute (function *fun)
20841debfc3dSmrg {
20851debfc3dSmrg   if (number_of_loops (fun) <= 1)
20861debfc3dSmrg     return 0;
20871debfc3dSmrg 
20881debfc3dSmrg   if ((PREFETCH_BLOCK & (PREFETCH_BLOCK - 1)) != 0)
20891debfc3dSmrg     {
20901debfc3dSmrg       static bool warned = false;
20911debfc3dSmrg 
20921debfc3dSmrg       if (!warned)
20931debfc3dSmrg 	{
20941debfc3dSmrg 	  warning (OPT_Wdisabled_optimization,
20951debfc3dSmrg 		   "%<l1-cache-size%> parameter is not a power of two %d",
20961debfc3dSmrg 		   PREFETCH_BLOCK);
20971debfc3dSmrg 	  warned = true;
20981debfc3dSmrg 	}
20991debfc3dSmrg       return 0;
21001debfc3dSmrg     }
21011debfc3dSmrg 
21021debfc3dSmrg   return tree_ssa_prefetch_arrays ();
21031debfc3dSmrg }
21041debfc3dSmrg 
21051debfc3dSmrg } // anon namespace
21061debfc3dSmrg 
21071debfc3dSmrg gimple_opt_pass *
21081debfc3dSmrg make_pass_loop_prefetch (gcc::context *ctxt)
21091debfc3dSmrg {
21101debfc3dSmrg   return new pass_loop_prefetch (ctxt);
21111debfc3dSmrg }
21121debfc3dSmrg 
21131debfc3dSmrg 