11debfc3dSmrg /* Array prefetching.
2*8feb0f0bSmrg Copyright (C) 2005-2020 Free Software Foundation, Inc.
31debfc3dSmrg
41debfc3dSmrg This file is part of GCC.
51debfc3dSmrg
61debfc3dSmrg GCC is free software; you can redistribute it and/or modify it
71debfc3dSmrg under the terms of the GNU General Public License as published by the
81debfc3dSmrg Free Software Foundation; either version 3, or (at your option) any
91debfc3dSmrg later version.
101debfc3dSmrg
111debfc3dSmrg GCC is distributed in the hope that it will be useful, but WITHOUT
121debfc3dSmrg ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
131debfc3dSmrg FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
141debfc3dSmrg for more details.
151debfc3dSmrg
161debfc3dSmrg You should have received a copy of the GNU General Public License
171debfc3dSmrg along with GCC; see the file COPYING3. If not see
181debfc3dSmrg <http://www.gnu.org/licenses/>. */
191debfc3dSmrg
201debfc3dSmrg #include "config.h"
211debfc3dSmrg #include "system.h"
221debfc3dSmrg #include "coretypes.h"
231debfc3dSmrg #include "backend.h"
241debfc3dSmrg #include "target.h"
251debfc3dSmrg #include "rtl.h"
261debfc3dSmrg #include "tree.h"
271debfc3dSmrg #include "gimple.h"
281debfc3dSmrg #include "predict.h"
291debfc3dSmrg #include "tree-pass.h"
301debfc3dSmrg #include "gimple-ssa.h"
311debfc3dSmrg #include "optabs-query.h"
321debfc3dSmrg #include "tree-pretty-print.h"
331debfc3dSmrg #include "fold-const.h"
341debfc3dSmrg #include "stor-layout.h"
351debfc3dSmrg #include "gimplify.h"
361debfc3dSmrg #include "gimple-iterator.h"
371debfc3dSmrg #include "gimplify-me.h"
381debfc3dSmrg #include "tree-ssa-loop-ivopts.h"
391debfc3dSmrg #include "tree-ssa-loop-manip.h"
401debfc3dSmrg #include "tree-ssa-loop-niter.h"
411debfc3dSmrg #include "tree-ssa-loop.h"
421debfc3dSmrg #include "ssa.h"
431debfc3dSmrg #include "tree-into-ssa.h"
441debfc3dSmrg #include "cfgloop.h"
451debfc3dSmrg #include "tree-scalar-evolution.h"
461debfc3dSmrg #include "langhooks.h"
471debfc3dSmrg #include "tree-inline.h"
481debfc3dSmrg #include "tree-data-ref.h"
491debfc3dSmrg #include "diagnostic-core.h"
50a2dc1f3fSmrg #include "dbgcnt.h"
511debfc3dSmrg
521debfc3dSmrg /* This pass inserts prefetch instructions to optimize cache usage during
531debfc3dSmrg accesses to arrays in loops. It processes loops sequentially and:
541debfc3dSmrg
551debfc3dSmrg 1) Gathers all memory references in the single loop.
561debfc3dSmrg 2) For each of the references it decides when it is profitable to prefetch
571debfc3dSmrg it. To do it, we evaluate the reuse among the accesses, and determines
581debfc3dSmrg two values: PREFETCH_BEFORE (meaning that it only makes sense to do
591debfc3dSmrg prefetching in the first PREFETCH_BEFORE iterations of the loop) and
601debfc3dSmrg PREFETCH_MOD (meaning that it only makes sense to prefetch in the
611debfc3dSmrg iterations of the loop that are zero modulo PREFETCH_MOD). For example
621debfc3dSmrg (assuming cache line size is 64 bytes, char has size 1 byte and there
631debfc3dSmrg is no hardware sequential prefetch):
641debfc3dSmrg
651debfc3dSmrg char *a;
661debfc3dSmrg for (i = 0; i < max; i++)
671debfc3dSmrg {
681debfc3dSmrg a[255] = ...; (0)
691debfc3dSmrg a[i] = ...; (1)
701debfc3dSmrg a[i + 64] = ...; (2)
711debfc3dSmrg a[16*i] = ...; (3)
721debfc3dSmrg a[187*i] = ...; (4)
731debfc3dSmrg a[187*i + 50] = ...; (5)
741debfc3dSmrg }
751debfc3dSmrg
761debfc3dSmrg (0) obviously has PREFETCH_BEFORE 1
771debfc3dSmrg (1) has PREFETCH_BEFORE 64, since (2) accesses the same memory
781debfc3dSmrg location 64 iterations before it, and PREFETCH_MOD 64 (since
791debfc3dSmrg it hits the same cache line otherwise).
801debfc3dSmrg (2) has PREFETCH_MOD 64
811debfc3dSmrg (3) has PREFETCH_MOD 4
821debfc3dSmrg (4) has PREFETCH_MOD 1. We do not set PREFETCH_BEFORE here, since
831debfc3dSmrg the cache line accessed by (5) is the same with probability only
841debfc3dSmrg 7/32.
851debfc3dSmrg (5) has PREFETCH_MOD 1 as well.
861debfc3dSmrg
871debfc3dSmrg Additionally, we use data dependence analysis to determine for each
881debfc3dSmrg reference the distance till the first reuse; this information is used
891debfc3dSmrg to determine the temporality of the issued prefetch instruction.
901debfc3dSmrg
911debfc3dSmrg 3) We determine how much ahead we need to prefetch. The number of
921debfc3dSmrg iterations needed is time to fetch / time spent in one iteration of
931debfc3dSmrg the loop. The problem is that we do not know either of these values,
941debfc3dSmrg so we just make a heuristic guess based on a magic (possibly)
951debfc3dSmrg target-specific constant and size of the loop.
961debfc3dSmrg
971debfc3dSmrg 4) Determine which of the references we prefetch. We take into account
981debfc3dSmrg that there is a maximum number of simultaneous prefetches (provided
991debfc3dSmrg by machine description). We prefetch as many prefetches as possible
1001debfc3dSmrg while still within this bound (starting with those with lowest
1011debfc3dSmrg prefetch_mod, since they are responsible for most of the cache
1021debfc3dSmrg misses).
1031debfc3dSmrg
1041debfc3dSmrg 5) We unroll and peel loops so that we are able to satisfy PREFETCH_MOD
1051debfc3dSmrg and PREFETCH_BEFORE requirements (within some bounds), and to avoid
1061debfc3dSmrg prefetching nonaccessed memory.
1071debfc3dSmrg TODO -- actually implement peeling.
1081debfc3dSmrg
1091debfc3dSmrg 6) We actually emit the prefetch instructions. ??? Perhaps emit the
1101debfc3dSmrg prefetch instructions with guards in cases where 5) was not sufficient
1111debfc3dSmrg to satisfy the constraints?
1121debfc3dSmrg
1131debfc3dSmrg A cost model is implemented to determine whether or not prefetching is
1141debfc3dSmrg profitable for a given loop. The cost model has three heuristics:
1151debfc3dSmrg
1161debfc3dSmrg 1. Function trip_count_to_ahead_ratio_too_small_p implements a
1171debfc3dSmrg heuristic that determines whether or not the loop has too few
1181debfc3dSmrg iterations (compared to ahead). Prefetching is not likely to be
1191debfc3dSmrg beneficial if the trip count to ahead ratio is below a certain
1201debfc3dSmrg minimum.
1211debfc3dSmrg
1221debfc3dSmrg 2. Function mem_ref_count_reasonable_p implements a heuristic that
1231debfc3dSmrg determines whether the given loop has enough CPU ops that can be
1241debfc3dSmrg overlapped with cache missing memory ops. If not, the loop
1251debfc3dSmrg won't benefit from prefetching. In the implementation,
1261debfc3dSmrg prefetching is not considered beneficial if the ratio between
1271debfc3dSmrg the instruction count and the mem ref count is below a certain
1281debfc3dSmrg minimum.
1291debfc3dSmrg
1301debfc3dSmrg 3. Function insn_to_prefetch_ratio_too_small_p implements a
1311debfc3dSmrg heuristic that disables prefetching in a loop if the prefetching
1321debfc3dSmrg cost is above a certain limit. The relative prefetching cost is
1331debfc3dSmrg estimated by taking the ratio between the prefetch count and the
1341debfc3dSmrg total intruction count (this models the I-cache cost).
1351debfc3dSmrg
1361debfc3dSmrg The limits used in these heuristics are defined as parameters with
1371debfc3dSmrg reasonable default values. Machine-specific default values will be
1381debfc3dSmrg added later.
1391debfc3dSmrg
1401debfc3dSmrg Some other TODO:
1411debfc3dSmrg -- write and use more general reuse analysis (that could be also used
1421debfc3dSmrg in other cache aimed loop optimizations)
1431debfc3dSmrg -- make it behave sanely together with the prefetches given by user
1441debfc3dSmrg (now we just ignore them; at the very least we should avoid
1451debfc3dSmrg optimizing loops in that user put his own prefetches)
1461debfc3dSmrg -- we assume cache line size alignment of arrays; this could be
1471debfc3dSmrg improved. */
1481debfc3dSmrg
1491debfc3dSmrg /* Magic constants follow. These should be replaced by machine specific
1501debfc3dSmrg numbers. */
1511debfc3dSmrg
1521debfc3dSmrg /* True if write can be prefetched by a read prefetch. */
1531debfc3dSmrg
1541debfc3dSmrg #ifndef WRITE_CAN_USE_READ_PREFETCH
1551debfc3dSmrg #define WRITE_CAN_USE_READ_PREFETCH 1
1561debfc3dSmrg #endif
1571debfc3dSmrg
1581debfc3dSmrg /* True if read can be prefetched by a write prefetch. */
1591debfc3dSmrg
1601debfc3dSmrg #ifndef READ_CAN_USE_WRITE_PREFETCH
1611debfc3dSmrg #define READ_CAN_USE_WRITE_PREFETCH 0
1621debfc3dSmrg #endif
1631debfc3dSmrg
1641debfc3dSmrg /* The size of the block loaded by a single prefetch. Usually, this is
1651debfc3dSmrg the same as cache line size (at the moment, we only consider one level
1661debfc3dSmrg of cache hierarchy). */
1671debfc3dSmrg
1681debfc3dSmrg #ifndef PREFETCH_BLOCK
169*8feb0f0bSmrg #define PREFETCH_BLOCK param_l1_cache_line_size
1701debfc3dSmrg #endif
1711debfc3dSmrg
1721debfc3dSmrg /* Do we have a forward hardware sequential prefetching? */
1731debfc3dSmrg
1741debfc3dSmrg #ifndef HAVE_FORWARD_PREFETCH
1751debfc3dSmrg #define HAVE_FORWARD_PREFETCH 0
1761debfc3dSmrg #endif
1771debfc3dSmrg
1781debfc3dSmrg /* Do we have a backward hardware sequential prefetching? */
1791debfc3dSmrg
1801debfc3dSmrg #ifndef HAVE_BACKWARD_PREFETCH
1811debfc3dSmrg #define HAVE_BACKWARD_PREFETCH 0
1821debfc3dSmrg #endif
1831debfc3dSmrg
1841debfc3dSmrg /* In some cases we are only able to determine that there is a certain
1851debfc3dSmrg probability that the two accesses hit the same cache line. In this
1861debfc3dSmrg case, we issue the prefetches for both of them if this probability
1871debfc3dSmrg is less then (1000 - ACCEPTABLE_MISS_RATE) per thousand. */
1881debfc3dSmrg
1891debfc3dSmrg #ifndef ACCEPTABLE_MISS_RATE
1901debfc3dSmrg #define ACCEPTABLE_MISS_RATE 50
1911debfc3dSmrg #endif
1921debfc3dSmrg
193*8feb0f0bSmrg #define L1_CACHE_SIZE_BYTES ((unsigned) (param_l1_cache_size * 1024))
194*8feb0f0bSmrg #define L2_CACHE_SIZE_BYTES ((unsigned) (param_l2_cache_size * 1024))
1951debfc3dSmrg
1961debfc3dSmrg /* We consider a memory access nontemporal if it is not reused sooner than
1971debfc3dSmrg after L2_CACHE_SIZE_BYTES of memory are accessed. However, we ignore
1981debfc3dSmrg accesses closer than L1_CACHE_SIZE_BYTES / NONTEMPORAL_FRACTION,
1991debfc3dSmrg so that we use nontemporal prefetches e.g. if single memory location
2001debfc3dSmrg is accessed several times in a single iteration of the loop. */
2011debfc3dSmrg #define NONTEMPORAL_FRACTION 16
2021debfc3dSmrg
2031debfc3dSmrg /* In case we have to emit a memory fence instruction after the loop that
2041debfc3dSmrg uses nontemporal stores, this defines the builtin to use. */
2051debfc3dSmrg
2061debfc3dSmrg #ifndef FENCE_FOLLOWING_MOVNT
2071debfc3dSmrg #define FENCE_FOLLOWING_MOVNT NULL_TREE
2081debfc3dSmrg #endif
2091debfc3dSmrg
2101debfc3dSmrg /* It is not profitable to prefetch when the trip count is not at
2111debfc3dSmrg least TRIP_COUNT_TO_AHEAD_RATIO times the prefetch ahead distance.
2121debfc3dSmrg For example, in a loop with a prefetch ahead distance of 10,
2131debfc3dSmrg supposing that TRIP_COUNT_TO_AHEAD_RATIO is equal to 4, it is
2141debfc3dSmrg profitable to prefetch when the trip count is greater or equal to
2151debfc3dSmrg 40. In that case, 30 out of the 40 iterations will benefit from
2161debfc3dSmrg prefetching. */
2171debfc3dSmrg
2181debfc3dSmrg #ifndef TRIP_COUNT_TO_AHEAD_RATIO
2191debfc3dSmrg #define TRIP_COUNT_TO_AHEAD_RATIO 4
2201debfc3dSmrg #endif
2211debfc3dSmrg
2221debfc3dSmrg /* The group of references between that reuse may occur. */
2231debfc3dSmrg
2241debfc3dSmrg struct mem_ref_group
2251debfc3dSmrg {
2261debfc3dSmrg tree base; /* Base of the reference. */
2271debfc3dSmrg tree step; /* Step of the reference. */
2281debfc3dSmrg struct mem_ref *refs; /* References in the group. */
2291debfc3dSmrg struct mem_ref_group *next; /* Next group of references. */
230a2dc1f3fSmrg unsigned int uid; /* Group UID, used only for debugging. */
2311debfc3dSmrg };
2321debfc3dSmrg
2331debfc3dSmrg /* Assigned to PREFETCH_BEFORE when all iterations are to be prefetched. */
2341debfc3dSmrg
2351debfc3dSmrg #define PREFETCH_ALL HOST_WIDE_INT_M1U
2361debfc3dSmrg
2371debfc3dSmrg /* Do not generate a prefetch if the unroll factor is significantly less
2381debfc3dSmrg than what is required by the prefetch. This is to avoid redundant
2391debfc3dSmrg prefetches. For example, when prefetch_mod is 16 and unroll_factor is
2401debfc3dSmrg 2, prefetching requires unrolling the loop 16 times, but
2411debfc3dSmrg the loop is actually unrolled twice. In this case (ratio = 8),
2421debfc3dSmrg prefetching is not likely to be beneficial. */
2431debfc3dSmrg
2441debfc3dSmrg #ifndef PREFETCH_MOD_TO_UNROLL_FACTOR_RATIO
2451debfc3dSmrg #define PREFETCH_MOD_TO_UNROLL_FACTOR_RATIO 4
2461debfc3dSmrg #endif
2471debfc3dSmrg
2481debfc3dSmrg /* Some of the prefetch computations have quadratic complexity. We want to
2491debfc3dSmrg avoid huge compile times and, therefore, want to limit the amount of
2501debfc3dSmrg memory references per loop where we consider prefetching. */
2511debfc3dSmrg
2521debfc3dSmrg #ifndef PREFETCH_MAX_MEM_REFS_PER_LOOP
2531debfc3dSmrg #define PREFETCH_MAX_MEM_REFS_PER_LOOP 200
2541debfc3dSmrg #endif
2551debfc3dSmrg
2561debfc3dSmrg /* The memory reference. */
2571debfc3dSmrg
2581debfc3dSmrg struct mem_ref
2591debfc3dSmrg {
2601debfc3dSmrg gimple *stmt; /* Statement in that the reference appears. */
2611debfc3dSmrg tree mem; /* The reference. */
2621debfc3dSmrg HOST_WIDE_INT delta; /* Constant offset of the reference. */
2631debfc3dSmrg struct mem_ref_group *group; /* The group of references it belongs to. */
2641debfc3dSmrg unsigned HOST_WIDE_INT prefetch_mod;
2651debfc3dSmrg /* Prefetch only each PREFETCH_MOD-th
2661debfc3dSmrg iteration. */
2671debfc3dSmrg unsigned HOST_WIDE_INT prefetch_before;
2681debfc3dSmrg /* Prefetch only first PREFETCH_BEFORE
2691debfc3dSmrg iterations. */
2701debfc3dSmrg unsigned reuse_distance; /* The amount of data accessed before the first
2711debfc3dSmrg reuse of this value. */
2721debfc3dSmrg struct mem_ref *next; /* The next reference in the group. */
273a2dc1f3fSmrg unsigned int uid; /* Ref UID, used only for debugging. */
2741debfc3dSmrg unsigned write_p : 1; /* Is it a write? */
2751debfc3dSmrg unsigned independent_p : 1; /* True if the reference is independent on
2761debfc3dSmrg all other references inside the loop. */
2771debfc3dSmrg unsigned issue_prefetch_p : 1; /* Should we really issue the prefetch? */
2781debfc3dSmrg unsigned storent_p : 1; /* True if we changed the store to a
2791debfc3dSmrg nontemporal one. */
2801debfc3dSmrg };
2811debfc3dSmrg
2821debfc3dSmrg /* Dumps information about memory reference */
2831debfc3dSmrg static void
dump_mem_details(FILE * file,tree base,tree step,HOST_WIDE_INT delta,bool write_p)2841debfc3dSmrg dump_mem_details (FILE *file, tree base, tree step,
2851debfc3dSmrg HOST_WIDE_INT delta, bool write_p)
2861debfc3dSmrg {
2871debfc3dSmrg fprintf (file, "(base ");
2881debfc3dSmrg print_generic_expr (file, base, TDF_SLIM);
2891debfc3dSmrg fprintf (file, ", step ");
2901debfc3dSmrg if (cst_and_fits_in_hwi (step))
2911debfc3dSmrg fprintf (file, HOST_WIDE_INT_PRINT_DEC, int_cst_value (step));
2921debfc3dSmrg else
293a2dc1f3fSmrg print_generic_expr (file, step, TDF_SLIM);
2941debfc3dSmrg fprintf (file, ")\n");
295a2dc1f3fSmrg fprintf (file, " delta " HOST_WIDE_INT_PRINT_DEC "\n", delta);
296a2dc1f3fSmrg fprintf (file, " %s\n\n", write_p ? "write" : "read");
2971debfc3dSmrg }
2981debfc3dSmrg
2991debfc3dSmrg /* Dumps information about reference REF to FILE. */
3001debfc3dSmrg
3011debfc3dSmrg static void
dump_mem_ref(FILE * file,struct mem_ref * ref)3021debfc3dSmrg dump_mem_ref (FILE *file, struct mem_ref *ref)
3031debfc3dSmrg {
304a2dc1f3fSmrg fprintf (file, "reference %u:%u (", ref->group->uid, ref->uid);
305a2dc1f3fSmrg print_generic_expr (file, ref->mem, TDF_SLIM);
306a2dc1f3fSmrg fprintf (file, ")\n");
3071debfc3dSmrg }
3081debfc3dSmrg
3091debfc3dSmrg /* Finds a group with BASE and STEP in GROUPS, or creates one if it does not
3101debfc3dSmrg exist. */
3111debfc3dSmrg
3121debfc3dSmrg static struct mem_ref_group *
find_or_create_group(struct mem_ref_group ** groups,tree base,tree step)3131debfc3dSmrg find_or_create_group (struct mem_ref_group **groups, tree base, tree step)
3141debfc3dSmrg {
315a2dc1f3fSmrg /* Global count for setting struct mem_ref_group->uid. */
316a2dc1f3fSmrg static unsigned int last_mem_ref_group_uid = 0;
317a2dc1f3fSmrg
3181debfc3dSmrg struct mem_ref_group *group;
3191debfc3dSmrg
3201debfc3dSmrg for (; *groups; groups = &(*groups)->next)
3211debfc3dSmrg {
3221debfc3dSmrg if (operand_equal_p ((*groups)->step, step, 0)
3231debfc3dSmrg && operand_equal_p ((*groups)->base, base, 0))
3241debfc3dSmrg return *groups;
3251debfc3dSmrg
3261debfc3dSmrg /* If step is an integer constant, keep the list of groups sorted
3271debfc3dSmrg by decreasing step. */
3281debfc3dSmrg if (cst_and_fits_in_hwi ((*groups)->step) && cst_and_fits_in_hwi (step)
3291debfc3dSmrg && int_cst_value ((*groups)->step) < int_cst_value (step))
3301debfc3dSmrg break;
3311debfc3dSmrg }
3321debfc3dSmrg
3331debfc3dSmrg group = XNEW (struct mem_ref_group);
3341debfc3dSmrg group->base = base;
3351debfc3dSmrg group->step = step;
3361debfc3dSmrg group->refs = NULL;
337a2dc1f3fSmrg group->uid = ++last_mem_ref_group_uid;
3381debfc3dSmrg group->next = *groups;
3391debfc3dSmrg *groups = group;
3401debfc3dSmrg
3411debfc3dSmrg return group;
3421debfc3dSmrg }
3431debfc3dSmrg
3441debfc3dSmrg /* Records a memory reference MEM in GROUP with offset DELTA and write status
3451debfc3dSmrg WRITE_P. The reference occurs in statement STMT. */
3461debfc3dSmrg
3471debfc3dSmrg static void
record_ref(struct mem_ref_group * group,gimple * stmt,tree mem,HOST_WIDE_INT delta,bool write_p)3481debfc3dSmrg record_ref (struct mem_ref_group *group, gimple *stmt, tree mem,
3491debfc3dSmrg HOST_WIDE_INT delta, bool write_p)
3501debfc3dSmrg {
351a2dc1f3fSmrg unsigned int last_mem_ref_uid = 0;
3521debfc3dSmrg struct mem_ref **aref;
3531debfc3dSmrg
3541debfc3dSmrg /* Do not record the same address twice. */
3551debfc3dSmrg for (aref = &group->refs; *aref; aref = &(*aref)->next)
3561debfc3dSmrg {
357a2dc1f3fSmrg last_mem_ref_uid = (*aref)->uid;
358a2dc1f3fSmrg
3591debfc3dSmrg /* It does not have to be possible for write reference to reuse the read
3601debfc3dSmrg prefetch, or vice versa. */
3611debfc3dSmrg if (!WRITE_CAN_USE_READ_PREFETCH
3621debfc3dSmrg && write_p
3631debfc3dSmrg && !(*aref)->write_p)
3641debfc3dSmrg continue;
3651debfc3dSmrg if (!READ_CAN_USE_WRITE_PREFETCH
3661debfc3dSmrg && !write_p
3671debfc3dSmrg && (*aref)->write_p)
3681debfc3dSmrg continue;
3691debfc3dSmrg
3701debfc3dSmrg if ((*aref)->delta == delta)
3711debfc3dSmrg return;
3721debfc3dSmrg }
3731debfc3dSmrg
3741debfc3dSmrg (*aref) = XNEW (struct mem_ref);
3751debfc3dSmrg (*aref)->stmt = stmt;
3761debfc3dSmrg (*aref)->mem = mem;
3771debfc3dSmrg (*aref)->delta = delta;
3781debfc3dSmrg (*aref)->write_p = write_p;
3791debfc3dSmrg (*aref)->prefetch_before = PREFETCH_ALL;
3801debfc3dSmrg (*aref)->prefetch_mod = 1;
3811debfc3dSmrg (*aref)->reuse_distance = 0;
3821debfc3dSmrg (*aref)->issue_prefetch_p = false;
3831debfc3dSmrg (*aref)->group = group;
3841debfc3dSmrg (*aref)->next = NULL;
3851debfc3dSmrg (*aref)->independent_p = false;
3861debfc3dSmrg (*aref)->storent_p = false;
387a2dc1f3fSmrg (*aref)->uid = last_mem_ref_uid + 1;
3881debfc3dSmrg
3891debfc3dSmrg if (dump_file && (dump_flags & TDF_DETAILS))
390a2dc1f3fSmrg {
3911debfc3dSmrg dump_mem_ref (dump_file, *aref);
392a2dc1f3fSmrg
393a2dc1f3fSmrg fprintf (dump_file, " group %u ", group->uid);
394a2dc1f3fSmrg dump_mem_details (dump_file, group->base, group->step, delta,
395a2dc1f3fSmrg write_p);
396a2dc1f3fSmrg }
3971debfc3dSmrg }
3981debfc3dSmrg
3991debfc3dSmrg /* Release memory references in GROUPS. */
4001debfc3dSmrg
4011debfc3dSmrg static void
release_mem_refs(struct mem_ref_group * groups)4021debfc3dSmrg release_mem_refs (struct mem_ref_group *groups)
4031debfc3dSmrg {
4041debfc3dSmrg struct mem_ref_group *next_g;
4051debfc3dSmrg struct mem_ref *ref, *next_r;
4061debfc3dSmrg
4071debfc3dSmrg for (; groups; groups = next_g)
4081debfc3dSmrg {
4091debfc3dSmrg next_g = groups->next;
4101debfc3dSmrg for (ref = groups->refs; ref; ref = next_r)
4111debfc3dSmrg {
4121debfc3dSmrg next_r = ref->next;
4131debfc3dSmrg free (ref);
4141debfc3dSmrg }
4151debfc3dSmrg free (groups);
4161debfc3dSmrg }
4171debfc3dSmrg }
4181debfc3dSmrg
4191debfc3dSmrg /* A structure used to pass arguments to idx_analyze_ref. */
4201debfc3dSmrg
4211debfc3dSmrg struct ar_data
4221debfc3dSmrg {
423*8feb0f0bSmrg class loop *loop; /* Loop of the reference. */
4241debfc3dSmrg gimple *stmt; /* Statement of the reference. */
4251debfc3dSmrg tree *step; /* Step of the memory reference. */
4261debfc3dSmrg HOST_WIDE_INT *delta; /* Offset of the memory reference. */
4271debfc3dSmrg };
4281debfc3dSmrg
4291debfc3dSmrg /* Analyzes a single INDEX of a memory reference to obtain information
4301debfc3dSmrg described at analyze_ref. Callback for for_each_index. */
4311debfc3dSmrg
4321debfc3dSmrg static bool
idx_analyze_ref(tree base,tree * index,void * data)4331debfc3dSmrg idx_analyze_ref (tree base, tree *index, void *data)
4341debfc3dSmrg {
4351debfc3dSmrg struct ar_data *ar_data = (struct ar_data *) data;
4361debfc3dSmrg tree ibase, step, stepsize;
4371debfc3dSmrg HOST_WIDE_INT idelta = 0, imult = 1;
4381debfc3dSmrg affine_iv iv;
4391debfc3dSmrg
4401debfc3dSmrg if (!simple_iv (ar_data->loop, loop_containing_stmt (ar_data->stmt),
4411debfc3dSmrg *index, &iv, true))
4421debfc3dSmrg return false;
4431debfc3dSmrg ibase = iv.base;
4441debfc3dSmrg step = iv.step;
4451debfc3dSmrg
4461debfc3dSmrg if (TREE_CODE (ibase) == POINTER_PLUS_EXPR
4471debfc3dSmrg && cst_and_fits_in_hwi (TREE_OPERAND (ibase, 1)))
4481debfc3dSmrg {
4491debfc3dSmrg idelta = int_cst_value (TREE_OPERAND (ibase, 1));
4501debfc3dSmrg ibase = TREE_OPERAND (ibase, 0);
4511debfc3dSmrg }
4521debfc3dSmrg if (cst_and_fits_in_hwi (ibase))
4531debfc3dSmrg {
4541debfc3dSmrg idelta += int_cst_value (ibase);
4551debfc3dSmrg ibase = build_int_cst (TREE_TYPE (ibase), 0);
4561debfc3dSmrg }
4571debfc3dSmrg
4581debfc3dSmrg if (TREE_CODE (base) == ARRAY_REF)
4591debfc3dSmrg {
4601debfc3dSmrg stepsize = array_ref_element_size (base);
4611debfc3dSmrg if (!cst_and_fits_in_hwi (stepsize))
4621debfc3dSmrg return false;
4631debfc3dSmrg imult = int_cst_value (stepsize);
4641debfc3dSmrg step = fold_build2 (MULT_EXPR, sizetype,
4651debfc3dSmrg fold_convert (sizetype, step),
4661debfc3dSmrg fold_convert (sizetype, stepsize));
4671debfc3dSmrg idelta *= imult;
4681debfc3dSmrg }
4691debfc3dSmrg
4701debfc3dSmrg if (*ar_data->step == NULL_TREE)
4711debfc3dSmrg *ar_data->step = step;
4721debfc3dSmrg else
4731debfc3dSmrg *ar_data->step = fold_build2 (PLUS_EXPR, sizetype,
4741debfc3dSmrg fold_convert (sizetype, *ar_data->step),
4751debfc3dSmrg fold_convert (sizetype, step));
4761debfc3dSmrg *ar_data->delta += idelta;
4771debfc3dSmrg *index = ibase;
4781debfc3dSmrg
4791debfc3dSmrg return true;
4801debfc3dSmrg }
4811debfc3dSmrg
4821debfc3dSmrg /* Tries to express REF_P in shape &BASE + STEP * iter + DELTA, where DELTA and
4831debfc3dSmrg STEP are integer constants and iter is number of iterations of LOOP. The
4841debfc3dSmrg reference occurs in statement STMT. Strips nonaddressable component
4851debfc3dSmrg references from REF_P. */
4861debfc3dSmrg
4871debfc3dSmrg static bool
analyze_ref(class loop * loop,tree * ref_p,tree * base,tree * step,HOST_WIDE_INT * delta,gimple * stmt)488*8feb0f0bSmrg analyze_ref (class loop *loop, tree *ref_p, tree *base,
4891debfc3dSmrg tree *step, HOST_WIDE_INT *delta,
4901debfc3dSmrg gimple *stmt)
4911debfc3dSmrg {
4921debfc3dSmrg struct ar_data ar_data;
4931debfc3dSmrg tree off;
4941debfc3dSmrg HOST_WIDE_INT bit_offset;
4951debfc3dSmrg tree ref = *ref_p;
4961debfc3dSmrg
4971debfc3dSmrg *step = NULL_TREE;
4981debfc3dSmrg *delta = 0;
4991debfc3dSmrg
5001debfc3dSmrg /* First strip off the component references. Ignore bitfields.
5011debfc3dSmrg Also strip off the real and imagine parts of a complex, so that
5021debfc3dSmrg they can have the same base. */
5031debfc3dSmrg if (TREE_CODE (ref) == REALPART_EXPR
5041debfc3dSmrg || TREE_CODE (ref) == IMAGPART_EXPR
5051debfc3dSmrg || (TREE_CODE (ref) == COMPONENT_REF
5061debfc3dSmrg && DECL_NONADDRESSABLE_P (TREE_OPERAND (ref, 1))))
5071debfc3dSmrg {
5081debfc3dSmrg if (TREE_CODE (ref) == IMAGPART_EXPR)
5091debfc3dSmrg *delta += int_size_in_bytes (TREE_TYPE (ref));
5101debfc3dSmrg ref = TREE_OPERAND (ref, 0);
5111debfc3dSmrg }
5121debfc3dSmrg
5131debfc3dSmrg *ref_p = ref;
5141debfc3dSmrg
5151debfc3dSmrg for (; TREE_CODE (ref) == COMPONENT_REF; ref = TREE_OPERAND (ref, 0))
5161debfc3dSmrg {
5171debfc3dSmrg off = DECL_FIELD_BIT_OFFSET (TREE_OPERAND (ref, 1));
5181debfc3dSmrg bit_offset = TREE_INT_CST_LOW (off);
5191debfc3dSmrg gcc_assert (bit_offset % BITS_PER_UNIT == 0);
5201debfc3dSmrg
5211debfc3dSmrg *delta += bit_offset / BITS_PER_UNIT;
5221debfc3dSmrg }
5231debfc3dSmrg
5241debfc3dSmrg *base = unshare_expr (ref);
5251debfc3dSmrg ar_data.loop = loop;
5261debfc3dSmrg ar_data.stmt = stmt;
5271debfc3dSmrg ar_data.step = step;
5281debfc3dSmrg ar_data.delta = delta;
5291debfc3dSmrg return for_each_index (base, idx_analyze_ref, &ar_data);
5301debfc3dSmrg }
5311debfc3dSmrg
5321debfc3dSmrg /* Record a memory reference REF to the list REFS. The reference occurs in
5331debfc3dSmrg LOOP in statement STMT and it is write if WRITE_P. Returns true if the
5341debfc3dSmrg reference was recorded, false otherwise. */
5351debfc3dSmrg
5361debfc3dSmrg static bool
gather_memory_references_ref(class loop * loop,struct mem_ref_group ** refs,tree ref,bool write_p,gimple * stmt)537*8feb0f0bSmrg gather_memory_references_ref (class loop *loop, struct mem_ref_group **refs,
5381debfc3dSmrg tree ref, bool write_p, gimple *stmt)
5391debfc3dSmrg {
5401debfc3dSmrg tree base, step;
5411debfc3dSmrg HOST_WIDE_INT delta;
5421debfc3dSmrg struct mem_ref_group *agrp;
5431debfc3dSmrg
5441debfc3dSmrg if (get_base_address (ref) == NULL)
5451debfc3dSmrg return false;
5461debfc3dSmrg
5471debfc3dSmrg if (!analyze_ref (loop, &ref, &base, &step, &delta, stmt))
5481debfc3dSmrg return false;
5491debfc3dSmrg /* If analyze_ref fails the default is a NULL_TREE. We can stop here. */
5501debfc3dSmrg if (step == NULL_TREE)
5511debfc3dSmrg return false;
5521debfc3dSmrg
5531debfc3dSmrg /* Stop if the address of BASE could not be taken. */
5541debfc3dSmrg if (may_be_nonaddressable_p (base))
5551debfc3dSmrg return false;
5561debfc3dSmrg
5571debfc3dSmrg /* Limit non-constant step prefetching only to the innermost loops and
5581debfc3dSmrg only when the step is loop invariant in the entire loop nest. */
5591debfc3dSmrg if (!cst_and_fits_in_hwi (step))
5601debfc3dSmrg {
5611debfc3dSmrg if (loop->inner != NULL)
5621debfc3dSmrg {
5631debfc3dSmrg if (dump_file && (dump_flags & TDF_DETAILS))
5641debfc3dSmrg {
5651debfc3dSmrg fprintf (dump_file, "Memory expression %p\n",(void *) ref );
566a2dc1f3fSmrg print_generic_expr (dump_file, ref, TDF_SLIM);
5671debfc3dSmrg fprintf (dump_file,":");
5681debfc3dSmrg dump_mem_details (dump_file, base, step, delta, write_p);
5691debfc3dSmrg fprintf (dump_file,
5701debfc3dSmrg "Ignoring %p, non-constant step prefetching is "
5711debfc3dSmrg "limited to inner most loops \n",
5721debfc3dSmrg (void *) ref);
5731debfc3dSmrg }
5741debfc3dSmrg return false;
5751debfc3dSmrg }
5761debfc3dSmrg else
5771debfc3dSmrg {
5781debfc3dSmrg if (!expr_invariant_in_loop_p (loop_outermost (loop), step))
5791debfc3dSmrg {
5801debfc3dSmrg if (dump_file && (dump_flags & TDF_DETAILS))
5811debfc3dSmrg {
5821debfc3dSmrg fprintf (dump_file, "Memory expression %p\n",(void *) ref );
583a2dc1f3fSmrg print_generic_expr (dump_file, ref, TDF_SLIM);
5841debfc3dSmrg fprintf (dump_file,":");
5851debfc3dSmrg dump_mem_details (dump_file, base, step, delta, write_p);
5861debfc3dSmrg fprintf (dump_file,
5871debfc3dSmrg "Not prefetching, ignoring %p due to "
5881debfc3dSmrg "loop variant step\n",
5891debfc3dSmrg (void *) ref);
5901debfc3dSmrg }
5911debfc3dSmrg return false;
5921debfc3dSmrg }
5931debfc3dSmrg }
5941debfc3dSmrg }
5951debfc3dSmrg
5961debfc3dSmrg /* Now we know that REF = &BASE + STEP * iter + DELTA, where DELTA and STEP
5971debfc3dSmrg are integer constants. */
5981debfc3dSmrg agrp = find_or_create_group (refs, base, step);
5991debfc3dSmrg record_ref (agrp, stmt, ref, delta, write_p);
6001debfc3dSmrg
6011debfc3dSmrg return true;
6021debfc3dSmrg }
6031debfc3dSmrg
6041debfc3dSmrg /* Record the suitable memory references in LOOP. NO_OTHER_REFS is set to
6051debfc3dSmrg true if there are no other memory references inside the loop. */
6061debfc3dSmrg
6071debfc3dSmrg static struct mem_ref_group *
gather_memory_references(class loop * loop,bool * no_other_refs,unsigned * ref_count)608*8feb0f0bSmrg gather_memory_references (class loop *loop, bool *no_other_refs, unsigned *ref_count)
6091debfc3dSmrg {
6101debfc3dSmrg basic_block *body = get_loop_body_in_dom_order (loop);
6111debfc3dSmrg basic_block bb;
6121debfc3dSmrg unsigned i;
6131debfc3dSmrg gimple_stmt_iterator bsi;
6141debfc3dSmrg gimple *stmt;
6151debfc3dSmrg tree lhs, rhs;
6161debfc3dSmrg struct mem_ref_group *refs = NULL;
6171debfc3dSmrg
6181debfc3dSmrg *no_other_refs = true;
6191debfc3dSmrg *ref_count = 0;
6201debfc3dSmrg
6211debfc3dSmrg /* Scan the loop body in order, so that the former references precede the
6221debfc3dSmrg later ones. */
6231debfc3dSmrg for (i = 0; i < loop->num_nodes; i++)
6241debfc3dSmrg {
6251debfc3dSmrg bb = body[i];
6261debfc3dSmrg if (bb->loop_father != loop)
6271debfc3dSmrg continue;
6281debfc3dSmrg
6291debfc3dSmrg for (bsi = gsi_start_bb (bb); !gsi_end_p (bsi); gsi_next (&bsi))
6301debfc3dSmrg {
6311debfc3dSmrg stmt = gsi_stmt (bsi);
6321debfc3dSmrg
6331debfc3dSmrg if (gimple_code (stmt) != GIMPLE_ASSIGN)
6341debfc3dSmrg {
6351debfc3dSmrg if (gimple_vuse (stmt)
6361debfc3dSmrg || (is_gimple_call (stmt)
6371debfc3dSmrg && !(gimple_call_flags (stmt) & ECF_CONST)))
6381debfc3dSmrg *no_other_refs = false;
6391debfc3dSmrg continue;
6401debfc3dSmrg }
6411debfc3dSmrg
6421debfc3dSmrg if (! gimple_vuse (stmt))
6431debfc3dSmrg continue;
6441debfc3dSmrg
6451debfc3dSmrg lhs = gimple_assign_lhs (stmt);
6461debfc3dSmrg rhs = gimple_assign_rhs1 (stmt);
6471debfc3dSmrg
6481debfc3dSmrg if (REFERENCE_CLASS_P (rhs))
6491debfc3dSmrg {
6501debfc3dSmrg *no_other_refs &= gather_memory_references_ref (loop, &refs,
6511debfc3dSmrg rhs, false, stmt);
6521debfc3dSmrg *ref_count += 1;
6531debfc3dSmrg }
6541debfc3dSmrg if (REFERENCE_CLASS_P (lhs))
6551debfc3dSmrg {
6561debfc3dSmrg *no_other_refs &= gather_memory_references_ref (loop, &refs,
6571debfc3dSmrg lhs, true, stmt);
6581debfc3dSmrg *ref_count += 1;
6591debfc3dSmrg }
6601debfc3dSmrg }
6611debfc3dSmrg }
6621debfc3dSmrg free (body);
6631debfc3dSmrg
6641debfc3dSmrg return refs;
6651debfc3dSmrg }
6661debfc3dSmrg
6671debfc3dSmrg /* Prune the prefetch candidate REF using the self-reuse. */
6681debfc3dSmrg
6691debfc3dSmrg static void
prune_ref_by_self_reuse(struct mem_ref * ref)6701debfc3dSmrg prune_ref_by_self_reuse (struct mem_ref *ref)
6711debfc3dSmrg {
6721debfc3dSmrg HOST_WIDE_INT step;
6731debfc3dSmrg bool backward;
6741debfc3dSmrg
6751debfc3dSmrg /* If the step size is non constant, we cannot calculate prefetch_mod. */
6761debfc3dSmrg if (!cst_and_fits_in_hwi (ref->group->step))
6771debfc3dSmrg return;
6781debfc3dSmrg
6791debfc3dSmrg step = int_cst_value (ref->group->step);
6801debfc3dSmrg
6811debfc3dSmrg backward = step < 0;
6821debfc3dSmrg
6831debfc3dSmrg if (step == 0)
6841debfc3dSmrg {
6851debfc3dSmrg /* Prefetch references to invariant address just once. */
6861debfc3dSmrg ref->prefetch_before = 1;
6871debfc3dSmrg return;
6881debfc3dSmrg }
6891debfc3dSmrg
6901debfc3dSmrg if (backward)
6911debfc3dSmrg step = -step;
6921debfc3dSmrg
6931debfc3dSmrg if (step > PREFETCH_BLOCK)
6941debfc3dSmrg return;
6951debfc3dSmrg
6961debfc3dSmrg if ((backward && HAVE_BACKWARD_PREFETCH)
6971debfc3dSmrg || (!backward && HAVE_FORWARD_PREFETCH))
6981debfc3dSmrg {
6991debfc3dSmrg ref->prefetch_before = 1;
7001debfc3dSmrg return;
7011debfc3dSmrg }
7021debfc3dSmrg
7031debfc3dSmrg ref->prefetch_mod = PREFETCH_BLOCK / step;
7041debfc3dSmrg }
7051debfc3dSmrg
7061debfc3dSmrg /* Divides X by BY, rounding down. */
7071debfc3dSmrg
7081debfc3dSmrg static HOST_WIDE_INT
ddown(HOST_WIDE_INT x,unsigned HOST_WIDE_INT by)7091debfc3dSmrg ddown (HOST_WIDE_INT x, unsigned HOST_WIDE_INT by)
7101debfc3dSmrg {
7111debfc3dSmrg gcc_assert (by > 0);
7121debfc3dSmrg
7131debfc3dSmrg if (x >= 0)
7141debfc3dSmrg return x / (HOST_WIDE_INT) by;
7151debfc3dSmrg else
7161debfc3dSmrg return (x + (HOST_WIDE_INT) by - 1) / (HOST_WIDE_INT) by;
7171debfc3dSmrg }
7181debfc3dSmrg
7191debfc3dSmrg /* Given a CACHE_LINE_SIZE and two inductive memory references
7201debfc3dSmrg with a common STEP greater than CACHE_LINE_SIZE and an address
7211debfc3dSmrg difference DELTA, compute the probability that they will fall
7221debfc3dSmrg in different cache lines. Return true if the computed miss rate
7231debfc3dSmrg is not greater than the ACCEPTABLE_MISS_RATE. DISTINCT_ITERS is the
7241debfc3dSmrg number of distinct iterations after which the pattern repeats itself.
7251debfc3dSmrg ALIGN_UNIT is the unit of alignment in bytes. */
7261debfc3dSmrg
7271debfc3dSmrg static bool
is_miss_rate_acceptable(unsigned HOST_WIDE_INT cache_line_size,HOST_WIDE_INT step,HOST_WIDE_INT delta,unsigned HOST_WIDE_INT distinct_iters,int align_unit)7281debfc3dSmrg is_miss_rate_acceptable (unsigned HOST_WIDE_INT cache_line_size,
7291debfc3dSmrg HOST_WIDE_INT step, HOST_WIDE_INT delta,
7301debfc3dSmrg unsigned HOST_WIDE_INT distinct_iters,
7311debfc3dSmrg int align_unit)
7321debfc3dSmrg {
7331debfc3dSmrg unsigned align, iter;
7341debfc3dSmrg int total_positions, miss_positions, max_allowed_miss_positions;
7351debfc3dSmrg int address1, address2, cache_line1, cache_line2;
7361debfc3dSmrg
7371debfc3dSmrg /* It always misses if delta is greater than or equal to the cache
7381debfc3dSmrg line size. */
7391debfc3dSmrg if (delta >= (HOST_WIDE_INT) cache_line_size)
7401debfc3dSmrg return false;
7411debfc3dSmrg
7421debfc3dSmrg miss_positions = 0;
7431debfc3dSmrg total_positions = (cache_line_size / align_unit) * distinct_iters;
7441debfc3dSmrg max_allowed_miss_positions = (ACCEPTABLE_MISS_RATE * total_positions) / 1000;
7451debfc3dSmrg
7461debfc3dSmrg /* Iterate through all possible alignments of the first
7471debfc3dSmrg memory reference within its cache line. */
7481debfc3dSmrg for (align = 0; align < cache_line_size; align += align_unit)
7491debfc3dSmrg
7501debfc3dSmrg /* Iterate through all distinct iterations. */
7511debfc3dSmrg for (iter = 0; iter < distinct_iters; iter++)
7521debfc3dSmrg {
7531debfc3dSmrg address1 = align + step * iter;
7541debfc3dSmrg address2 = address1 + delta;
7551debfc3dSmrg cache_line1 = address1 / cache_line_size;
7561debfc3dSmrg cache_line2 = address2 / cache_line_size;
7571debfc3dSmrg if (cache_line1 != cache_line2)
7581debfc3dSmrg {
7591debfc3dSmrg miss_positions += 1;
7601debfc3dSmrg if (miss_positions > max_allowed_miss_positions)
7611debfc3dSmrg return false;
7621debfc3dSmrg }
7631debfc3dSmrg }
7641debfc3dSmrg return true;
7651debfc3dSmrg }
7661debfc3dSmrg
7671debfc3dSmrg /* Prune the prefetch candidate REF using the reuse with BY.
7681debfc3dSmrg If BY_IS_BEFORE is true, BY is before REF in the loop. */
7691debfc3dSmrg
7701debfc3dSmrg static void
prune_ref_by_group_reuse(struct mem_ref * ref,struct mem_ref * by,bool by_is_before)7711debfc3dSmrg prune_ref_by_group_reuse (struct mem_ref *ref, struct mem_ref *by,
7721debfc3dSmrg bool by_is_before)
7731debfc3dSmrg {
7741debfc3dSmrg HOST_WIDE_INT step;
7751debfc3dSmrg bool backward;
7761debfc3dSmrg HOST_WIDE_INT delta_r = ref->delta, delta_b = by->delta;
7771debfc3dSmrg HOST_WIDE_INT delta = delta_b - delta_r;
7781debfc3dSmrg HOST_WIDE_INT hit_from;
7791debfc3dSmrg unsigned HOST_WIDE_INT prefetch_before, prefetch_block;
7801debfc3dSmrg HOST_WIDE_INT reduced_step;
7811debfc3dSmrg unsigned HOST_WIDE_INT reduced_prefetch_block;
7821debfc3dSmrg tree ref_type;
7831debfc3dSmrg int align_unit;
7841debfc3dSmrg
7851debfc3dSmrg /* If the step is non constant we cannot calculate prefetch_before. */
7861debfc3dSmrg if (!cst_and_fits_in_hwi (ref->group->step)) {
7871debfc3dSmrg return;
7881debfc3dSmrg }
7891debfc3dSmrg
7901debfc3dSmrg step = int_cst_value (ref->group->step);
7911debfc3dSmrg
7921debfc3dSmrg backward = step < 0;
7931debfc3dSmrg
7941debfc3dSmrg
7951debfc3dSmrg if (delta == 0)
7961debfc3dSmrg {
7971debfc3dSmrg /* If the references has the same address, only prefetch the
7981debfc3dSmrg former. */
7991debfc3dSmrg if (by_is_before)
8001debfc3dSmrg ref->prefetch_before = 0;
8011debfc3dSmrg
8021debfc3dSmrg return;
8031debfc3dSmrg }
8041debfc3dSmrg
8051debfc3dSmrg if (!step)
8061debfc3dSmrg {
8071debfc3dSmrg /* If the reference addresses are invariant and fall into the
8081debfc3dSmrg same cache line, prefetch just the first one. */
8091debfc3dSmrg if (!by_is_before)
8101debfc3dSmrg return;
8111debfc3dSmrg
8121debfc3dSmrg if (ddown (ref->delta, PREFETCH_BLOCK)
8131debfc3dSmrg != ddown (by->delta, PREFETCH_BLOCK))
8141debfc3dSmrg return;
8151debfc3dSmrg
8161debfc3dSmrg ref->prefetch_before = 0;
8171debfc3dSmrg return;
8181debfc3dSmrg }
8191debfc3dSmrg
8201debfc3dSmrg /* Only prune the reference that is behind in the array. */
8211debfc3dSmrg if (backward)
8221debfc3dSmrg {
8231debfc3dSmrg if (delta > 0)
8241debfc3dSmrg return;
8251debfc3dSmrg
8261debfc3dSmrg /* Transform the data so that we may assume that the accesses
8271debfc3dSmrg are forward. */
8281debfc3dSmrg delta = - delta;
8291debfc3dSmrg step = -step;
8301debfc3dSmrg delta_r = PREFETCH_BLOCK - 1 - delta_r;
8311debfc3dSmrg delta_b = PREFETCH_BLOCK - 1 - delta_b;
8321debfc3dSmrg }
8331debfc3dSmrg else
8341debfc3dSmrg {
8351debfc3dSmrg if (delta < 0)
8361debfc3dSmrg return;
8371debfc3dSmrg }
8381debfc3dSmrg
8391debfc3dSmrg /* Check whether the two references are likely to hit the same cache
8401debfc3dSmrg line, and how distant the iterations in that it occurs are from
8411debfc3dSmrg each other. */
8421debfc3dSmrg
8431debfc3dSmrg if (step <= PREFETCH_BLOCK)
8441debfc3dSmrg {
8451debfc3dSmrg /* The accesses are sure to meet. Let us check when. */
8461debfc3dSmrg hit_from = ddown (delta_b, PREFETCH_BLOCK) * PREFETCH_BLOCK;
8471debfc3dSmrg prefetch_before = (hit_from - delta_r + step - 1) / step;
8481debfc3dSmrg
8491debfc3dSmrg /* Do not reduce prefetch_before if we meet beyond cache size. */
8501debfc3dSmrg if (prefetch_before > absu_hwi (L2_CACHE_SIZE_BYTES / step))
8511debfc3dSmrg prefetch_before = PREFETCH_ALL;
8521debfc3dSmrg if (prefetch_before < ref->prefetch_before)
8531debfc3dSmrg ref->prefetch_before = prefetch_before;
8541debfc3dSmrg
8551debfc3dSmrg return;
8561debfc3dSmrg }
8571debfc3dSmrg
8581debfc3dSmrg /* A more complicated case with step > prefetch_block. First reduce
8591debfc3dSmrg the ratio between the step and the cache line size to its simplest
8601debfc3dSmrg terms. The resulting denominator will then represent the number of
8611debfc3dSmrg distinct iterations after which each address will go back to its
8621debfc3dSmrg initial location within the cache line. This computation assumes
8631debfc3dSmrg that PREFETCH_BLOCK is a power of two. */
8641debfc3dSmrg prefetch_block = PREFETCH_BLOCK;
8651debfc3dSmrg reduced_prefetch_block = prefetch_block;
8661debfc3dSmrg reduced_step = step;
8671debfc3dSmrg while ((reduced_step & 1) == 0
8681debfc3dSmrg && reduced_prefetch_block > 1)
8691debfc3dSmrg {
8701debfc3dSmrg reduced_step >>= 1;
8711debfc3dSmrg reduced_prefetch_block >>= 1;
8721debfc3dSmrg }
8731debfc3dSmrg
8741debfc3dSmrg prefetch_before = delta / step;
8751debfc3dSmrg delta %= step;
8761debfc3dSmrg ref_type = TREE_TYPE (ref->mem);
8771debfc3dSmrg align_unit = TYPE_ALIGN (ref_type) / 8;
8781debfc3dSmrg if (is_miss_rate_acceptable (prefetch_block, step, delta,
8791debfc3dSmrg reduced_prefetch_block, align_unit))
8801debfc3dSmrg {
8811debfc3dSmrg /* Do not reduce prefetch_before if we meet beyond cache size. */
8821debfc3dSmrg if (prefetch_before > L2_CACHE_SIZE_BYTES / PREFETCH_BLOCK)
8831debfc3dSmrg prefetch_before = PREFETCH_ALL;
8841debfc3dSmrg if (prefetch_before < ref->prefetch_before)
8851debfc3dSmrg ref->prefetch_before = prefetch_before;
8861debfc3dSmrg
8871debfc3dSmrg return;
8881debfc3dSmrg }
8891debfc3dSmrg
8901debfc3dSmrg /* Try also the following iteration. */
8911debfc3dSmrg prefetch_before++;
8921debfc3dSmrg delta = step - delta;
8931debfc3dSmrg if (is_miss_rate_acceptable (prefetch_block, step, delta,
8941debfc3dSmrg reduced_prefetch_block, align_unit))
8951debfc3dSmrg {
8961debfc3dSmrg if (prefetch_before < ref->prefetch_before)
8971debfc3dSmrg ref->prefetch_before = prefetch_before;
8981debfc3dSmrg
8991debfc3dSmrg return;
9001debfc3dSmrg }
9011debfc3dSmrg
9021debfc3dSmrg /* The ref probably does not reuse by. */
9031debfc3dSmrg return;
9041debfc3dSmrg }
9051debfc3dSmrg
9061debfc3dSmrg /* Prune the prefetch candidate REF using the reuses with other references
9071debfc3dSmrg in REFS. */
9081debfc3dSmrg
9091debfc3dSmrg static void
prune_ref_by_reuse(struct mem_ref * ref,struct mem_ref * refs)9101debfc3dSmrg prune_ref_by_reuse (struct mem_ref *ref, struct mem_ref *refs)
9111debfc3dSmrg {
9121debfc3dSmrg struct mem_ref *prune_by;
9131debfc3dSmrg bool before = true;
9141debfc3dSmrg
9151debfc3dSmrg prune_ref_by_self_reuse (ref);
9161debfc3dSmrg
9171debfc3dSmrg for (prune_by = refs; prune_by; prune_by = prune_by->next)
9181debfc3dSmrg {
9191debfc3dSmrg if (prune_by == ref)
9201debfc3dSmrg {
9211debfc3dSmrg before = false;
9221debfc3dSmrg continue;
9231debfc3dSmrg }
9241debfc3dSmrg
9251debfc3dSmrg if (!WRITE_CAN_USE_READ_PREFETCH
9261debfc3dSmrg && ref->write_p
9271debfc3dSmrg && !prune_by->write_p)
9281debfc3dSmrg continue;
9291debfc3dSmrg if (!READ_CAN_USE_WRITE_PREFETCH
9301debfc3dSmrg && !ref->write_p
9311debfc3dSmrg && prune_by->write_p)
9321debfc3dSmrg continue;
9331debfc3dSmrg
9341debfc3dSmrg prune_ref_by_group_reuse (ref, prune_by, before);
9351debfc3dSmrg }
9361debfc3dSmrg }
9371debfc3dSmrg
9381debfc3dSmrg /* Prune the prefetch candidates in GROUP using the reuse analysis. */
9391debfc3dSmrg
9401debfc3dSmrg static void
prune_group_by_reuse(struct mem_ref_group * group)9411debfc3dSmrg prune_group_by_reuse (struct mem_ref_group *group)
9421debfc3dSmrg {
9431debfc3dSmrg struct mem_ref *ref_pruned;
9441debfc3dSmrg
9451debfc3dSmrg for (ref_pruned = group->refs; ref_pruned; ref_pruned = ref_pruned->next)
9461debfc3dSmrg {
9471debfc3dSmrg prune_ref_by_reuse (ref_pruned, group->refs);
9481debfc3dSmrg
9491debfc3dSmrg if (dump_file && (dump_flags & TDF_DETAILS))
9501debfc3dSmrg {
951a2dc1f3fSmrg dump_mem_ref (dump_file, ref_pruned);
9521debfc3dSmrg
9531debfc3dSmrg if (ref_pruned->prefetch_before == PREFETCH_ALL
9541debfc3dSmrg && ref_pruned->prefetch_mod == 1)
9551debfc3dSmrg fprintf (dump_file, " no restrictions");
9561debfc3dSmrg else if (ref_pruned->prefetch_before == 0)
9571debfc3dSmrg fprintf (dump_file, " do not prefetch");
9581debfc3dSmrg else if (ref_pruned->prefetch_before <= ref_pruned->prefetch_mod)
9591debfc3dSmrg fprintf (dump_file, " prefetch once");
9601debfc3dSmrg else
9611debfc3dSmrg {
9621debfc3dSmrg if (ref_pruned->prefetch_before != PREFETCH_ALL)
9631debfc3dSmrg {
9641debfc3dSmrg fprintf (dump_file, " prefetch before ");
9651debfc3dSmrg fprintf (dump_file, HOST_WIDE_INT_PRINT_DEC,
9661debfc3dSmrg ref_pruned->prefetch_before);
9671debfc3dSmrg }
9681debfc3dSmrg if (ref_pruned->prefetch_mod != 1)
9691debfc3dSmrg {
9701debfc3dSmrg fprintf (dump_file, " prefetch mod ");
9711debfc3dSmrg fprintf (dump_file, HOST_WIDE_INT_PRINT_DEC,
9721debfc3dSmrg ref_pruned->prefetch_mod);
9731debfc3dSmrg }
9741debfc3dSmrg }
9751debfc3dSmrg fprintf (dump_file, "\n");
9761debfc3dSmrg }
9771debfc3dSmrg }
9781debfc3dSmrg }
9791debfc3dSmrg
9801debfc3dSmrg /* Prune the list of prefetch candidates GROUPS using the reuse analysis. */
9811debfc3dSmrg
9821debfc3dSmrg static void
prune_by_reuse(struct mem_ref_group * groups)9831debfc3dSmrg prune_by_reuse (struct mem_ref_group *groups)
9841debfc3dSmrg {
9851debfc3dSmrg for (; groups; groups = groups->next)
9861debfc3dSmrg prune_group_by_reuse (groups);
9871debfc3dSmrg }
9881debfc3dSmrg
9891debfc3dSmrg /* Returns true if we should issue prefetch for REF. */
9901debfc3dSmrg
9911debfc3dSmrg static bool
should_issue_prefetch_p(struct mem_ref * ref)9921debfc3dSmrg should_issue_prefetch_p (struct mem_ref *ref)
9931debfc3dSmrg {
994c0a68be4Smrg /* Do we want to issue prefetches for non-constant strides? */
995*8feb0f0bSmrg if (!cst_and_fits_in_hwi (ref->group->step)
996*8feb0f0bSmrg && param_prefetch_dynamic_strides == 0)
997c0a68be4Smrg {
998c0a68be4Smrg if (dump_file && (dump_flags & TDF_DETAILS))
999c0a68be4Smrg fprintf (dump_file,
1000c0a68be4Smrg "Skipping non-constant step for reference %u:%u\n",
1001c0a68be4Smrg ref->group->uid, ref->uid);
1002c0a68be4Smrg return false;
1003c0a68be4Smrg }
1004c0a68be4Smrg
1005c0a68be4Smrg /* Some processors may have a hardware prefetcher that may conflict with
1006c0a68be4Smrg prefetch hints for a range of strides. Make sure we don't issue
1007c0a68be4Smrg prefetches for such cases if the stride is within this particular
1008c0a68be4Smrg range. */
1009c0a68be4Smrg if (cst_and_fits_in_hwi (ref->group->step)
1010c0a68be4Smrg && abs_hwi (int_cst_value (ref->group->step))
1011*8feb0f0bSmrg < (HOST_WIDE_INT) param_prefetch_minimum_stride)
1012c0a68be4Smrg {
1013c0a68be4Smrg if (dump_file && (dump_flags & TDF_DETAILS))
1014c0a68be4Smrg fprintf (dump_file,
1015c0a68be4Smrg "Step for reference %u:%u (" HOST_WIDE_INT_PRINT_DEC
1016c0a68be4Smrg ") is less than the mininum required stride of %d\n",
1017c0a68be4Smrg ref->group->uid, ref->uid, int_cst_value (ref->group->step),
1018*8feb0f0bSmrg param_prefetch_minimum_stride);
1019c0a68be4Smrg return false;
1020c0a68be4Smrg }
1021c0a68be4Smrg
10221debfc3dSmrg /* For now do not issue prefetches for only first few of the
10231debfc3dSmrg iterations. */
10241debfc3dSmrg if (ref->prefetch_before != PREFETCH_ALL)
10251debfc3dSmrg {
10261debfc3dSmrg if (dump_file && (dump_flags & TDF_DETAILS))
1027a2dc1f3fSmrg fprintf (dump_file, "Ignoring reference %u:%u due to prefetch_before\n",
1028a2dc1f3fSmrg ref->group->uid, ref->uid);
10291debfc3dSmrg return false;
10301debfc3dSmrg }
10311debfc3dSmrg
10321debfc3dSmrg /* Do not prefetch nontemporal stores. */
10331debfc3dSmrg if (ref->storent_p)
10341debfc3dSmrg {
10351debfc3dSmrg if (dump_file && (dump_flags & TDF_DETAILS))
1036a2dc1f3fSmrg fprintf (dump_file, "Ignoring nontemporal store reference %u:%u\n", ref->group->uid, ref->uid);
10371debfc3dSmrg return false;
10381debfc3dSmrg }
10391debfc3dSmrg
10401debfc3dSmrg return true;
10411debfc3dSmrg }
10421debfc3dSmrg
10431debfc3dSmrg /* Decide which of the prefetch candidates in GROUPS to prefetch.
10441debfc3dSmrg AHEAD is the number of iterations to prefetch ahead (which corresponds
10451debfc3dSmrg to the number of simultaneous instances of one prefetch running at a
10461debfc3dSmrg time). UNROLL_FACTOR is the factor by that the loop is going to be
10471debfc3dSmrg unrolled. Returns true if there is anything to prefetch. */
10481debfc3dSmrg
10491debfc3dSmrg static bool
schedule_prefetches(struct mem_ref_group * groups,unsigned unroll_factor,unsigned ahead)10501debfc3dSmrg schedule_prefetches (struct mem_ref_group *groups, unsigned unroll_factor,
10511debfc3dSmrg unsigned ahead)
10521debfc3dSmrg {
10531debfc3dSmrg unsigned remaining_prefetch_slots, n_prefetches, prefetch_slots;
10541debfc3dSmrg unsigned slots_per_prefetch;
10551debfc3dSmrg struct mem_ref *ref;
10561debfc3dSmrg bool any = false;
10571debfc3dSmrg
1058*8feb0f0bSmrg /* At most param_simultaneous_prefetches should be running
1059*8feb0f0bSmrg at the same time. */
1060*8feb0f0bSmrg remaining_prefetch_slots = param_simultaneous_prefetches;
10611debfc3dSmrg
10621debfc3dSmrg /* The prefetch will run for AHEAD iterations of the original loop, i.e.,
10631debfc3dSmrg AHEAD / UNROLL_FACTOR iterations of the unrolled loop. In each iteration,
10641debfc3dSmrg it will need a prefetch slot. */
10651debfc3dSmrg slots_per_prefetch = (ahead + unroll_factor / 2) / unroll_factor;
10661debfc3dSmrg if (dump_file && (dump_flags & TDF_DETAILS))
10671debfc3dSmrg fprintf (dump_file, "Each prefetch instruction takes %u prefetch slots.\n",
10681debfc3dSmrg slots_per_prefetch);
10691debfc3dSmrg
10701debfc3dSmrg /* For now we just take memory references one by one and issue
10711debfc3dSmrg prefetches for as many as possible. The groups are sorted
10721debfc3dSmrg starting with the largest step, since the references with
10731debfc3dSmrg large step are more likely to cause many cache misses. */
10741debfc3dSmrg
10751debfc3dSmrg for (; groups; groups = groups->next)
10761debfc3dSmrg for (ref = groups->refs; ref; ref = ref->next)
10771debfc3dSmrg {
10781debfc3dSmrg if (!should_issue_prefetch_p (ref))
10791debfc3dSmrg continue;
10801debfc3dSmrg
10811debfc3dSmrg /* The loop is far from being sufficiently unrolled for this
10821debfc3dSmrg prefetch. Do not generate prefetch to avoid many redudant
10831debfc3dSmrg prefetches. */
10841debfc3dSmrg if (ref->prefetch_mod / unroll_factor > PREFETCH_MOD_TO_UNROLL_FACTOR_RATIO)
10851debfc3dSmrg continue;
10861debfc3dSmrg
10871debfc3dSmrg /* If we need to prefetch the reference each PREFETCH_MOD iterations,
10881debfc3dSmrg and we unroll the loop UNROLL_FACTOR times, we need to insert
10891debfc3dSmrg ceil (UNROLL_FACTOR / PREFETCH_MOD) instructions in each
10901debfc3dSmrg iteration. */
10911debfc3dSmrg n_prefetches = ((unroll_factor + ref->prefetch_mod - 1)
10921debfc3dSmrg / ref->prefetch_mod);
10931debfc3dSmrg prefetch_slots = n_prefetches * slots_per_prefetch;
10941debfc3dSmrg
10951debfc3dSmrg /* If more than half of the prefetches would be lost anyway, do not
10961debfc3dSmrg issue the prefetch. */
10971debfc3dSmrg if (2 * remaining_prefetch_slots < prefetch_slots)
10981debfc3dSmrg continue;
10991debfc3dSmrg
1100a2dc1f3fSmrg /* Stop prefetching if debug counter is activated. */
1101a2dc1f3fSmrg if (!dbg_cnt (prefetch))
1102a2dc1f3fSmrg continue;
1103a2dc1f3fSmrg
11041debfc3dSmrg ref->issue_prefetch_p = true;
1105a2dc1f3fSmrg if (dump_file && (dump_flags & TDF_DETAILS))
1106a2dc1f3fSmrg fprintf (dump_file, "Decided to issue prefetch for reference %u:%u\n",
1107a2dc1f3fSmrg ref->group->uid, ref->uid);
11081debfc3dSmrg
11091debfc3dSmrg if (remaining_prefetch_slots <= prefetch_slots)
11101debfc3dSmrg return true;
11111debfc3dSmrg remaining_prefetch_slots -= prefetch_slots;
11121debfc3dSmrg any = true;
11131debfc3dSmrg }
11141debfc3dSmrg
11151debfc3dSmrg return any;
11161debfc3dSmrg }
11171debfc3dSmrg
11181debfc3dSmrg /* Return TRUE if no prefetch is going to be generated in the given
11191debfc3dSmrg GROUPS. */
11201debfc3dSmrg
11211debfc3dSmrg static bool
nothing_to_prefetch_p(struct mem_ref_group * groups)11221debfc3dSmrg nothing_to_prefetch_p (struct mem_ref_group *groups)
11231debfc3dSmrg {
11241debfc3dSmrg struct mem_ref *ref;
11251debfc3dSmrg
11261debfc3dSmrg for (; groups; groups = groups->next)
11271debfc3dSmrg for (ref = groups->refs; ref; ref = ref->next)
11281debfc3dSmrg if (should_issue_prefetch_p (ref))
11291debfc3dSmrg return false;
11301debfc3dSmrg
11311debfc3dSmrg return true;
11321debfc3dSmrg }
11331debfc3dSmrg
11341debfc3dSmrg /* Estimate the number of prefetches in the given GROUPS.
11351debfc3dSmrg UNROLL_FACTOR is the factor by which LOOP was unrolled. */
11361debfc3dSmrg
11371debfc3dSmrg static int
estimate_prefetch_count(struct mem_ref_group * groups,unsigned unroll_factor)11381debfc3dSmrg estimate_prefetch_count (struct mem_ref_group *groups, unsigned unroll_factor)
11391debfc3dSmrg {
11401debfc3dSmrg struct mem_ref *ref;
11411debfc3dSmrg unsigned n_prefetches;
11421debfc3dSmrg int prefetch_count = 0;
11431debfc3dSmrg
11441debfc3dSmrg for (; groups; groups = groups->next)
11451debfc3dSmrg for (ref = groups->refs; ref; ref = ref->next)
11461debfc3dSmrg if (should_issue_prefetch_p (ref))
11471debfc3dSmrg {
11481debfc3dSmrg n_prefetches = ((unroll_factor + ref->prefetch_mod - 1)
11491debfc3dSmrg / ref->prefetch_mod);
11501debfc3dSmrg prefetch_count += n_prefetches;
11511debfc3dSmrg }
11521debfc3dSmrg
11531debfc3dSmrg return prefetch_count;
11541debfc3dSmrg }
11551debfc3dSmrg
11561debfc3dSmrg /* Issue prefetches for the reference REF into loop as decided before.
11571debfc3dSmrg HEAD is the number of iterations to prefetch ahead. UNROLL_FACTOR
11581debfc3dSmrg is the factor by which LOOP was unrolled. */
11591debfc3dSmrg
11601debfc3dSmrg static void
issue_prefetch_ref(struct mem_ref * ref,unsigned unroll_factor,unsigned ahead)11611debfc3dSmrg issue_prefetch_ref (struct mem_ref *ref, unsigned unroll_factor, unsigned ahead)
11621debfc3dSmrg {
11631debfc3dSmrg HOST_WIDE_INT delta;
11641debfc3dSmrg tree addr, addr_base, write_p, local, forward;
11651debfc3dSmrg gcall *prefetch;
11661debfc3dSmrg gimple_stmt_iterator bsi;
11671debfc3dSmrg unsigned n_prefetches, ap;
11681debfc3dSmrg bool nontemporal = ref->reuse_distance >= L2_CACHE_SIZE_BYTES;
11691debfc3dSmrg
11701debfc3dSmrg if (dump_file && (dump_flags & TDF_DETAILS))
1171a2dc1f3fSmrg fprintf (dump_file, "Issued%s prefetch for reference %u:%u.\n",
11721debfc3dSmrg nontemporal ? " nontemporal" : "",
1173a2dc1f3fSmrg ref->group->uid, ref->uid);
11741debfc3dSmrg
11751debfc3dSmrg bsi = gsi_for_stmt (ref->stmt);
11761debfc3dSmrg
11771debfc3dSmrg n_prefetches = ((unroll_factor + ref->prefetch_mod - 1)
11781debfc3dSmrg / ref->prefetch_mod);
11791debfc3dSmrg addr_base = build_fold_addr_expr_with_type (ref->mem, ptr_type_node);
11801debfc3dSmrg addr_base = force_gimple_operand_gsi (&bsi, unshare_expr (addr_base),
11811debfc3dSmrg true, NULL, true, GSI_SAME_STMT);
11821debfc3dSmrg write_p = ref->write_p ? integer_one_node : integer_zero_node;
11831debfc3dSmrg local = nontemporal ? integer_zero_node : integer_three_node;
11841debfc3dSmrg
11851debfc3dSmrg for (ap = 0; ap < n_prefetches; ap++)
11861debfc3dSmrg {
11871debfc3dSmrg if (cst_and_fits_in_hwi (ref->group->step))
11881debfc3dSmrg {
11891debfc3dSmrg /* Determine the address to prefetch. */
11901debfc3dSmrg delta = (ahead + ap * ref->prefetch_mod) *
11911debfc3dSmrg int_cst_value (ref->group->step);
11921debfc3dSmrg addr = fold_build_pointer_plus_hwi (addr_base, delta);
1193a2dc1f3fSmrg addr = force_gimple_operand_gsi (&bsi, unshare_expr (addr), true,
1194a2dc1f3fSmrg NULL, true, GSI_SAME_STMT);
11951debfc3dSmrg }
11961debfc3dSmrg else
11971debfc3dSmrg {
11981debfc3dSmrg /* The step size is non-constant but loop-invariant. We use the
11991debfc3dSmrg heuristic to simply prefetch ahead iterations ahead. */
12001debfc3dSmrg forward = fold_build2 (MULT_EXPR, sizetype,
12011debfc3dSmrg fold_convert (sizetype, ref->group->step),
12021debfc3dSmrg fold_convert (sizetype, size_int (ahead)));
12031debfc3dSmrg addr = fold_build_pointer_plus (addr_base, forward);
12041debfc3dSmrg addr = force_gimple_operand_gsi (&bsi, unshare_expr (addr), true,
12051debfc3dSmrg NULL, true, GSI_SAME_STMT);
12061debfc3dSmrg }
12071debfc3dSmrg
12081debfc3dSmrg if (addr_base != addr
12091debfc3dSmrg && TREE_CODE (addr_base) == SSA_NAME
12101debfc3dSmrg && TREE_CODE (addr) == SSA_NAME)
12111debfc3dSmrg {
12121debfc3dSmrg duplicate_ssa_name_ptr_info (addr, SSA_NAME_PTR_INFO (addr_base));
12131debfc3dSmrg /* As this isn't a plain copy we have to reset alignment
12141debfc3dSmrg information. */
12151debfc3dSmrg if (SSA_NAME_PTR_INFO (addr))
12161debfc3dSmrg mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr));
12171debfc3dSmrg }
12181debfc3dSmrg
12191debfc3dSmrg /* Create the prefetch instruction. */
12201debfc3dSmrg prefetch = gimple_build_call (builtin_decl_explicit (BUILT_IN_PREFETCH),
12211debfc3dSmrg 3, addr, write_p, local);
12221debfc3dSmrg gsi_insert_before (&bsi, prefetch, GSI_SAME_STMT);
12231debfc3dSmrg }
12241debfc3dSmrg }
12251debfc3dSmrg
12261debfc3dSmrg /* Issue prefetches for the references in GROUPS into loop as decided before.
12271debfc3dSmrg HEAD is the number of iterations to prefetch ahead. UNROLL_FACTOR is the
12281debfc3dSmrg factor by that LOOP was unrolled. */
12291debfc3dSmrg
12301debfc3dSmrg static void
issue_prefetches(struct mem_ref_group * groups,unsigned unroll_factor,unsigned ahead)12311debfc3dSmrg issue_prefetches (struct mem_ref_group *groups,
12321debfc3dSmrg unsigned unroll_factor, unsigned ahead)
12331debfc3dSmrg {
12341debfc3dSmrg struct mem_ref *ref;
12351debfc3dSmrg
12361debfc3dSmrg for (; groups; groups = groups->next)
12371debfc3dSmrg for (ref = groups->refs; ref; ref = ref->next)
12381debfc3dSmrg if (ref->issue_prefetch_p)
12391debfc3dSmrg issue_prefetch_ref (ref, unroll_factor, ahead);
12401debfc3dSmrg }
12411debfc3dSmrg
12421debfc3dSmrg /* Returns true if REF is a memory write for which a nontemporal store insn
12431debfc3dSmrg can be used. */
12441debfc3dSmrg
12451debfc3dSmrg static bool
12461debfc3dSmrg nontemporal_store_p (struct mem_ref *ref)
12471debfc3dSmrg {
12481debfc3dSmrg machine_mode mode;
12491debfc3dSmrg enum insn_code code;
12501debfc3dSmrg
12511debfc3dSmrg /* REF must be a write that is not reused. We require it to be independent
12521debfc3dSmrg of all other memory references in the loop, as the nontemporal stores may
12531debfc3dSmrg be reordered with respect to other memory references. */
12541debfc3dSmrg if (!ref->write_p
12551debfc3dSmrg || !ref->independent_p
12561debfc3dSmrg || ref->reuse_distance < L2_CACHE_SIZE_BYTES)
12571debfc3dSmrg return false;
12581debfc3dSmrg
12591debfc3dSmrg /* Check that we have the storent instruction for the mode. */
12601debfc3dSmrg mode = TYPE_MODE (TREE_TYPE (ref->mem));
12611debfc3dSmrg if (mode == BLKmode)
12621debfc3dSmrg return false;
12631debfc3dSmrg
12641debfc3dSmrg code = optab_handler (storent_optab, mode);
12651debfc3dSmrg return code != CODE_FOR_nothing;
12661debfc3dSmrg }
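
/* A hedged example of a store that would typically qualify: in

     double a[N];
     for (i = 0; i < N; i++)
       a[i] = 0.0;

   with N large enough that the array does not fit in the L2 cache, the
   store is a write that is not reused, it is independent of the other
   references, and its reuse distance is at least L2_CACHE_SIZE_BYTES, so
   it becomes a nontemporal-store candidate provided the target implements
   storent for DFmode.  */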
12671debfc3dSmrg
12681debfc3dSmrg /* If REF is a nontemporal store, we mark the corresponding modify statement
12691debfc3dSmrg and return true. Otherwise, we return false. */
12701debfc3dSmrg
12711debfc3dSmrg static bool
12721debfc3dSmrg mark_nontemporal_store (struct mem_ref *ref)
12731debfc3dSmrg {
12741debfc3dSmrg if (!nontemporal_store_p (ref))
12751debfc3dSmrg return false;
12761debfc3dSmrg
12771debfc3dSmrg if (dump_file && (dump_flags & TDF_DETAILS))
1278a2dc1f3fSmrg fprintf (dump_file, "Marked reference %u:%u as a nontemporal store.\n",
1279a2dc1f3fSmrg ref->group->uid, ref->uid);
12801debfc3dSmrg
12811debfc3dSmrg gimple_assign_set_nontemporal_move (ref->stmt, true);
12821debfc3dSmrg ref->storent_p = true;
12831debfc3dSmrg
12841debfc3dSmrg return true;
12851debfc3dSmrg }
12861debfc3dSmrg
12871debfc3dSmrg /* Issue a memory fence instruction after LOOP. */
12881debfc3dSmrg
12891debfc3dSmrg static void
1290*8feb0f0bSmrg emit_mfence_after_loop (class loop *loop)
12911debfc3dSmrg {
12921debfc3dSmrg vec<edge> exits = get_loop_exit_edges (loop);
12931debfc3dSmrg edge exit;
12941debfc3dSmrg gcall *call;
12951debfc3dSmrg gimple_stmt_iterator bsi;
12961debfc3dSmrg unsigned i;
12971debfc3dSmrg
12981debfc3dSmrg FOR_EACH_VEC_ELT (exits, i, exit)
12991debfc3dSmrg {
13001debfc3dSmrg call = gimple_build_call (FENCE_FOLLOWING_MOVNT, 0);
13011debfc3dSmrg
13021debfc3dSmrg if (!single_pred_p (exit->dest)
13031debfc3dSmrg /* If possible, we prefer not to insert the fence on other paths
13041debfc3dSmrg in cfg. */
13051debfc3dSmrg && !(exit->flags & EDGE_ABNORMAL))
13061debfc3dSmrg split_loop_exit_edge (exit);
13071debfc3dSmrg bsi = gsi_after_labels (exit->dest);
13081debfc3dSmrg
13091debfc3dSmrg gsi_insert_before (&bsi, call, GSI_NEW_STMT);
13101debfc3dSmrg }
13111debfc3dSmrg
13121debfc3dSmrg exits.release ();
13131debfc3dSmrg update_ssa (TODO_update_ssa_only_virtuals);
13141debfc3dSmrg }
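
/* Conceptually (illustrative only), on a target that defines
   FENCE_FOLLOWING_MOVNT the transformed code looks like

     for (i = 0; i < n; i++)
       a[i] = x;        -- marked as a nontemporal store
     fence ();          -- call inserted at the start of the exit block

   Each exit edge is split first, unless it is abnormal or its destination
   already has a single predecessor, so that the fence is not executed on
   unrelated paths through the CFG.  */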
13151debfc3dSmrg
13161debfc3dSmrg /* Returns true if we can use storent in LOOP, false otherwise. */
13171debfc3dSmrg
13181debfc3dSmrg static bool
1319*8feb0f0bSmrg may_use_storent_in_loop_p (class loop *loop)
13201debfc3dSmrg {
13211debfc3dSmrg bool ret = true;
13221debfc3dSmrg
13231debfc3dSmrg if (loop->inner != NULL)
13241debfc3dSmrg return false;
13251debfc3dSmrg
13261debfc3dSmrg /* If we must issue an mfence insn after using storent, check that there
13271debfc3dSmrg is a suitable place for it at each of the loop exits. */
13281debfc3dSmrg if (FENCE_FOLLOWING_MOVNT != NULL_TREE)
13291debfc3dSmrg {
13301debfc3dSmrg vec<edge> exits = get_loop_exit_edges (loop);
13311debfc3dSmrg unsigned i;
13321debfc3dSmrg edge exit;
13331debfc3dSmrg
13341debfc3dSmrg FOR_EACH_VEC_ELT (exits, i, exit)
13351debfc3dSmrg if ((exit->flags & EDGE_ABNORMAL)
13361debfc3dSmrg && exit->dest == EXIT_BLOCK_PTR_FOR_FN (cfun))
13371debfc3dSmrg ret = false;
13381debfc3dSmrg
13391debfc3dSmrg exits.release ();
13401debfc3dSmrg }
13411debfc3dSmrg
13421debfc3dSmrg return ret;
13431debfc3dSmrg }
13441debfc3dSmrg
13451debfc3dSmrg /* Marks nontemporal stores in LOOP. GROUPS contains the description of memory
13461debfc3dSmrg references in the loop. */
13471debfc3dSmrg
13481debfc3dSmrg static void
1349*8feb0f0bSmrg mark_nontemporal_stores (class loop *loop, struct mem_ref_group *groups)
13501debfc3dSmrg {
13511debfc3dSmrg struct mem_ref *ref;
13521debfc3dSmrg bool any = false;
13531debfc3dSmrg
13541debfc3dSmrg if (!may_use_storent_in_loop_p (loop))
13551debfc3dSmrg return;
13561debfc3dSmrg
13571debfc3dSmrg for (; groups; groups = groups->next)
13581debfc3dSmrg for (ref = groups->refs; ref; ref = ref->next)
13591debfc3dSmrg any |= mark_nontemporal_store (ref);
13601debfc3dSmrg
13611debfc3dSmrg if (any && FENCE_FOLLOWING_MOVNT != NULL_TREE)
13621debfc3dSmrg emit_mfence_after_loop (loop);
13631debfc3dSmrg }
13641debfc3dSmrg
13651debfc3dSmrg /* Determines whether we can profitably unroll LOOP FACTOR times, and if
13661debfc3dSmrg this is the case, fills in DESC with the description of the number of
13671debfc3dSmrg iterations. */
13681debfc3dSmrg
13691debfc3dSmrg static bool
1370*8feb0f0bSmrg should_unroll_loop_p (class loop *loop, class tree_niter_desc *desc,
13711debfc3dSmrg unsigned factor)
13721debfc3dSmrg {
13731debfc3dSmrg if (!can_unroll_loop_p (loop, factor, desc))
13741debfc3dSmrg return false;
13751debfc3dSmrg
13761debfc3dSmrg /* We only consider loops without control flow for unrolling. This is not
13771debfc3dSmrg a hard restriction -- tree_unroll_loop works with arbitrary loops
13781debfc3dSmrg as well; but the unrolling/prefetching is usually more profitable for
13791debfc3dSmrg loops consisting of a single basic block, and we want to limit the
13801debfc3dSmrg code growth. */
13811debfc3dSmrg if (loop->num_nodes > 2)
13821debfc3dSmrg return false;
13831debfc3dSmrg
13841debfc3dSmrg return true;
13851debfc3dSmrg }
13861debfc3dSmrg
13871debfc3dSmrg /* Determine the factor by which to unroll LOOP, from the information
13881debfc3dSmrg contained in the list of memory references REFS. The description of the
1389a2dc1f3fSmrg number of iterations of LOOP is stored to DESC. NINSNS is the number of
13901debfc3dSmrg insns of LOOP. EST_NITER is the estimated number of iterations of
13911debfc3dSmrg the loop, or -1 if no estimate is available. */
13921debfc3dSmrg
13931debfc3dSmrg static unsigned
1394*8feb0f0bSmrg determine_unroll_factor (class loop *loop, struct mem_ref_group *refs,
1395*8feb0f0bSmrg unsigned ninsns, class tree_niter_desc *desc,
13961debfc3dSmrg HOST_WIDE_INT est_niter)
13971debfc3dSmrg {
13981debfc3dSmrg unsigned upper_bound;
13991debfc3dSmrg unsigned nfactor, factor, mod_constraint;
14001debfc3dSmrg struct mem_ref_group *agp;
14011debfc3dSmrg struct mem_ref *ref;
14021debfc3dSmrg
14031debfc3dSmrg /* First check whether the loop is not too large to unroll. We ignore
14041debfc3dSmrg PARAM_MAX_UNROLL_TIMES, because for small loops, it prevented us
14051debfc3dSmrg from unrolling them enough to make exactly one cache line covered by each
14061debfc3dSmrg iteration. Also, the goal of PARAM_MAX_UNROLL_TIMES is to prevent
14071debfc3dSmrg us from unrolling the loops too many times in cases where we only expect
14081debfc3dSmrg gains from better scheduling and decreasing loop overhead, which is not
14091debfc3dSmrg the case here. */
1410*8feb0f0bSmrg upper_bound = param_max_unrolled_insns / ninsns;
14111debfc3dSmrg
14121debfc3dSmrg /* If we unrolled the loop more times than it iterates, the unrolled version
14131debfc3dSmrg of the loop would never be entered. */
14141debfc3dSmrg if (est_niter >= 0 && est_niter < (HOST_WIDE_INT) upper_bound)
14151debfc3dSmrg upper_bound = est_niter;
14161debfc3dSmrg
14171debfc3dSmrg if (upper_bound <= 1)
14181debfc3dSmrg return 1;
14191debfc3dSmrg
14201debfc3dSmrg /* Choose the factor so that we may prefetch each cache line just once,
14211debfc3dSmrg but bound the unrolling by UPPER_BOUND. */
14221debfc3dSmrg factor = 1;
14231debfc3dSmrg for (agp = refs; agp; agp = agp->next)
14241debfc3dSmrg for (ref = agp->refs; ref; ref = ref->next)
14251debfc3dSmrg if (should_issue_prefetch_p (ref))
14261debfc3dSmrg {
14271debfc3dSmrg mod_constraint = ref->prefetch_mod;
14281debfc3dSmrg nfactor = least_common_multiple (mod_constraint, factor);
14291debfc3dSmrg if (nfactor <= upper_bound)
14301debfc3dSmrg factor = nfactor;
14311debfc3dSmrg }
14321debfc3dSmrg
14331debfc3dSmrg if (!should_unroll_loop_p (loop, desc, factor))
14341debfc3dSmrg return 1;
14351debfc3dSmrg
14361debfc3dSmrg return factor;
14371debfc3dSmrg }
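
/* Worked example with made-up numbers: if param_max_unrolled_insns is 200
   and NINSNS is 10, then upper_bound = 20 (possibly lowered further to
   EST_NITER).  Two prefetched references with PREFETCH_MOD 4 and 8 give
   factor = lcm (1, 4) = 4 and then lcm (4, 8) = 8, which still fits below
   the bound; a further reference with PREFETCH_MOD 32 is ignored for this
   purpose, since lcm (8, 32) = 32 > 20.  The loop is then unrolled 8 times,
   provided should_unroll_loop_p agrees.  */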
14381debfc3dSmrg
14391debfc3dSmrg /* Returns the total volume of the memory references REFS, taking into account
14401debfc3dSmrg reuses in the innermost loop and cache line size. TODO -- we should also
14411debfc3dSmrg take into account reuses across the iterations of the loops in the loop
14421debfc3dSmrg nest. */
14431debfc3dSmrg
14441debfc3dSmrg static unsigned
14451debfc3dSmrg volume_of_references (struct mem_ref_group *refs)
14461debfc3dSmrg {
14471debfc3dSmrg unsigned volume = 0;
14481debfc3dSmrg struct mem_ref_group *gr;
14491debfc3dSmrg struct mem_ref *ref;
14501debfc3dSmrg
14511debfc3dSmrg for (gr = refs; gr; gr = gr->next)
14521debfc3dSmrg for (ref = gr->refs; ref; ref = ref->next)
14531debfc3dSmrg {
14541debfc3dSmrg /* Almost always reuses another value? */
14551debfc3dSmrg if (ref->prefetch_before != PREFETCH_ALL)
14561debfc3dSmrg continue;
14571debfc3dSmrg
14581debfc3dSmrg /* If several iterations access the same cache line, use the size of
14591debfc3dSmrg the line divided by this number. Otherwise, a cache line is
14601debfc3dSmrg accessed in each iteration. TODO -- in the latter case, we should
14611debfc3dSmrg take the size of the reference into account, rounding it up to a multiple
14621debfc3dSmrg of the cache line size. */
1463*8feb0f0bSmrg volume += param_l1_cache_line_size / ref->prefetch_mod;
14641debfc3dSmrg }
14651debfc3dSmrg return volume;
14661debfc3dSmrg }
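
/* For instance (illustrative, assuming a 64-byte L1 line): a reference that
   touches a new cache line in every iteration (PREFETCH_MOD 1) contributes
   64 bytes per iteration, one whose cache line is shared by 16 consecutive
   iterations (PREFETCH_MOD 16) contributes 64 / 16 = 4 bytes, and
   references with PREFETCH_BEFORE != PREFETCH_ALL are assumed to reuse
   cached data and contribute nothing.  */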
14671debfc3dSmrg
14681debfc3dSmrg /* Returns the volume of memory references accessed across VEC iterations of
14691debfc3dSmrg loops, whose sizes are described in the LOOP_SIZES array. N is the number
14701debfc3dSmrg of the loops in the nest (length of VEC and LOOP_SIZES vectors). */
14711debfc3dSmrg
14721debfc3dSmrg static unsigned
14731debfc3dSmrg volume_of_dist_vector (lambda_vector vec, unsigned *loop_sizes, unsigned n)
14741debfc3dSmrg {
14751debfc3dSmrg unsigned i;
14761debfc3dSmrg
14771debfc3dSmrg for (i = 0; i < n; i++)
14781debfc3dSmrg if (vec[i] != 0)
14791debfc3dSmrg break;
14801debfc3dSmrg
14811debfc3dSmrg if (i == n)
14821debfc3dSmrg return 0;
14831debfc3dSmrg
14841debfc3dSmrg gcc_assert (vec[i] > 0);
14851debfc3dSmrg
14861debfc3dSmrg /* We ignore the parts of the distance vector in subloops, since usually
14871debfc3dSmrg the numbers of iterations are much smaller. */
14881debfc3dSmrg return loop_sizes[i] * vec[i];
14891debfc3dSmrg }
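
/* Example (made-up sizes): for a two-level nest with LOOP_SIZES
   { 8192, 64 } -- bytes accessed per iteration of the outer and the inner
   loop respectively -- a distance vector of (0, 3) has its first nonzero
   entry in the inner loop, giving 64 * 3 = 192 bytes between the dependent
   accesses, while (2, 0) gives 8192 * 2 = 16384 bytes.  */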
14901debfc3dSmrg
14911debfc3dSmrg /* Add the steps of ACCESS_FN multiplied by STRIDE to the array STRIDES
14921debfc3dSmrg at the position corresponding to the loop of the step. N is the depth
14931debfc3dSmrg of the considered loop nest, and LOOP is its innermost loop. */
14941debfc3dSmrg
14951debfc3dSmrg static void
14961debfc3dSmrg add_subscript_strides (tree access_fn, unsigned stride,
1497*8feb0f0bSmrg HOST_WIDE_INT *strides, unsigned n, class loop *loop)
14981debfc3dSmrg {
1499*8feb0f0bSmrg class loop *aloop;
15001debfc3dSmrg tree step;
15011debfc3dSmrg HOST_WIDE_INT astep;
15021debfc3dSmrg unsigned min_depth = loop_depth (loop) - n;
15031debfc3dSmrg
15041debfc3dSmrg while (TREE_CODE (access_fn) == POLYNOMIAL_CHREC)
15051debfc3dSmrg {
15061debfc3dSmrg aloop = get_chrec_loop (access_fn);
15071debfc3dSmrg step = CHREC_RIGHT (access_fn);
15081debfc3dSmrg access_fn = CHREC_LEFT (access_fn);
15091debfc3dSmrg
15101debfc3dSmrg if ((unsigned) loop_depth (aloop) <= min_depth)
15111debfc3dSmrg continue;
15121debfc3dSmrg
15131debfc3dSmrg if (tree_fits_shwi_p (step))
15141debfc3dSmrg astep = tree_to_shwi (step);
15151debfc3dSmrg else
1516*8feb0f0bSmrg astep = param_l1_cache_line_size;
15171debfc3dSmrg
15181debfc3dSmrg strides[n - 1 - loop_depth (loop) + loop_depth (aloop)] += astep * stride;
15191debfc3dSmrg
15201debfc3dSmrg }
15211debfc3dSmrg }
15221debfc3dSmrg
15231debfc3dSmrg /* Returns the volume of memory references accessed between two consecutive
15241debfc3dSmrg self-reuses of the reference DR. We consider the subscripts of DR in N
15251debfc3dSmrg loops, and LOOP_SIZES contains the volumes of accesses in each of the
15261debfc3dSmrg loops. LOOP is the innermost loop of the current loop nest. */
15271debfc3dSmrg
15281debfc3dSmrg static unsigned
15291debfc3dSmrg self_reuse_distance (data_reference_p dr, unsigned *loop_sizes, unsigned n,
1530*8feb0f0bSmrg class loop *loop)
15311debfc3dSmrg {
15321debfc3dSmrg tree stride, access_fn;
15331debfc3dSmrg HOST_WIDE_INT *strides, astride;
15341debfc3dSmrg vec<tree> access_fns;
15351debfc3dSmrg tree ref = DR_REF (dr);
15361debfc3dSmrg unsigned i, ret = ~0u;
15371debfc3dSmrg
15381debfc3dSmrg /* In the following example:
15391debfc3dSmrg
15401debfc3dSmrg for (i = 0; i < N; i++)
15411debfc3dSmrg for (j = 0; j < N; j++)
15421debfc3dSmrg use (a[j][i]);
15431debfc3dSmrg the same cache line is accessed every N steps (except if the change from
15441debfc3dSmrg i to i + 1 crosses the boundary of the cache line). Thus, for self-reuse,
15451debfc3dSmrg we cannot rely purely on the results of the data dependence analysis.
15461debfc3dSmrg
15471debfc3dSmrg Instead, we compute the stride of the reference in each loop, and consider
15481debfc3dSmrg the innermost loop in which the stride is less than the cache size. */
15491debfc3dSmrg
15501debfc3dSmrg strides = XCNEWVEC (HOST_WIDE_INT, n);
15511debfc3dSmrg access_fns = DR_ACCESS_FNS (dr);
15521debfc3dSmrg
15531debfc3dSmrg FOR_EACH_VEC_ELT (access_fns, i, access_fn)
15541debfc3dSmrg {
15551debfc3dSmrg /* Keep track of the reference corresponding to the subscript, so that we
15561debfc3dSmrg know its stride. */
15571debfc3dSmrg while (handled_component_p (ref) && TREE_CODE (ref) != ARRAY_REF)
15581debfc3dSmrg ref = TREE_OPERAND (ref, 0);
15591debfc3dSmrg
15601debfc3dSmrg if (TREE_CODE (ref) == ARRAY_REF)
15611debfc3dSmrg {
15621debfc3dSmrg stride = TYPE_SIZE_UNIT (TREE_TYPE (ref));
15631debfc3dSmrg if (tree_fits_uhwi_p (stride))
15641debfc3dSmrg astride = tree_to_uhwi (stride);
15651debfc3dSmrg else
1566*8feb0f0bSmrg astride = param_l1_cache_line_size;
15671debfc3dSmrg
15681debfc3dSmrg ref = TREE_OPERAND (ref, 0);
15691debfc3dSmrg }
15701debfc3dSmrg else
15711debfc3dSmrg astride = 1;
15721debfc3dSmrg
15731debfc3dSmrg add_subscript_strides (access_fn, astride, strides, n, loop);
15741debfc3dSmrg }
15751debfc3dSmrg
15761debfc3dSmrg for (i = n; i-- > 0; )
15771debfc3dSmrg {
15781debfc3dSmrg unsigned HOST_WIDE_INT s;
15791debfc3dSmrg
15801debfc3dSmrg s = strides[i] < 0 ? -strides[i] : strides[i];
15811debfc3dSmrg
1582*8feb0f0bSmrg if (s < (unsigned) param_l1_cache_line_size
15831debfc3dSmrg && (loop_sizes[i]
15841debfc3dSmrg > (unsigned) (L1_CACHE_SIZE_BYTES / NONTEMPORAL_FRACTION)))
15851debfc3dSmrg {
15861debfc3dSmrg ret = loop_sizes[i];
15871debfc3dSmrg break;
15881debfc3dSmrg }
15891debfc3dSmrg }
15901debfc3dSmrg
15911debfc3dSmrg free (strides);
15921debfc3dSmrg return ret;
15931debfc3dSmrg }
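
/* Continuing the a[j][i] example above with purely illustrative numbers:
   for double a[1024][1024], the [j] subscript advances by one row
   (8192 bytes) per iteration of the j loop and the [i] subscript by
   8 bytes per iteration of the i loop, so STRIDES ends up roughly as
   { 8, 8192 } for the (i, j) nest.  Scanning from the innermost loop
   outwards, the j stride is not smaller than a cache line but the i
   stride is, so the self-reuse distance is LOOP_SIZES of the i loop,
   i.e. the data volume of one full sweep of the j loop.  */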
15941debfc3dSmrg
15951debfc3dSmrg /* Determines the distance until the first reuse of each reference in REFS
15961debfc3dSmrg in the loop nest of LOOP. NO_OTHER_REFS is true if there are no other
15971debfc3dSmrg memory references in the loop. Return false if the analysis fails. */
15981debfc3dSmrg
15991debfc3dSmrg static bool
1600*8feb0f0bSmrg determine_loop_nest_reuse (class loop *loop, struct mem_ref_group *refs,
16011debfc3dSmrg bool no_other_refs)
16021debfc3dSmrg {
1603*8feb0f0bSmrg class loop *nest, *aloop;
16041debfc3dSmrg vec<data_reference_p> datarefs = vNULL;
16051debfc3dSmrg vec<ddr_p> dependences = vNULL;
16061debfc3dSmrg struct mem_ref_group *gr;
16071debfc3dSmrg struct mem_ref *ref, *refb;
16081debfc3dSmrg auto_vec<loop_p> vloops;
16091debfc3dSmrg unsigned *loop_data_size;
16101debfc3dSmrg unsigned i, j, n;
16111debfc3dSmrg unsigned volume, dist, adist;
16121debfc3dSmrg HOST_WIDE_INT vol;
16131debfc3dSmrg data_reference_p dr;
16141debfc3dSmrg ddr_p dep;
16151debfc3dSmrg
16161debfc3dSmrg if (loop->inner)
16171debfc3dSmrg return true;
16181debfc3dSmrg
16191debfc3dSmrg /* Find the outermost loop of the loop nest of LOOP (we require that
16201debfc3dSmrg there are no sibling loops inside the nest). */
16211debfc3dSmrg nest = loop;
16221debfc3dSmrg while (1)
16231debfc3dSmrg {
16241debfc3dSmrg aloop = loop_outer (nest);
16251debfc3dSmrg
16261debfc3dSmrg if (aloop == current_loops->tree_root
16271debfc3dSmrg || aloop->inner->next)
16281debfc3dSmrg break;
16291debfc3dSmrg
16301debfc3dSmrg nest = aloop;
16311debfc3dSmrg }
16321debfc3dSmrg
16331debfc3dSmrg /* For each loop, determine the amount of data accessed in each iteration.
16341debfc3dSmrg We use this to estimate whether the reference is evicted from the
16351debfc3dSmrg cache before its reuse. */
16361debfc3dSmrg find_loop_nest (nest, &vloops);
16371debfc3dSmrg n = vloops.length ();
16381debfc3dSmrg loop_data_size = XNEWVEC (unsigned, n);
16391debfc3dSmrg volume = volume_of_references (refs);
16401debfc3dSmrg i = n;
16411debfc3dSmrg while (i-- != 0)
16421debfc3dSmrg {
16431debfc3dSmrg loop_data_size[i] = volume;
16441debfc3dSmrg /* Bound the volume by the L2 cache size, since above this bound,
16451debfc3dSmrg all dependence distances are equivalent. */
16461debfc3dSmrg if (volume > L2_CACHE_SIZE_BYTES)
16471debfc3dSmrg continue;
16481debfc3dSmrg
16491debfc3dSmrg aloop = vloops[i];
16501debfc3dSmrg vol = estimated_stmt_executions_int (aloop);
16511debfc3dSmrg if (vol == -1)
16521debfc3dSmrg vol = expected_loop_iterations (aloop);
16531debfc3dSmrg volume *= vol;
16541debfc3dSmrg }
16551debfc3dSmrg
16561debfc3dSmrg /* Prepare the references in the form suitable for data dependence
16571debfc3dSmrg analysis. We ignore unanalyzable data references (the results
16581debfc3dSmrg are used just as a heuristic to estimate temporality of the
16591debfc3dSmrg references, hence we do not need to worry about correctness). */
16601debfc3dSmrg for (gr = refs; gr; gr = gr->next)
16611debfc3dSmrg for (ref = gr->refs; ref; ref = ref->next)
16621debfc3dSmrg {
1663a2dc1f3fSmrg dr = create_data_ref (loop_preheader_edge (nest),
1664a2dc1f3fSmrg loop_containing_stmt (ref->stmt),
1665a2dc1f3fSmrg ref->mem, ref->stmt, !ref->write_p, false);
16661debfc3dSmrg
16671debfc3dSmrg if (dr)
16681debfc3dSmrg {
16691debfc3dSmrg ref->reuse_distance = volume;
16701debfc3dSmrg dr->aux = ref;
16711debfc3dSmrg datarefs.safe_push (dr);
16721debfc3dSmrg }
16731debfc3dSmrg else
16741debfc3dSmrg no_other_refs = false;
16751debfc3dSmrg }
16761debfc3dSmrg
16771debfc3dSmrg FOR_EACH_VEC_ELT (datarefs, i, dr)
16781debfc3dSmrg {
16791debfc3dSmrg dist = self_reuse_distance (dr, loop_data_size, n, loop);
16801debfc3dSmrg ref = (struct mem_ref *) dr->aux;
16811debfc3dSmrg if (ref->reuse_distance > dist)
16821debfc3dSmrg ref->reuse_distance = dist;
16831debfc3dSmrg
16841debfc3dSmrg if (no_other_refs)
16851debfc3dSmrg ref->independent_p = true;
16861debfc3dSmrg }
16871debfc3dSmrg
16881debfc3dSmrg if (!compute_all_dependences (datarefs, &dependences, vloops, true))
16891debfc3dSmrg return false;
16901debfc3dSmrg
16911debfc3dSmrg FOR_EACH_VEC_ELT (dependences, i, dep)
16921debfc3dSmrg {
16931debfc3dSmrg if (DDR_ARE_DEPENDENT (dep) == chrec_known)
16941debfc3dSmrg continue;
16951debfc3dSmrg
16961debfc3dSmrg ref = (struct mem_ref *) DDR_A (dep)->aux;
16971debfc3dSmrg refb = (struct mem_ref *) DDR_B (dep)->aux;
16981debfc3dSmrg
16991debfc3dSmrg if (DDR_ARE_DEPENDENT (dep) == chrec_dont_know
1700a2dc1f3fSmrg || DDR_COULD_BE_INDEPENDENT_P (dep)
17011debfc3dSmrg || DDR_NUM_DIST_VECTS (dep) == 0)
17021debfc3dSmrg {
17031debfc3dSmrg /* If the dependence cannot be analyzed, assume that there might be
17041debfc3dSmrg a reuse. */
17051debfc3dSmrg dist = 0;
17061debfc3dSmrg
17071debfc3dSmrg ref->independent_p = false;
17081debfc3dSmrg refb->independent_p = false;
17091debfc3dSmrg }
17101debfc3dSmrg else
17111debfc3dSmrg {
17121debfc3dSmrg /* The distance vectors are normalized to be always lexicographically
17131debfc3dSmrg positive, hence we cannot tell just from them whether DDR_A comes
17141debfc3dSmrg before DDR_B or vice versa. However, it is not important,
17151debfc3dSmrg anyway -- if DDR_A is close to DDR_B, then it is either reused in
17161debfc3dSmrg DDR_B (and it is not nontemporal), or it reuses the value of DDR_B
17171debfc3dSmrg in cache (and marking it as nontemporal would not affect
17181debfc3dSmrg anything). */
17191debfc3dSmrg
17201debfc3dSmrg dist = volume;
17211debfc3dSmrg for (j = 0; j < DDR_NUM_DIST_VECTS (dep); j++)
17221debfc3dSmrg {
17231debfc3dSmrg adist = volume_of_dist_vector (DDR_DIST_VECT (dep, j),
17241debfc3dSmrg loop_data_size, n);
17251debfc3dSmrg
17261debfc3dSmrg /* If this is a dependence in the innermost loop (i.e., the
17271debfc3dSmrg distances in all superloops are zero) and it is not
17281debfc3dSmrg the trivial self-dependence with distance zero, record that
17291debfc3dSmrg the references are not completely independent. */
17301debfc3dSmrg if (lambda_vector_zerop (DDR_DIST_VECT (dep, j), n - 1)
17311debfc3dSmrg && (ref != refb
17321debfc3dSmrg || DDR_DIST_VECT (dep, j)[n-1] != 0))
17331debfc3dSmrg {
17341debfc3dSmrg ref->independent_p = false;
17351debfc3dSmrg refb->independent_p = false;
17361debfc3dSmrg }
17371debfc3dSmrg
17381debfc3dSmrg /* Ignore accesses closer than
17391debfc3dSmrg L1_CACHE_SIZE_BYTES / NONTEMPORAL_FRACTION,
17401debfc3dSmrg so that we use nontemporal prefetches e.g. if single memory
17411debfc3dSmrg location is accessed several times in a single iteration of
17421debfc3dSmrg the loop. */
17431debfc3dSmrg if (adist < L1_CACHE_SIZE_BYTES / NONTEMPORAL_FRACTION)
17441debfc3dSmrg continue;
17451debfc3dSmrg
17461debfc3dSmrg if (adist < dist)
17471debfc3dSmrg dist = adist;
17481debfc3dSmrg }
17491debfc3dSmrg }
17501debfc3dSmrg
17511debfc3dSmrg if (ref->reuse_distance > dist)
17521debfc3dSmrg ref->reuse_distance = dist;
17531debfc3dSmrg if (refb->reuse_distance > dist)
17541debfc3dSmrg refb->reuse_distance = dist;
17551debfc3dSmrg }
17561debfc3dSmrg
17571debfc3dSmrg free_dependence_relations (dependences);
17581debfc3dSmrg free_data_refs (datarefs);
17591debfc3dSmrg free (loop_data_size);
17601debfc3dSmrg
17611debfc3dSmrg if (dump_file && (dump_flags & TDF_DETAILS))
17621debfc3dSmrg {
17631debfc3dSmrg fprintf (dump_file, "Reuse distances:\n");
17641debfc3dSmrg for (gr = refs; gr; gr = gr->next)
17651debfc3dSmrg for (ref = gr->refs; ref; ref = ref->next)
1766a2dc1f3fSmrg fprintf (dump_file, " reference %u:%u distance %u\n",
1767a2dc1f3fSmrg ref->group->uid, ref->uid, ref->reuse_distance);
17681debfc3dSmrg }
17691debfc3dSmrg
17701debfc3dSmrg return true;
17711debfc3dSmrg }
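
/* A rough illustration: if the nest contains a store to a[i][j] and a read
   of a[i-1][j], their dependence has distance vector (1, 0), so the volume
   estimated between the two accesses is the data touched by one iteration
   of the outer loop, and the reuse distance of both references is lowered
   to at most that value.  This reuse distance is what nontemporal_store_p
   later compares against the L2 cache size.  */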
17721debfc3dSmrg
17731debfc3dSmrg /* Determine whether or not the trip count to ahead ratio is too small based
17741debfc3dSmrg on profitability considerations.
17751debfc3dSmrg AHEAD: the iteration ahead distance,
17761debfc3dSmrg EST_NITER: the estimated trip count. */
17771debfc3dSmrg
17781debfc3dSmrg static bool
17791debfc3dSmrg trip_count_to_ahead_ratio_too_small_p (unsigned ahead, HOST_WIDE_INT est_niter)
17801debfc3dSmrg {
17811debfc3dSmrg /* Assume trip count to ahead ratio is big enough if the trip count could not
17821debfc3dSmrg be estimated at compile time. */
17831debfc3dSmrg if (est_niter < 0)
17841debfc3dSmrg return false;
17851debfc3dSmrg
17861debfc3dSmrg if (est_niter < (HOST_WIDE_INT) (TRIP_COUNT_TO_AHEAD_RATIO * ahead))
17871debfc3dSmrg {
17881debfc3dSmrg if (dump_file && (dump_flags & TDF_DETAILS))
17891debfc3dSmrg fprintf (dump_file,
17901debfc3dSmrg "Not prefetching -- loop estimated to roll only %d times\n",
17911debfc3dSmrg (int) est_niter);
17921debfc3dSmrg return true;
17931debfc3dSmrg }
17941debfc3dSmrg
17951debfc3dSmrg return false;
17961debfc3dSmrg }
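
/* Numeric sketch (assuming TRIP_COUNT_TO_AHEAD_RATIO is a small constant
   such as 4): with AHEAD = 5, a loop estimated to roll fewer than 20 times
   is rejected, since the first AHEAD iterations cannot benefit from
   prefetching and prefetches issued for data past the last iteration are
   pure overhead.  */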
17971debfc3dSmrg
17981debfc3dSmrg /* Determine whether or not the number of memory references in the loop is
17991debfc3dSmrg reasonable based on the profitability and compilation time considerations.
18001debfc3dSmrg NINSNS: estimated number of instructions in the loop,
18011debfc3dSmrg MEM_REF_COUNT: total number of memory references in the loop. */
18021debfc3dSmrg
18031debfc3dSmrg static bool
18041debfc3dSmrg mem_ref_count_reasonable_p (unsigned ninsns, unsigned mem_ref_count)
18051debfc3dSmrg {
18061debfc3dSmrg int insn_to_mem_ratio;
18071debfc3dSmrg
18081debfc3dSmrg if (mem_ref_count == 0)
18091debfc3dSmrg return false;
18101debfc3dSmrg
18111debfc3dSmrg /* Miss rate computation (is_miss_rate_acceptable) and dependence analysis
18121debfc3dSmrg (compute_all_dependences) have costs quadratic in the number of references.
18131debfc3dSmrg To avoid huge compilation time, we give up prefetching if mem_ref_count
18141debfc3dSmrg is too large. */
18151debfc3dSmrg if (mem_ref_count > PREFETCH_MAX_MEM_REFS_PER_LOOP)
18161debfc3dSmrg return false;
18171debfc3dSmrg
18181debfc3dSmrg /* Prefetching improves performance by overlapping cache missing
18191debfc3dSmrg memory accesses with CPU operations. If the loop does not have
18201debfc3dSmrg enough CPU operations to overlap with memory operations, prefetching
18211debfc3dSmrg won't give a significant benefit. One approximate way of checking
18221debfc3dSmrg this is to require the ratio of instructions to memory references to
18231debfc3dSmrg be above a certain limit. This approximation works well in practice.
18241debfc3dSmrg TODO: Implement a more precise computation by estimating the time
18251debfc3dSmrg for each CPU or memory op in the loop. Time estimates for memory ops
18261debfc3dSmrg should account for cache misses. */
18271debfc3dSmrg insn_to_mem_ratio = ninsns / mem_ref_count;
18281debfc3dSmrg
1829*8feb0f0bSmrg if (insn_to_mem_ratio < param_prefetch_min_insn_to_mem_ratio)
18301debfc3dSmrg {
18311debfc3dSmrg if (dump_file && (dump_flags & TDF_DETAILS))
18321debfc3dSmrg fprintf (dump_file,
18331debfc3dSmrg "Not prefetching -- instruction to memory reference ratio (%d) too small\n",
18341debfc3dSmrg insn_to_mem_ratio);
18351debfc3dSmrg return false;
18361debfc3dSmrg }
18371debfc3dSmrg
18381debfc3dSmrg return true;
18391debfc3dSmrg }
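
/* Example with made-up parameter values: a loop with NINSNS = 30 and
   MEM_REF_COUNT = 10 has insn_to_mem_ratio = 3; if
   param_prefetch_min_insn_to_mem_ratio is also 3 the loop is still
   accepted, whereas 15 memory references (ratio 2) would cause it to be
   skipped, because there would not be enough non-memory work to overlap
   with the outstanding prefetches.  */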
18401debfc3dSmrg
18411debfc3dSmrg /* Determine whether or not the instruction to prefetch ratio in the loop is
18421debfc3dSmrg too small based on profitability considerations.
18431debfc3dSmrg NINSNS: estimated number of instructions in the loop,
18441debfc3dSmrg PREFETCH_COUNT: an estimate of the number of prefetches,
18451debfc3dSmrg UNROLL_FACTOR: the factor to unroll the loop if prefetching. */
18461debfc3dSmrg
18471debfc3dSmrg static bool
18481debfc3dSmrg insn_to_prefetch_ratio_too_small_p (unsigned ninsns, unsigned prefetch_count,
18491debfc3dSmrg unsigned unroll_factor)
18501debfc3dSmrg {
18511debfc3dSmrg int insn_to_prefetch_ratio;
18521debfc3dSmrg
18531debfc3dSmrg /* Prefetching most likely causes performance degradation when the instruction
18541debfc3dSmrg to prefetch ratio is too small. Too many prefetch instructions in a loop
18551debfc3dSmrg may reduce the I-cache performance.
18561debfc3dSmrg (unroll_factor * ninsns) is used to estimate the number of instructions in
18571debfc3dSmrg the unrolled loop. This implementation is a bit simplistic -- the number
18581debfc3dSmrg of issued prefetch instructions is also affected by unrolling. So,
18591debfc3dSmrg prefetch_mod and the unroll factor should be taken into account when
18601debfc3dSmrg determining prefetch_count. Also, the number of insns of the unrolled
18611debfc3dSmrg loop will usually be significantly smaller than the number of insns of the
18621debfc3dSmrg original loop * unroll_factor (at least the induction variable increases
18631debfc3dSmrg and the exit branches will get eliminated), so it might be better to use
18641debfc3dSmrg tree_estimate_loop_size + estimated_unrolled_size. */
18651debfc3dSmrg insn_to_prefetch_ratio = (unroll_factor * ninsns) / prefetch_count;
1866*8feb0f0bSmrg if (insn_to_prefetch_ratio < param_min_insn_to_prefetch_ratio)
18671debfc3dSmrg {
18681debfc3dSmrg if (dump_file && (dump_flags & TDF_DETAILS))
18691debfc3dSmrg fprintf (dump_file,
18701debfc3dSmrg "Not prefetching -- instruction to prefetch ratio (%d) too small\n",
18711debfc3dSmrg insn_to_prefetch_ratio);
18721debfc3dSmrg return true;
18731debfc3dSmrg }
18741debfc3dSmrg
18751debfc3dSmrg return false;
18761debfc3dSmrg }
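
/* Example (hypothetical numbers): with UNROLL_FACTOR = 4, NINSNS = 20 and
   PREFETCH_COUNT = 10, the unrolled body is estimated at 80 insns and
   insn_to_prefetch_ratio = 8; if param_min_insn_to_prefetch_ratio is 9,
   prefetching is abandoned so that the loop body and the I-cache are not
   flooded with prefetch instructions.  */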
18771debfc3dSmrg
18781debfc3dSmrg
18791debfc3dSmrg /* Issue prefetch instructions for array references in LOOP. Returns
18801debfc3dSmrg true if the LOOP was unrolled. */
18811debfc3dSmrg
18821debfc3dSmrg static bool
1883*8feb0f0bSmrg loop_prefetch_arrays (class loop *loop)
18841debfc3dSmrg {
18851debfc3dSmrg struct mem_ref_group *refs;
18861debfc3dSmrg unsigned ahead, ninsns, time, unroll_factor;
18871debfc3dSmrg HOST_WIDE_INT est_niter;
1888*8feb0f0bSmrg class tree_niter_desc desc;
18891debfc3dSmrg bool unrolled = false, no_other_refs;
18901debfc3dSmrg unsigned prefetch_count;
18911debfc3dSmrg unsigned mem_ref_count;
18921debfc3dSmrg
18931debfc3dSmrg if (optimize_loop_nest_for_size_p (loop))
18941debfc3dSmrg {
18951debfc3dSmrg if (dump_file && (dump_flags & TDF_DETAILS))
18961debfc3dSmrg fprintf (dump_file, " ignored (cold area)\n");
18971debfc3dSmrg return false;
18981debfc3dSmrg }
18991debfc3dSmrg
19001debfc3dSmrg /* FIXME: the time should be weighted by the probabilities of the blocks in
19011debfc3dSmrg the loop body. */
19021debfc3dSmrg time = tree_num_loop_insns (loop, &eni_time_weights);
19031debfc3dSmrg if (time == 0)
19041debfc3dSmrg return false;
19051debfc3dSmrg
1906*8feb0f0bSmrg ahead = (param_prefetch_latency + time - 1) / time;
19071debfc3dSmrg est_niter = estimated_stmt_executions_int (loop);
19081debfc3dSmrg if (est_niter == -1)
19091debfc3dSmrg est_niter = likely_max_stmt_executions_int (loop);
19101debfc3dSmrg
19111debfc3dSmrg /* Prefetching is not likely to be profitable if the trip count to ahead
19121debfc3dSmrg ratio is too small. */
19131debfc3dSmrg if (trip_count_to_ahead_ratio_too_small_p (ahead, est_niter))
19141debfc3dSmrg return false;
19151debfc3dSmrg
19161debfc3dSmrg ninsns = tree_num_loop_insns (loop, &eni_size_weights);
19171debfc3dSmrg
19181debfc3dSmrg /* Step 1: gather the memory references. */
19191debfc3dSmrg refs = gather_memory_references (loop, &no_other_refs, &mem_ref_count);
19201debfc3dSmrg
19211debfc3dSmrg /* Give up prefetching if the number of memory references in the
19221debfc3dSmrg loop is not reasonable based on profitability and compilation time
19231debfc3dSmrg considerations. */
19241debfc3dSmrg if (!mem_ref_count_reasonable_p (ninsns, mem_ref_count))
19251debfc3dSmrg goto fail;
19261debfc3dSmrg
19271debfc3dSmrg /* Step 2: estimate the reuse effects. */
19281debfc3dSmrg prune_by_reuse (refs);
19291debfc3dSmrg
19301debfc3dSmrg if (nothing_to_prefetch_p (refs))
19311debfc3dSmrg goto fail;
19321debfc3dSmrg
19331debfc3dSmrg if (!determine_loop_nest_reuse (loop, refs, no_other_refs))
19341debfc3dSmrg goto fail;
19351debfc3dSmrg
19361debfc3dSmrg /* Step 3: determine unroll factor. */
19371debfc3dSmrg unroll_factor = determine_unroll_factor (loop, refs, ninsns, &desc,
19381debfc3dSmrg est_niter);
19391debfc3dSmrg
19401debfc3dSmrg /* Estimate prefetch count for the unrolled loop. */
19411debfc3dSmrg prefetch_count = estimate_prefetch_count (refs, unroll_factor);
19421debfc3dSmrg if (prefetch_count == 0)
19431debfc3dSmrg goto fail;
19441debfc3dSmrg
19451debfc3dSmrg if (dump_file && (dump_flags & TDF_DETAILS))
19461debfc3dSmrg fprintf (dump_file, "Ahead %d, unroll factor %d, trip count "
19471debfc3dSmrg HOST_WIDE_INT_PRINT_DEC "\n"
19481debfc3dSmrg "insn count %d, mem ref count %d, prefetch count %d\n",
19491debfc3dSmrg ahead, unroll_factor, est_niter,
19501debfc3dSmrg ninsns, mem_ref_count, prefetch_count);
19511debfc3dSmrg
19521debfc3dSmrg /* Prefetching is not likely to be profitable if the instruction to prefetch
19531debfc3dSmrg ratio is too small. */
19541debfc3dSmrg if (insn_to_prefetch_ratio_too_small_p (ninsns, prefetch_count,
19551debfc3dSmrg unroll_factor))
19561debfc3dSmrg goto fail;
19571debfc3dSmrg
19581debfc3dSmrg mark_nontemporal_stores (loop, refs);
19591debfc3dSmrg
19601debfc3dSmrg /* Step 4: what to prefetch? */
19611debfc3dSmrg if (!schedule_prefetches (refs, unroll_factor, ahead))
19621debfc3dSmrg goto fail;
19631debfc3dSmrg
19641debfc3dSmrg /* Step 5: unroll the loop. TODO -- peeling of first and last few
19651debfc3dSmrg iterations so that we do not issue superfluous prefetches. */
19661debfc3dSmrg if (unroll_factor != 1)
19671debfc3dSmrg {
19681debfc3dSmrg tree_unroll_loop (loop, unroll_factor,
19691debfc3dSmrg single_dom_exit (loop), &desc);
19701debfc3dSmrg unrolled = true;
19711debfc3dSmrg }
19721debfc3dSmrg
19731debfc3dSmrg /* Step 6: issue the prefetches. */
19741debfc3dSmrg issue_prefetches (refs, unroll_factor, ahead);
19751debfc3dSmrg
19761debfc3dSmrg fail:
19771debfc3dSmrg release_mem_refs (refs);
19781debfc3dSmrg return unrolled;
19791debfc3dSmrg }
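
/* End-to-end sketch with invented numbers: for a loop whose body costs
   time = 10 in eni_time_weights units on a machine with
   param_prefetch_latency = 200, AHEAD = (200 + 10 - 1) / 10 = 20, i.e.
   data needed roughly 20 iterations from now is prefetched.  The steps
   above then gather the memory references, prune those that reuse
   recently touched cache lines, choose an unroll factor, mark nontemporal
   stores, schedule the prefetches that fit the target's
   simultaneous-prefetch budget, unroll the loop if profitable, and
   finally emit the __builtin_prefetch calls.  */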
19801debfc3dSmrg
19811debfc3dSmrg /* Issue prefetch instructions for array references in loops. */
19821debfc3dSmrg
19831debfc3dSmrg unsigned int
19841debfc3dSmrg tree_ssa_prefetch_arrays (void)
19851debfc3dSmrg {
1986*8feb0f0bSmrg class loop *loop;
19871debfc3dSmrg bool unrolled = false;
19881debfc3dSmrg int todo_flags = 0;
19891debfc3dSmrg
19901debfc3dSmrg if (!targetm.have_prefetch ()
19911debfc3dSmrg /* It is possible to ask the compiler for, say, -mtune=i486 -march=pentium4.
19921debfc3dSmrg -mtune=i486 causes us to have PREFETCH_BLOCK 0, since this is part
19931debfc3dSmrg of processor costs and i486 does not have prefetch, but
19941debfc3dSmrg -march=pentium4 causes targetm.have_prefetch to be true. Ugh. */
19951debfc3dSmrg || PREFETCH_BLOCK == 0)
19961debfc3dSmrg return 0;
19971debfc3dSmrg
19981debfc3dSmrg if (dump_file && (dump_flags & TDF_DETAILS))
19991debfc3dSmrg {
20001debfc3dSmrg fprintf (dump_file, "Prefetching parameters:\n");
20011debfc3dSmrg fprintf (dump_file, " simultaneous prefetches: %d\n",
2002*8feb0f0bSmrg param_simultaneous_prefetches);
2003*8feb0f0bSmrg fprintf (dump_file, " prefetch latency: %d\n", param_prefetch_latency);
20041debfc3dSmrg fprintf (dump_file, " prefetch block size: %d\n", PREFETCH_BLOCK);
20051debfc3dSmrg fprintf (dump_file, " L1 cache size: %d lines, %d kB\n",
2006*8feb0f0bSmrg L1_CACHE_SIZE_BYTES / param_l1_cache_line_size,
2007*8feb0f0bSmrg param_l1_cache_size);
2008*8feb0f0bSmrg fprintf (dump_file, " L1 cache line size: %d\n",
2009*8feb0f0bSmrg param_l1_cache_line_size);
2010*8feb0f0bSmrg fprintf (dump_file, " L2 cache size: %d kB\n", param_l2_cache_size);
20111debfc3dSmrg fprintf (dump_file, " min insn-to-prefetch ratio: %d \n",
2012*8feb0f0bSmrg param_min_insn_to_prefetch_ratio);
20131debfc3dSmrg fprintf (dump_file, " min insn-to-mem ratio: %d \n",
2014*8feb0f0bSmrg param_prefetch_min_insn_to_mem_ratio);
20151debfc3dSmrg fprintf (dump_file, "\n");
20161debfc3dSmrg }
20171debfc3dSmrg
20181debfc3dSmrg initialize_original_copy_tables ();
20191debfc3dSmrg
20201debfc3dSmrg if (!builtin_decl_explicit_p (BUILT_IN_PREFETCH))
20211debfc3dSmrg {
20221debfc3dSmrg tree type = build_function_type_list (void_type_node,
20231debfc3dSmrg const_ptr_type_node, NULL_TREE);
20241debfc3dSmrg tree decl = add_builtin_function ("__builtin_prefetch", type,
20251debfc3dSmrg BUILT_IN_PREFETCH, BUILT_IN_NORMAL,
20261debfc3dSmrg NULL, NULL_TREE);
20271debfc3dSmrg DECL_IS_NOVOPS (decl) = true;
20281debfc3dSmrg set_builtin_decl (BUILT_IN_PREFETCH, decl, false);
20291debfc3dSmrg }
20301debfc3dSmrg
20311debfc3dSmrg FOR_EACH_LOOP (loop, LI_FROM_INNERMOST)
20321debfc3dSmrg {
20331debfc3dSmrg if (dump_file && (dump_flags & TDF_DETAILS))
20341debfc3dSmrg fprintf (dump_file, "Processing loop %d:\n", loop->num);
20351debfc3dSmrg
20361debfc3dSmrg unrolled |= loop_prefetch_arrays (loop);
20371debfc3dSmrg
20381debfc3dSmrg if (dump_file && (dump_flags & TDF_DETAILS))
20391debfc3dSmrg fprintf (dump_file, "\n\n");
20401debfc3dSmrg }
20411debfc3dSmrg
20421debfc3dSmrg if (unrolled)
20431debfc3dSmrg {
20441debfc3dSmrg scev_reset ();
20451debfc3dSmrg todo_flags |= TODO_cleanup_cfg;
20461debfc3dSmrg }
20471debfc3dSmrg
20481debfc3dSmrg free_original_copy_tables ();
20491debfc3dSmrg return todo_flags;
20501debfc3dSmrg }
20511debfc3dSmrg
20521debfc3dSmrg /* Prefetching. */
20531debfc3dSmrg
20541debfc3dSmrg namespace {
20551debfc3dSmrg
20561debfc3dSmrg const pass_data pass_data_loop_prefetch =
20571debfc3dSmrg {
20581debfc3dSmrg GIMPLE_PASS, /* type */
20591debfc3dSmrg "aprefetch", /* name */
20601debfc3dSmrg OPTGROUP_LOOP, /* optinfo_flags */
20611debfc3dSmrg TV_TREE_PREFETCH, /* tv_id */
20621debfc3dSmrg ( PROP_cfg | PROP_ssa ), /* properties_required */
20631debfc3dSmrg 0, /* properties_provided */
20641debfc3dSmrg 0, /* properties_destroyed */
20651debfc3dSmrg 0, /* todo_flags_start */
20661debfc3dSmrg 0, /* todo_flags_finish */
20671debfc3dSmrg };
20681debfc3dSmrg
20691debfc3dSmrg class pass_loop_prefetch : public gimple_opt_pass
20701debfc3dSmrg {
20711debfc3dSmrg public:
20721debfc3dSmrg pass_loop_prefetch (gcc::context *ctxt)
20731debfc3dSmrg : gimple_opt_pass (pass_data_loop_prefetch, ctxt)
20741debfc3dSmrg {}
20751debfc3dSmrg
20761debfc3dSmrg /* opt_pass methods: */
20771debfc3dSmrg virtual bool gate (function *) { return flag_prefetch_loop_arrays > 0; }
20781debfc3dSmrg virtual unsigned int execute (function *);
20791debfc3dSmrg
20801debfc3dSmrg }; // class pass_loop_prefetch
20811debfc3dSmrg
20821debfc3dSmrg unsigned int
20831debfc3dSmrg pass_loop_prefetch::execute (function *fun)
20841debfc3dSmrg {
20851debfc3dSmrg if (number_of_loops (fun) <= 1)
20861debfc3dSmrg return 0;
20871debfc3dSmrg
20881debfc3dSmrg if ((PREFETCH_BLOCK & (PREFETCH_BLOCK - 1)) != 0)
20891debfc3dSmrg {
20901debfc3dSmrg static bool warned = false;
20911debfc3dSmrg
20921debfc3dSmrg if (!warned)
20931debfc3dSmrg {
20941debfc3dSmrg warning (OPT_Wdisabled_optimization,
20951debfc3dSmrg "%<l1-cache-size%> parameter is not a power of two %d",
20961debfc3dSmrg PREFETCH_BLOCK);
20971debfc3dSmrg warned = true;
20981debfc3dSmrg }
20991debfc3dSmrg return 0;
21001debfc3dSmrg }
21011debfc3dSmrg
21021debfc3dSmrg return tree_ssa_prefetch_arrays ();
21031debfc3dSmrg }
21041debfc3dSmrg
21051debfc3dSmrg } // anon namespace
21061debfc3dSmrg
21071debfc3dSmrg gimple_opt_pass *
21081debfc3dSmrg make_pass_loop_prefetch (gcc::context *ctxt)
21091debfc3dSmrg {
21101debfc3dSmrg return new pass_loop_prefetch (ctxt);
21111debfc3dSmrg }
21121debfc3dSmrg
21131debfc3dSmrg
2114