1 /* Array prefetching.
2    Copyright (C) 2005-2015 Free Software Foundation, Inc.
3 
4 This file is part of GCC.
5 
6 GCC is free software; you can redistribute it and/or modify it
7 under the terms of the GNU General Public License as published by the
8 Free Software Foundation; either version 3, or (at your option) any
9 later version.
10 
11 GCC is distributed in the hope that it will be useful, but WITHOUT
12 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
14 for more details.
15 
16 You should have received a copy of the GNU General Public License
17 along with GCC; see the file COPYING3.  If not see
18 <http://www.gnu.org/licenses/>.  */
19 
20 #include "config.h"
21 #include "system.h"
22 #include "coretypes.h"
23 #include "tm.h"
24 #include "hash-set.h"
25 #include "machmode.h"
26 #include "vec.h"
27 #include "double-int.h"
28 #include "input.h"
29 #include "alias.h"
30 #include "symtab.h"
31 #include "wide-int.h"
32 #include "inchash.h"
33 #include "tree.h"
34 #include "fold-const.h"
35 #include "stor-layout.h"
36 #include "tm_p.h"
37 #include "predict.h"
38 #include "hard-reg-set.h"
39 #include "function.h"
40 #include "dominance.h"
41 #include "cfg.h"
42 #include "basic-block.h"
43 #include "tree-pretty-print.h"
44 #include "tree-ssa-alias.h"
45 #include "internal-fn.h"
46 #include "gimple-expr.h"
47 #include "is-a.h"
48 #include "gimple.h"
49 #include "gimplify.h"
50 #include "gimple-iterator.h"
51 #include "gimplify-me.h"
52 #include "gimple-ssa.h"
53 #include "tree-ssa-loop-ivopts.h"
54 #include "tree-ssa-loop-manip.h"
55 #include "tree-ssa-loop-niter.h"
56 #include "tree-ssa-loop.h"
57 #include "tree-into-ssa.h"
58 #include "cfgloop.h"
59 #include "tree-pass.h"
60 #include "insn-config.h"
61 #include "tree-chrec.h"
62 #include "tree-scalar-evolution.h"
63 #include "diagnostic-core.h"
64 #include "params.h"
65 #include "langhooks.h"
66 #include "tree-inline.h"
67 #include "tree-data-ref.h"
68 #include "diagnostic-core.h"
69 
70 
71 /* FIXME: Needed for optabs, but this should all be moved to a TBD interface
72    between the GIMPLE and RTL worlds.  */
73 #include "hashtab.h"
74 #include "rtl.h"
75 #include "flags.h"
76 #include "statistics.h"
77 #include "real.h"
78 #include "fixed-value.h"
79 #include "expmed.h"
80 #include "dojump.h"
81 #include "explow.h"
82 #include "calls.h"
83 #include "emit-rtl.h"
84 #include "varasm.h"
85 #include "stmt.h"
86 #include "expr.h"
87 #include "insn-codes.h"
88 #include "optabs.h"
89 #include "recog.h"
90 
91 /* This pass inserts prefetch instructions to optimize cache usage during
92    accesses to arrays in loops.  It processes loops sequentially and:
93 
94    1) Gathers all memory references in the single loop.
95    2) For each of the references it decides when it is profitable to prefetch
96       it.  To do this, we evaluate the reuse among the accesses and determine
97       two values: PREFETCH_BEFORE (meaning that it only makes sense to do
98       prefetching in the first PREFETCH_BEFORE iterations of the loop) and
99       PREFETCH_MOD (meaning that it only makes sense to prefetch in the
100       iterations of the loop that are zero modulo PREFETCH_MOD).  For example
101       (assuming cache line size is 64 bytes, char has size 1 byte and there
102       is no hardware sequential prefetch):
103 
104       char *a;
105       for (i = 0; i < max; i++)
106 	{
107 	  a[255] = ...;		(0)
108 	  a[i] = ...;		(1)
109 	  a[i + 64] = ...;	(2)
110 	  a[16*i] = ...;	(3)
111 	  a[187*i] = ...;	(4)
112 	  a[187*i + 50] = ...;	(5)
113 	}
114 
115        (0) obviously has PREFETCH_BEFORE 1
116        (1) has PREFETCH_BEFORE 64, since (2) accesses the same memory
117            location 64 iterations before it, and PREFETCH_MOD 64 (since
118 	   it hits the same cache line otherwise).
119        (2) has PREFETCH_MOD 64
120        (3) has PREFETCH_MOD 4
121        (4) has PREFETCH_MOD 1.  We do not set PREFETCH_BEFORE here, since
122            the cache line accessed by (5) is the same with probability only
123 	   7/32.
124        (5) has PREFETCH_MOD 1 as well.
125 
126       Additionally, we use data dependence analysis to determine for each
127       reference the distance till the first reuse; this information is used
128       to determine the temporality of the issued prefetch instruction.
129 
130    3) We determine how much ahead we need to prefetch.  The number of
131       iterations needed is time to fetch / time spent in one iteration of
132       the loop.  The problem is that we do not know either of these values,
133       so we just make a heuristic guess based on a magic (possibly)
134       target-specific constant and size of the loop.
135 
136    4) Determine which of the references we prefetch.  We take into account
137       that there is a maximum number of simultaneous prefetches (provided
138       by the machine description).  We issue as many prefetches as possible
139       while still within this bound (starting with those with lowest
140       prefetch_mod, since they are responsible for most of the cache
141       misses).
142 
143    5) We unroll and peel loops so that we are able to satisfy PREFETCH_MOD
144       and PREFETCH_BEFORE requirements (within some bounds), and to avoid
145       prefetching nonaccessed memory.
146       TODO -- actually implement peeling.
147 
148    6) We actually emit the prefetch instructions.  ??? Perhaps emit the
149       prefetch instructions with guards in cases where 5) was not sufficient
150       to satisfy the constraints?
151 
152    A cost model is implemented to determine whether or not prefetching is
153    profitable for a given loop.  The cost model has three heuristics:
154 
155    1. Function trip_count_to_ahead_ratio_too_small_p implements a
156       heuristic that determines whether or not the loop has too few
157       iterations (compared to ahead).  Prefetching is not likely to be
158       beneficial if the trip count to ahead ratio is below a certain
159       minimum.
160 
161    2. Function mem_ref_count_reasonable_p implements a heuristic that
162       determines whether the given loop has enough CPU ops that can be
163       overlapped with cache missing memory ops.  If not, the loop
164       won't benefit from prefetching.  In the implementation,
165       prefetching is not considered beneficial if the ratio between
166       the instruction count and the mem ref count is below a certain
167       minimum.
168 
169    3. Function insn_to_prefetch_ratio_too_small_p implements a
170       heuristic that disables prefetching in a loop if the prefetching
171       cost is above a certain limit.  The relative prefetching cost is
172       estimated by taking the ratio between the prefetch count and the
173       total instruction count (this models the I-cache cost).
174 
175    The limits used in these heuristics are defined as parameters with
176    reasonable default values. Machine-specific default values will be
177    added later.
178 
179    Some other TODO:
180       -- write and use a more general reuse analysis (that could also be used
181 	 in other cache-aimed loop optimizations)
182       -- make it behave sanely together with the prefetches given by the user
183 	 (now we just ignore them; at the very least we should avoid
184 	 optimizing loops in which the user has put his own prefetches)
185       -- we assume cache line size alignment of arrays; this could be
186 	 improved.  */
187 
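/* As a rough illustration of step 3) above: if a cache miss is assumed to
   cost on the order of 200 cycles and one iteration of the loop is
   estimated at 20 insns, we would aim to prefetch about 200 / 20 = 10
   iterations ahead.  With the default TRIP_COUNT_TO_AHEAD_RATIO of 4
   defined below, such a loop would then be considered for prefetching only
   when its estimated trip count is at least 4 * 10 = 40 iterations.  The
   concrete numbers are purely illustrative; the real values come from
   target- and parameter-dependent heuristics.  */
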
188 /* Magic constants follow.  These should be replaced by machine specific
189    numbers.  */
190 
191 /* True if a write can be prefetched by a read prefetch.  */
192 
193 #ifndef WRITE_CAN_USE_READ_PREFETCH
194 #define WRITE_CAN_USE_READ_PREFETCH 1
195 #endif
196 
197 /* True if a read can be prefetched by a write prefetch.  */
198 
199 #ifndef READ_CAN_USE_WRITE_PREFETCH
200 #define READ_CAN_USE_WRITE_PREFETCH 0
201 #endif
202 
203 /* The size of the block loaded by a single prefetch.  Usually, this is
204    the same as cache line size (at the moment, we only consider one level
205    of cache hierarchy).  */
206 
207 #ifndef PREFETCH_BLOCK
208 #define PREFETCH_BLOCK L1_CACHE_LINE_SIZE
209 #endif
210 
211 /* Do we have forward hardware sequential prefetching?  */
212 
213 #ifndef HAVE_FORWARD_PREFETCH
214 #define HAVE_FORWARD_PREFETCH 0
215 #endif
216 
217 /* Do we have backward hardware sequential prefetching?  */
218 
219 #ifndef HAVE_BACKWARD_PREFETCH
220 #define HAVE_BACKWARD_PREFETCH 0
221 #endif
222 
223 /* In some cases we are only able to determine that there is a certain
224    probability that the two accesses hit the same cache line.  In this
225    case, we issue the prefetches for both of them if this probability
226    is less than (1000 - ACCEPTABLE_MISS_RATE) per thousand.  */
227 
228 #ifndef ACCEPTABLE_MISS_RATE
229 #define ACCEPTABLE_MISS_RATE 50
230 #endif
231 
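/* For example, with the default value of 50, two accesses are assumed to
   share a cache line only when the estimated probability of that happening
   is at least 950 per thousand.  In the example at the head of this file,
   a[187*i] and a[187*i + 50] fall into the same 64-byte line with
   probability only 7/32, so prefetches are issued for both of them.  */
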
232 #ifndef HAVE_prefetch
233 #define HAVE_prefetch 0
234 #endif
235 
236 #define L1_CACHE_SIZE_BYTES ((unsigned) (L1_CACHE_SIZE * 1024))
237 #define L2_CACHE_SIZE_BYTES ((unsigned) (L2_CACHE_SIZE * 1024))
238 
239 /* We consider a memory access nontemporal if it is not reused sooner than
240    after L2_CACHE_SIZE_BYTES of memory are accessed.  However, we ignore
241    accesses closer than L1_CACHE_SIZE_BYTES / NONTEMPORAL_FRACTION,
242    so that we use nontemporal prefetches e.g. if a single memory location
243    is accessed several times in a single iteration of the loop.  */
244 #define NONTEMPORAL_FRACTION 16
245 
246 /* In case we have to emit a memory fence instruction after the loop that
247    uses nontemporal stores, this defines the builtin to use.  */
248 
249 #ifndef FENCE_FOLLOWING_MOVNT
250 #define FENCE_FOLLOWING_MOVNT NULL_TREE
251 #endif
252 
253 /* It is not profitable to prefetch when the trip count is not at
254    least TRIP_COUNT_TO_AHEAD_RATIO times the prefetch ahead distance.
255    For example, in a loop with a prefetch ahead distance of 10,
256    supposing that TRIP_COUNT_TO_AHEAD_RATIO is equal to 4, it is
257    profitable to prefetch when the trip count is greater than or equal to
258    40.  In that case, 30 out of the 40 iterations will benefit from
259    prefetching.  */
260 
261 #ifndef TRIP_COUNT_TO_AHEAD_RATIO
262 #define TRIP_COUNT_TO_AHEAD_RATIO 4
263 #endif
264 
265 /* The group of references between which reuse may occur.  */
266 
267 struct mem_ref_group
268 {
269   tree base;			/* Base of the reference.  */
270   tree step;			/* Step of the reference.  */
271   struct mem_ref *refs;		/* References in the group.  */
272   struct mem_ref_group *next;	/* Next group of references.  */
273 };
274 
275 /* Assigned to PREFETCH_BEFORE when all iterations are to be prefetched.  */
276 
277 #define PREFETCH_ALL		(~(unsigned HOST_WIDE_INT) 0)
278 
279 /* Do not generate a prefetch if the unroll factor is significantly less
280    than what is required by the prefetch.  This is to avoid redundant
281    prefetches.  For example, when prefetch_mod is 16 and unroll_factor is
282    2, prefetching requires unrolling the loop 16 times, but
283    the loop is actually unrolled twice.  In this case (ratio = 8),
284    prefetching is not likely to be beneficial.  */
285 
286 #ifndef PREFETCH_MOD_TO_UNROLL_FACTOR_RATIO
287 #define PREFETCH_MOD_TO_UNROLL_FACTOR_RATIO 4
288 #endif
289 
290 /* Some of the prefetch computations have quadratic complexity.  We want to
291    avoid huge compile times and, therefore, want to limit the amount of
292    memory references per loop where we consider prefetching.  */
293 
294 #ifndef PREFETCH_MAX_MEM_REFS_PER_LOOP
295 #define PREFETCH_MAX_MEM_REFS_PER_LOOP 200
296 #endif
297 
298 /* The memory reference.  */
299 
300 struct mem_ref
301 {
302   gimple stmt;			/* Statement in which the reference appears.  */
303   tree mem;			/* The reference.  */
304   HOST_WIDE_INT delta;		/* Constant offset of the reference.  */
305   struct mem_ref_group *group;	/* The group of references it belongs to.  */
306   unsigned HOST_WIDE_INT prefetch_mod;
307 				/* Prefetch only each PREFETCH_MOD-th
308 				   iteration.  */
309   unsigned HOST_WIDE_INT prefetch_before;
310 				/* Prefetch only first PREFETCH_BEFORE
311 				   iterations.  */
312   unsigned reuse_distance;	/* The amount of data accessed before the first
313 				   reuse of this value.  */
314   struct mem_ref *next;		/* The next reference in the group.  */
315   unsigned write_p : 1;		/* Is it a write?  */
316   unsigned independent_p : 1;	/* True if the reference is independent of
317 				   all other references inside the loop.  */
318   unsigned issue_prefetch_p : 1;	/* Should we really issue the prefetch?  */
319   unsigned storent_p : 1;	/* True if we changed the store to a
320 				   nontemporal one.  */
321 };
322 
323 /* Dumps information about a memory reference to FILE.  */
324 static void
325 dump_mem_details (FILE *file, tree base, tree step,
326 	    HOST_WIDE_INT delta, bool write_p)
327 {
328   fprintf (file, "(base ");
329   print_generic_expr (file, base, TDF_SLIM);
330   fprintf (file, ", step ");
331   if (cst_and_fits_in_hwi (step))
332     fprintf (file, HOST_WIDE_INT_PRINT_DEC, int_cst_value (step));
333   else
334     print_generic_expr (file, step, TDF_TREE);
335   fprintf (file, ")\n");
336   fprintf (file, "  delta ");
337   fprintf (file, HOST_WIDE_INT_PRINT_DEC, delta);
338   fprintf (file, "\n");
339   fprintf (file, "  %s\n", write_p ? "write" : "read");
340   fprintf (file, "\n");
341 }
342 
343 /* Dumps information about reference REF to FILE.  */
344 
345 static void
346 dump_mem_ref (FILE *file, struct mem_ref *ref)
347 {
348   fprintf (file, "Reference %p:\n", (void *) ref);
349 
350   fprintf (file, "  group %p ", (void *) ref->group);
351 
352   dump_mem_details (file, ref->group->base, ref->group->step, ref->delta,
353                    ref->write_p);
354 }
355 
356 /* Finds a group with BASE and STEP in GROUPS, or creates one if it does not
357    exist.  */
358 
359 static struct mem_ref_group *
360 find_or_create_group (struct mem_ref_group **groups, tree base, tree step)
361 {
362   struct mem_ref_group *group;
363 
364   for (; *groups; groups = &(*groups)->next)
365     {
366       if (operand_equal_p ((*groups)->step, step, 0)
367 	  && operand_equal_p ((*groups)->base, base, 0))
368 	return *groups;
369 
370       /* If step is an integer constant, keep the list of groups sorted
371          by decreasing step.  */
372       if (cst_and_fits_in_hwi ((*groups)->step) && cst_and_fits_in_hwi (step)
373 	  && int_cst_value ((*groups)->step) < int_cst_value (step))
374 	break;
375     }
376 
377   group = XNEW (struct mem_ref_group);
378   group->base = base;
379   group->step = step;
380   group->refs = NULL;
381   group->next = *groups;
382   *groups = group;
383 
384   return group;
385 }
386 
387 /* Records a memory reference MEM in GROUP with offset DELTA and write status
388    WRITE_P.  The reference occurs in statement STMT.  */
389 
390 static void
391 record_ref (struct mem_ref_group *group, gimple stmt, tree mem,
392 	    HOST_WIDE_INT delta, bool write_p)
393 {
394   struct mem_ref **aref;
395 
396   /* Do not record the same address twice.  */
397   for (aref = &group->refs; *aref; aref = &(*aref)->next)
398     {
399       /* It does not have to be possible for a write reference to reuse the read
400 	 prefetch, or vice versa.  */
401       if (!WRITE_CAN_USE_READ_PREFETCH
402 	  && write_p
403 	  && !(*aref)->write_p)
404 	continue;
405       if (!READ_CAN_USE_WRITE_PREFETCH
406 	  && !write_p
407 	  && (*aref)->write_p)
408 	continue;
409 
410       if ((*aref)->delta == delta)
411 	return;
412     }
413 
414   (*aref) = XNEW (struct mem_ref);
415   (*aref)->stmt = stmt;
416   (*aref)->mem = mem;
417   (*aref)->delta = delta;
418   (*aref)->write_p = write_p;
419   (*aref)->prefetch_before = PREFETCH_ALL;
420   (*aref)->prefetch_mod = 1;
421   (*aref)->reuse_distance = 0;
422   (*aref)->issue_prefetch_p = false;
423   (*aref)->group = group;
424   (*aref)->next = NULL;
425   (*aref)->independent_p = false;
426   (*aref)->storent_p = false;
427 
428   if (dump_file && (dump_flags & TDF_DETAILS))
429     dump_mem_ref (dump_file, *aref);
430 }
431 
432 /* Release memory references in GROUPS.  */
433 
434 static void
435 release_mem_refs (struct mem_ref_group *groups)
436 {
437   struct mem_ref_group *next_g;
438   struct mem_ref *ref, *next_r;
439 
440   for (; groups; groups = next_g)
441     {
442       next_g = groups->next;
443       for (ref = groups->refs; ref; ref = next_r)
444 	{
445 	  next_r = ref->next;
446 	  free (ref);
447 	}
448       free (groups);
449     }
450 }
451 
452 /* A structure used to pass arguments to idx_analyze_ref.  */
453 
454 struct ar_data
455 {
456   struct loop *loop;			/* Loop of the reference.  */
457   gimple stmt;				/* Statement of the reference.  */
458   tree *step;				/* Step of the memory reference.  */
459   HOST_WIDE_INT *delta;			/* Offset of the memory reference.  */
460 };
461 
462 /* Analyzes a single INDEX of a memory reference to obtain information
463    described at analyze_ref.  Callback for for_each_index.  */
464 
465 static bool
466 idx_analyze_ref (tree base, tree *index, void *data)
467 {
468   struct ar_data *ar_data = (struct ar_data *) data;
469   tree ibase, step, stepsize;
470   HOST_WIDE_INT idelta = 0, imult = 1;
471   affine_iv iv;
472 
473   if (!simple_iv (ar_data->loop, loop_containing_stmt (ar_data->stmt),
474 		  *index, &iv, true))
475     return false;
476   ibase = iv.base;
477   step = iv.step;
478 
479   if (TREE_CODE (ibase) == POINTER_PLUS_EXPR
480       && cst_and_fits_in_hwi (TREE_OPERAND (ibase, 1)))
481     {
482       idelta = int_cst_value (TREE_OPERAND (ibase, 1));
483       ibase = TREE_OPERAND (ibase, 0);
484     }
485   if (cst_and_fits_in_hwi (ibase))
486     {
487       idelta += int_cst_value (ibase);
488       ibase = build_int_cst (TREE_TYPE (ibase), 0);
489     }
490 
491   if (TREE_CODE (base) == ARRAY_REF)
492     {
493       stepsize = array_ref_element_size (base);
494       if (!cst_and_fits_in_hwi (stepsize))
495 	return false;
496       imult = int_cst_value (stepsize);
497       step = fold_build2 (MULT_EXPR, sizetype,
498 			  fold_convert (sizetype, step),
499 			  fold_convert (sizetype, stepsize));
500       idelta *= imult;
501     }
502 
503   if (*ar_data->step == NULL_TREE)
504     *ar_data->step = step;
505   else
506     *ar_data->step = fold_build2 (PLUS_EXPR, sizetype,
507 				  fold_convert (sizetype, *ar_data->step),
508 				  fold_convert (sizetype, step));
509   *ar_data->delta += idelta;
510   *index = ibase;
511 
512   return true;
513 }
514 
515 /* Tries to express REF_P in shape &BASE + STEP * iter + DELTA, where DELTA and
516    STEP are integer constants and iter is the number of iterations of LOOP.  The
517    reference occurs in statement STMT.  Strips nonaddressable component
518    references from REF_P.  */
519 
520 static bool
521 analyze_ref (struct loop *loop, tree *ref_p, tree *base,
522 	     tree *step, HOST_WIDE_INT *delta,
523 	     gimple stmt)
524 {
525   struct ar_data ar_data;
526   tree off;
527   HOST_WIDE_INT bit_offset;
528   tree ref = *ref_p;
529 
530   *step = NULL_TREE;
531   *delta = 0;
532 
533   /* First strip off the component references.  Ignore bitfields.
534      Also strip off the real and imaginary parts of a complex, so that
535      they can have the same base.  */
536   if (TREE_CODE (ref) == REALPART_EXPR
537       || TREE_CODE (ref) == IMAGPART_EXPR
538       || (TREE_CODE (ref) == COMPONENT_REF
539           && DECL_NONADDRESSABLE_P (TREE_OPERAND (ref, 1))))
540     {
541       if (TREE_CODE (ref) == IMAGPART_EXPR)
542         *delta += int_size_in_bytes (TREE_TYPE (ref));
543       ref = TREE_OPERAND (ref, 0);
544     }
545 
546   *ref_p = ref;
547 
548   for (; TREE_CODE (ref) == COMPONENT_REF; ref = TREE_OPERAND (ref, 0))
549     {
550       off = DECL_FIELD_BIT_OFFSET (TREE_OPERAND (ref, 1));
551       bit_offset = TREE_INT_CST_LOW (off);
552       gcc_assert (bit_offset % BITS_PER_UNIT == 0);
553 
554       *delta += bit_offset / BITS_PER_UNIT;
555     }
556 
557   *base = unshare_expr (ref);
558   ar_data.loop = loop;
559   ar_data.stmt = stmt;
560   ar_data.step = step;
561   ar_data.delta = delta;
562   return for_each_index (base, idx_analyze_ref, &ar_data);
563 }
564 
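/* As an illustrative sketch of the decomposition: for a reference such as
   a[i + 5], where 'a' is an array of 4-byte integers and 'i' is an
   induction variable starting at 0 and incremented by 1 each iteration, we
   would typically obtain STEP = 4 and DELTA = 20, with BASE being the
   reference with its index folded to zero (essentially &a[0]), so the
   address is &a[0] + 4 * iter + 20.  The exact result depends on how the
   reference is written and on what simple_iv can prove about the index.  */
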
565 /* Record a memory reference REF to the list REFS.  The reference occurs in
566    LOOP in statement STMT and it is write if WRITE_P.  Returns true if the
567    reference was recorded, false otherwise.  */
568 
569 static bool
570 gather_memory_references_ref (struct loop *loop, struct mem_ref_group **refs,
571 			      tree ref, bool write_p, gimple stmt)
572 {
573   tree base, step;
574   HOST_WIDE_INT delta;
575   struct mem_ref_group *agrp;
576 
577   if (get_base_address (ref) == NULL)
578     return false;
579 
580   if (!analyze_ref (loop, &ref, &base, &step, &delta, stmt))
581     return false;
582   /* If no step was determined, STEP stays NULL_TREE; we can stop here.  */
583   if (step == NULL_TREE)
584     return false;
585 
586   /* Stop if the address of BASE could not be taken.  */
587   if (may_be_nonaddressable_p (base))
588     return false;
589 
590   /* Limit non-constant step prefetching only to the innermost loops and
591      only when the step is loop invariant in the entire loop nest. */
592   if (!cst_and_fits_in_hwi (step))
593     {
594       if (loop->inner != NULL)
595         {
596           if (dump_file && (dump_flags & TDF_DETAILS))
597             {
598               fprintf (dump_file, "Memory expression %p\n", (void *) ref);
599               print_generic_expr (dump_file, ref, TDF_TREE);
600               fprintf (dump_file, ":");
601               dump_mem_details (dump_file, base, step, delta, write_p);
602               fprintf (dump_file,
603                        "Ignoring %p, non-constant step prefetching is "
604                        "limited to innermost loops\n",
605                        (void *) ref);
606             }
607           return false;
608         }
609       else
610         {
611           if (!expr_invariant_in_loop_p (loop_outermost (loop), step))
612             {
613               if (dump_file && (dump_flags & TDF_DETAILS))
614                 {
615                   fprintf (dump_file, "Memory expression %p\n", (void *) ref);
616                   print_generic_expr (dump_file, ref, TDF_TREE);
617                   fprintf (dump_file, ":");
618                   dump_mem_details (dump_file, base, step, delta, write_p);
619                   fprintf (dump_file,
620                            "Not prefetching, ignoring %p due to "
621                            "loop variant step\n",
622                            (void *) ref);
623                 }
624               return false;
625             }
626         }
627     }
628 
629   /* Now we know that REF = &BASE + STEP * iter + DELTA, where DELTA and STEP
630      are integer constants.  */
631   agrp = find_or_create_group (refs, base, step);
632   record_ref (agrp, stmt, ref, delta, write_p);
633 
634   return true;
635 }
636 
637 /* Record the suitable memory references in LOOP.  NO_OTHER_REFS is set to
638    true if there are no other memory references inside the loop.  */
639 
640 static struct mem_ref_group *
641 gather_memory_references (struct loop *loop, bool *no_other_refs, unsigned *ref_count)
642 {
643   basic_block *body = get_loop_body_in_dom_order (loop);
644   basic_block bb;
645   unsigned i;
646   gimple_stmt_iterator bsi;
647   gimple stmt;
648   tree lhs, rhs;
649   struct mem_ref_group *refs = NULL;
650 
651   *no_other_refs = true;
652   *ref_count = 0;
653 
654   /* Scan the loop body in order, so that earlier references precede the
655      later ones.  */
656   for (i = 0; i < loop->num_nodes; i++)
657     {
658       bb = body[i];
659       if (bb->loop_father != loop)
660 	continue;
661 
662       for (bsi = gsi_start_bb (bb); !gsi_end_p (bsi); gsi_next (&bsi))
663 	{
664 	  stmt = gsi_stmt (bsi);
665 
666 	  if (gimple_code (stmt) != GIMPLE_ASSIGN)
667 	    {
668 	      if (gimple_vuse (stmt)
669 		  || (is_gimple_call (stmt)
670 		      && !(gimple_call_flags (stmt) & ECF_CONST)))
671 		*no_other_refs = false;
672 	      continue;
673 	    }
674 
675 	  lhs = gimple_assign_lhs (stmt);
676 	  rhs = gimple_assign_rhs1 (stmt);
677 
678 	  if (REFERENCE_CLASS_P (rhs))
679 	    {
680 	    *no_other_refs &= gather_memory_references_ref (loop, &refs,
681 							    rhs, false, stmt);
682 	    *ref_count += 1;
683 	    }
684 	  if (REFERENCE_CLASS_P (lhs))
685 	    {
686 	    *no_other_refs &= gather_memory_references_ref (loop, &refs,
687 							    lhs, true, stmt);
688 	    *ref_count += 1;
689 	    }
690 	}
691     }
692   free (body);
693 
694   return refs;
695 }
696 
697 /* Prune the prefetch candidate REF using the self-reuse.  */
698 
699 static void
700 prune_ref_by_self_reuse (struct mem_ref *ref)
701 {
702   HOST_WIDE_INT step;
703   bool backward;
704 
705   /* If the step size is not constant, we cannot calculate prefetch_mod.  */
706   if (!cst_and_fits_in_hwi (ref->group->step))
707     return;
708 
709   step = int_cst_value (ref->group->step);
710 
711   backward = step < 0;
712 
713   if (step == 0)
714     {
715       /* Prefetch references to invariant address just once.  */
716       ref->prefetch_before = 1;
717       return;
718     }
719 
720   if (backward)
721     step = -step;
722 
723   if (step > PREFETCH_BLOCK)
724     return;
725 
726   if ((backward && HAVE_BACKWARD_PREFETCH)
727       || (!backward && HAVE_FORWARD_PREFETCH))
728     {
729       ref->prefetch_before = 1;
730       return;
731     }
732 
733   ref->prefetch_mod = PREFETCH_BLOCK / step;
734 }
735 
736 /* Divides X by BY, rounding down.  */
737 
738 static HOST_WIDE_INT
739 ddown (HOST_WIDE_INT x, unsigned HOST_WIDE_INT by)
740 {
741   gcc_assert (by > 0);
742 
743   if (x >= 0)
744     return x / by;
745   else
746     return (x + by - 1) / by;
747 }
748 
749 /* Given a CACHE_LINE_SIZE and two inductive memory references
750    with a common STEP greater than CACHE_LINE_SIZE and an address
751    difference DELTA, compute the probability that they will fall
752    in different cache lines.  Return true if the computed miss rate
753    is not greater than the ACCEPTABLE_MISS_RATE.  DISTINCT_ITERS is the
754    number of distinct iterations after which the pattern repeats itself.
755    ALIGN_UNIT is the unit of alignment in bytes.  */
756 
757 static bool
758 is_miss_rate_acceptable (unsigned HOST_WIDE_INT cache_line_size,
759 		   HOST_WIDE_INT step, HOST_WIDE_INT delta,
760 		   unsigned HOST_WIDE_INT distinct_iters,
761 		   int align_unit)
762 {
763   unsigned align, iter;
764   int total_positions, miss_positions, max_allowed_miss_positions;
765   int address1, address2, cache_line1, cache_line2;
766 
767   /* It always misses if delta is greater than or equal to the cache
768      line size.  */
769   if (delta >= (HOST_WIDE_INT) cache_line_size)
770     return false;
771 
772   miss_positions = 0;
773   total_positions = (cache_line_size / align_unit) * distinct_iters;
774   max_allowed_miss_positions = (ACCEPTABLE_MISS_RATE * total_positions) / 1000;
775 
776   /* Iterate through all possible alignments of the first
777      memory reference within its cache line.  */
778   for (align = 0; align < cache_line_size; align += align_unit)
779 
780     /* Iterate through all distinct iterations.  */
781     for (iter = 0; iter < distinct_iters; iter++)
782       {
783 	address1 = align + step * iter;
784 	address2 = address1 + delta;
785 	cache_line1 = address1 / cache_line_size;
786 	cache_line2 = address2 / cache_line_size;
787 	if (cache_line1 != cache_line2)
788 	  {
789 	    miss_positions += 1;
790             if (miss_positions > max_allowed_miss_positions)
791 	      return false;
792           }
793       }
794   return true;
795 }
796 
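/* A small worked example of the computation above, assuming a 64-byte
   cache line, ALIGN_UNIT of 4 and the default ACCEPTABLE_MISS_RATE of 50:
   for STEP = 96 and DELTA = 32, the pattern of line crossings repeats after
   DISTINCT_ITERS = 2 iterations, so there are (64 / 4) * 2 = 32 positions
   and at most 50 * 32 / 1000 = 1 miss may be tolerated.  The two addresses
   fall into different lines whenever the first one lands in the upper half
   of its line (roughly half of the positions), so the limit is exceeded
   quickly and the function returns false, i.e. the two references are not
   assumed to share a cache line.  */
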
797 /* Prune the prefetch candidate REF using the reuse with BY.
798    If BY_IS_BEFORE is true, BY is before REF in the loop.  */
799 
800 static void
801 prune_ref_by_group_reuse (struct mem_ref *ref, struct mem_ref *by,
802 			  bool by_is_before)
803 {
804   HOST_WIDE_INT step;
805   bool backward;
806   HOST_WIDE_INT delta_r = ref->delta, delta_b = by->delta;
807   HOST_WIDE_INT delta = delta_b - delta_r;
808   HOST_WIDE_INT hit_from;
809   unsigned HOST_WIDE_INT prefetch_before, prefetch_block;
810   HOST_WIDE_INT reduced_step;
811   unsigned HOST_WIDE_INT reduced_prefetch_block;
812   tree ref_type;
813   int align_unit;
814 
815   /* If the step is non constant we cannot calculate prefetch_before.  */
816   /* If the step is not constant, we cannot calculate prefetch_before.  */
817     return;
818   }
819 
820   step = int_cst_value (ref->group->step);
821 
822   backward = step < 0;
823 
824 
825   if (delta == 0)
826     {
827       /* If the references have the same address, only prefetch the
828 	 former.  */
829       if (by_is_before)
830 	ref->prefetch_before = 0;
831 
832       return;
833     }
834 
835   if (!step)
836     {
837       /* If the reference addresses are invariant and fall into the
838 	 same cache line, prefetch just the first one.  */
839       if (!by_is_before)
840 	return;
841 
842       if (ddown (ref->delta, PREFETCH_BLOCK)
843 	  != ddown (by->delta, PREFETCH_BLOCK))
844 	return;
845 
846       ref->prefetch_before = 0;
847       return;
848     }
849 
850   /* Only prune the reference that is behind in the array.  */
851   if (backward)
852     {
853       if (delta > 0)
854 	return;
855 
856       /* Transform the data so that we may assume that the accesses
857 	 are forward.  */
858       delta = - delta;
859       step = -step;
860       delta_r = PREFETCH_BLOCK - 1 - delta_r;
861       delta_b = PREFETCH_BLOCK - 1 - delta_b;
862     }
863   else
864     {
865       if (delta < 0)
866 	return;
867     }
868 
869   /* Check whether the two references are likely to hit the same cache
870      line, and how far apart the iterations in which this happens are
871      from each other.  */
872 
873   if (step <= PREFETCH_BLOCK)
874     {
875       /* The accesses are sure to meet.  Let us check when.  */
876       hit_from = ddown (delta_b, PREFETCH_BLOCK) * PREFETCH_BLOCK;
877       prefetch_before = (hit_from - delta_r + step - 1) / step;
878 
879       /* Do not reduce prefetch_before if we meet beyond cache size.  */
880       if (prefetch_before > absu_hwi (L2_CACHE_SIZE_BYTES / step))
881         prefetch_before = PREFETCH_ALL;
882       if (prefetch_before < ref->prefetch_before)
883 	ref->prefetch_before = prefetch_before;
884 
885       return;
886     }
887 
888   /* A more complicated case with step > prefetch_block.  First reduce
889      the ratio between the step and the cache line size to its simplest
890      terms.  The resulting denominator will then represent the number of
891      distinct iterations after which each address will go back to its
892      initial location within the cache line.  This computation assumes
893      that PREFETCH_BLOCK is a power of two.  */
894   prefetch_block = PREFETCH_BLOCK;
895   reduced_prefetch_block = prefetch_block;
896   reduced_step = step;
897   while ((reduced_step & 1) == 0
898 	 && reduced_prefetch_block > 1)
899     {
900       reduced_step >>= 1;
901       reduced_prefetch_block >>= 1;
902     }
903 
904   prefetch_before = delta / step;
905   delta %= step;
906   ref_type = TREE_TYPE (ref->mem);
907   align_unit = TYPE_ALIGN (ref_type) / 8;
908   if (is_miss_rate_acceptable (prefetch_block, step, delta,
909 			       reduced_prefetch_block, align_unit))
910     {
911       /* Do not reduce prefetch_before if we meet beyond cache size.  */
912       if (prefetch_before > L2_CACHE_SIZE_BYTES / PREFETCH_BLOCK)
913         prefetch_before = PREFETCH_ALL;
914       if (prefetch_before < ref->prefetch_before)
915 	ref->prefetch_before = prefetch_before;
916 
917       return;
918     }
919 
920   /* Try also the following iteration.  */
921   prefetch_before++;
922   delta = step - delta;
923   if (is_miss_rate_acceptable (prefetch_block, step, delta,
924 			       reduced_prefetch_block, align_unit))
925     {
926       if (prefetch_before < ref->prefetch_before)
927 	ref->prefetch_before = prefetch_before;
928 
929       return;
930     }
931 
932   /* The reference REF probably does not reuse BY.  */
933   return;
934 }
935 
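/* To illustrate the step > PREFETCH_BLOCK case above, assuming a 64-byte
   PREFETCH_BLOCK: for STEP = 96, the ratio 96 / 64 reduces to 3 / 2, so
   REDUCED_PREFETCH_BLOCK = 2 and each address returns to its initial
   position within a cache line every 2 iterations (2 * 96 = 3 * 64).  For
   a pair of references with DELTA = 200 this gives prefetch_before =
   200 / 96 = 2 and a remaining delta of 8; whether prefetch_before is
   actually lowered then depends on what is_miss_rate_acceptable says about
   that remaining delta.  */
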
936 /* Prune the prefetch candidate REF using the reuses with other references
937    in REFS.  */
938 
939 static void
940 prune_ref_by_reuse (struct mem_ref *ref, struct mem_ref *refs)
941 {
942   struct mem_ref *prune_by;
943   bool before = true;
944 
945   prune_ref_by_self_reuse (ref);
946 
947   for (prune_by = refs; prune_by; prune_by = prune_by->next)
948     {
949       if (prune_by == ref)
950 	{
951 	  before = false;
952 	  continue;
953 	}
954 
955       if (!WRITE_CAN_USE_READ_PREFETCH
956 	  && ref->write_p
957 	  && !prune_by->write_p)
958 	continue;
959       if (!READ_CAN_USE_WRITE_PREFETCH
960 	  && !ref->write_p
961 	  && prune_by->write_p)
962 	continue;
963 
964       prune_ref_by_group_reuse (ref, prune_by, before);
965     }
966 }
967 
968 /* Prune the prefetch candidates in GROUP using the reuse analysis.  */
969 
970 static void
971 prune_group_by_reuse (struct mem_ref_group *group)
972 {
973   struct mem_ref *ref_pruned;
974 
975   for (ref_pruned = group->refs; ref_pruned; ref_pruned = ref_pruned->next)
976     {
977       prune_ref_by_reuse (ref_pruned, group->refs);
978 
979       if (dump_file && (dump_flags & TDF_DETAILS))
980 	{
981 	  fprintf (dump_file, "Reference %p:", (void *) ref_pruned);
982 
983 	  if (ref_pruned->prefetch_before == PREFETCH_ALL
984 	      && ref_pruned->prefetch_mod == 1)
985 	    fprintf (dump_file, " no restrictions");
986 	  else if (ref_pruned->prefetch_before == 0)
987 	    fprintf (dump_file, " do not prefetch");
988 	  else if (ref_pruned->prefetch_before <= ref_pruned->prefetch_mod)
989 	    fprintf (dump_file, " prefetch once");
990 	  else
991 	    {
992 	      if (ref_pruned->prefetch_before != PREFETCH_ALL)
993 		{
994 		  fprintf (dump_file, " prefetch before ");
995 		  fprintf (dump_file, HOST_WIDE_INT_PRINT_DEC,
996 			   ref_pruned->prefetch_before);
997 		}
998 	      if (ref_pruned->prefetch_mod != 1)
999 		{
1000 		  fprintf (dump_file, " prefetch mod ");
1001 		  fprintf (dump_file, HOST_WIDE_INT_PRINT_DEC,
1002 			   ref_pruned->prefetch_mod);
1003 		}
1004 	    }
1005 	  fprintf (dump_file, "\n");
1006 	}
1007     }
1008 }
1009 
1010 /* Prune the list of prefetch candidates GROUPS using the reuse analysis.  */
1011 
1012 static void
1013 prune_by_reuse (struct mem_ref_group *groups)
1014 {
1015   for (; groups; groups = groups->next)
1016     prune_group_by_reuse (groups);
1017 }
1018 
1019 /* Returns true if we should issue prefetch for REF.  */
1020 
1021 static bool
1022 should_issue_prefetch_p (struct mem_ref *ref)
1023 {
1024   /* For now do not issue prefetches for only the first few of the
1025      iterations.  */
1026   if (ref->prefetch_before != PREFETCH_ALL)
1027     {
1028       if (dump_file && (dump_flags & TDF_DETAILS))
1029         fprintf (dump_file, "Ignoring %p due to prefetch_before\n",
1030 		 (void *) ref);
1031       return false;
1032     }
1033 
1034   /* Do not prefetch nontemporal stores.  */
1035   if (ref->storent_p)
1036     {
1037       if (dump_file && (dump_flags & TDF_DETAILS))
1038         fprintf (dump_file, "Ignoring nontemporal store %p\n", (void *) ref);
1039       return false;
1040     }
1041 
1042   return true;
1043 }
1044 
1045 /* Decide which of the prefetch candidates in GROUPS to prefetch.
1046    AHEAD is the number of iterations to prefetch ahead (which corresponds
1047    to the number of simultaneous instances of one prefetch running at a
1048    time).  UNROLL_FACTOR is the factor by which the loop is going to be
1049    unrolled.  Returns true if there is anything to prefetch.  */
1050 
1051 static bool
1052 schedule_prefetches (struct mem_ref_group *groups, unsigned unroll_factor,
1053 		     unsigned ahead)
1054 {
1055   unsigned remaining_prefetch_slots, n_prefetches, prefetch_slots;
1056   unsigned slots_per_prefetch;
1057   struct mem_ref *ref;
1058   bool any = false;
1059 
1060   /* At most SIMULTANEOUS_PREFETCHES should be running at the same time.  */
1061   remaining_prefetch_slots = SIMULTANEOUS_PREFETCHES;
1062 
1063   /* The prefetch will run for AHEAD iterations of the original loop, i.e.,
1064      AHEAD / UNROLL_FACTOR iterations of the unrolled loop.  In each iteration,
1065      it will need a prefetch slot.  */
1066   slots_per_prefetch = (ahead + unroll_factor / 2) / unroll_factor;
1067   if (dump_file && (dump_flags & TDF_DETAILS))
1068     fprintf (dump_file, "Each prefetch instruction takes %u prefetch slots.\n",
1069 	     slots_per_prefetch);
1070 
1071   /* For now we just take memory references one by one and issue
1072      prefetches for as many as possible.  The groups are sorted
1073      starting with the largest step, since the references with
1074      large step are more likely to cause many cache misses.  */
1075 
1076   for (; groups; groups = groups->next)
1077     for (ref = groups->refs; ref; ref = ref->next)
1078       {
1079 	if (!should_issue_prefetch_p (ref))
1080 	  continue;
1081 
1082         /* The loop is far from being sufficiently unrolled for this
1083            prefetch.  Do not generate the prefetch to avoid many redundant
1084            prefetches.  */
1085         if (ref->prefetch_mod / unroll_factor > PREFETCH_MOD_TO_UNROLL_FACTOR_RATIO)
1086           continue;
1087 
1088 	/* If we need to prefetch the reference each PREFETCH_MOD iterations,
1089 	   and we unroll the loop UNROLL_FACTOR times, we need to insert
1090 	   ceil (UNROLL_FACTOR / PREFETCH_MOD) instructions in each
1091 	   iteration.  */
1092 	n_prefetches = ((unroll_factor + ref->prefetch_mod - 1)
1093 			/ ref->prefetch_mod);
1094 	prefetch_slots = n_prefetches * slots_per_prefetch;
1095 
1096 	/* If more than half of the prefetches would be lost anyway, do not
1097 	   issue the prefetch.  */
1098 	if (2 * remaining_prefetch_slots < prefetch_slots)
1099 	  continue;
1100 
1101 	ref->issue_prefetch_p = true;
1102 
1103 	if (remaining_prefetch_slots <= prefetch_slots)
1104 	  return true;
1105 	remaining_prefetch_slots -= prefetch_slots;
1106 	any = true;
1107       }
1108 
1109   return any;
1110 }
1111 
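/* A sketch of the slot accounting above: with AHEAD = 8 and UNROLL_FACTOR
   = 4, each prefetch instruction occupies (8 + 2) / 4 = 2 slots.  A
   reference with prefetch_mod 4 then needs a single prefetch instruction
   per unrolled iteration, i.e. 2 slots, so if the target allowed, say, 6
   simultaneous prefetches (the actual value is target-dependent), three
   such references could be scheduled before the slots run out.  */
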
1112 /* Return TRUE if no prefetch is going to be generated in the given
1113    GROUPS.  */
1114 
1115 static bool
1116 nothing_to_prefetch_p (struct mem_ref_group *groups)
1117 {
1118   struct mem_ref *ref;
1119 
1120   for (; groups; groups = groups->next)
1121     for (ref = groups->refs; ref; ref = ref->next)
1122       if (should_issue_prefetch_p (ref))
1123 	return false;
1124 
1125   return true;
1126 }
1127 
1128 /* Estimate the number of prefetches in the given GROUPS.
1129    UNROLL_FACTOR is the factor by which LOOP was unrolled.  */
1130 
1131 static int
1132 estimate_prefetch_count (struct mem_ref_group *groups, unsigned unroll_factor)
1133 {
1134   struct mem_ref *ref;
1135   unsigned n_prefetches;
1136   int prefetch_count = 0;
1137 
1138   for (; groups; groups = groups->next)
1139     for (ref = groups->refs; ref; ref = ref->next)
1140       if (should_issue_prefetch_p (ref))
1141 	{
1142 	  n_prefetches = ((unroll_factor + ref->prefetch_mod - 1)
1143 			  / ref->prefetch_mod);
1144 	  prefetch_count += n_prefetches;
1145 	}
1146 
1147   return prefetch_count;
1148 }
1149 
1150 /* Issue prefetches for the reference REF into the loop as decided before.
1151    AHEAD is the number of iterations to prefetch ahead.  UNROLL_FACTOR
1152    is the factor by which LOOP was unrolled.  */
1153 
1154 static void
1155 issue_prefetch_ref (struct mem_ref *ref, unsigned unroll_factor, unsigned ahead)
1156 {
1157   HOST_WIDE_INT delta;
1158   tree addr, addr_base, write_p, local, forward;
1159   gcall *prefetch;
1160   gimple_stmt_iterator bsi;
1161   unsigned n_prefetches, ap;
1162   bool nontemporal = ref->reuse_distance >= L2_CACHE_SIZE_BYTES;
1163 
1164   if (dump_file && (dump_flags & TDF_DETAILS))
1165     fprintf (dump_file, "Issued%s prefetch for %p.\n",
1166 	     nontemporal ? " nontemporal" : "",
1167 	     (void *) ref);
1168 
1169   bsi = gsi_for_stmt (ref->stmt);
1170 
1171   n_prefetches = ((unroll_factor + ref->prefetch_mod - 1)
1172 		  / ref->prefetch_mod);
1173   addr_base = build_fold_addr_expr_with_type (ref->mem, ptr_type_node);
1174   addr_base = force_gimple_operand_gsi (&bsi, unshare_expr (addr_base),
1175 					true, NULL, true, GSI_SAME_STMT);
1176   write_p = ref->write_p ? integer_one_node : integer_zero_node;
1177   local = nontemporal ? integer_zero_node : integer_three_node;
1178 
1179   for (ap = 0; ap < n_prefetches; ap++)
1180     {
1181       if (cst_and_fits_in_hwi (ref->group->step))
1182         {
1183           /* Determine the address to prefetch.  */
1184           delta = (ahead + ap * ref->prefetch_mod) *
1185 		   int_cst_value (ref->group->step);
1186           addr = fold_build_pointer_plus_hwi (addr_base, delta);
1187           addr = force_gimple_operand_gsi (&bsi, unshare_expr (addr), true, NULL,
1188                                            true, GSI_SAME_STMT);
1189         }
1190       else
1191         {
1192           /* The step size is non-constant but loop-invariant.  We use the
1193              heuristic of simply prefetching AHEAD iterations ahead.  */
1194           forward = fold_build2 (MULT_EXPR, sizetype,
1195                                  fold_convert (sizetype, ref->group->step),
1196                                  fold_convert (sizetype, size_int (ahead)));
1197           addr = fold_build_pointer_plus (addr_base, forward);
1198           addr = force_gimple_operand_gsi (&bsi, unshare_expr (addr), true,
1199 					   NULL, true, GSI_SAME_STMT);
1200       }
1201       /* Create the prefetch instruction.  */
1202       prefetch = gimple_build_call (builtin_decl_explicit (BUILT_IN_PREFETCH),
1203 				    3, addr, write_p, local);
1204       gsi_insert_before (&bsi, prefetch, GSI_SAME_STMT);
1205     }
1206 }
1207 
1208 /* Issue prefetches for the references in GROUPS into loop as decided before.
1209    AHEAD is the number of iterations to prefetch ahead.  UNROLL_FACTOR is the
1210    factor by which LOOP was unrolled.  */
1211 
1212 static void
1213 issue_prefetches (struct mem_ref_group *groups,
1214 		  unsigned unroll_factor, unsigned ahead)
1215 {
1216   struct mem_ref *ref;
1217 
1218   for (; groups; groups = groups->next)
1219     for (ref = groups->refs; ref; ref = ref->next)
1220       if (ref->issue_prefetch_p)
1221 	issue_prefetch_ref (ref, unroll_factor, ahead);
1222 }
1223 
1224 /* Returns true if REF is a memory write for which a nontemporal store insn
1225    can be used.  */
1226 
1227 static bool
1228 nontemporal_store_p (struct mem_ref *ref)
1229 {
1230   machine_mode mode;
1231   enum insn_code code;
1232 
1233   /* REF must be a write that is not reused.  We require it to be independent
1234      of all other memory references in the loop, as the nontemporal stores may
1235      be reordered with respect to other memory references.  */
1236   if (!ref->write_p
1237       || !ref->independent_p
1238       || ref->reuse_distance < L2_CACHE_SIZE_BYTES)
1239     return false;
1240 
1241   /* Check that we have the storent instruction for the mode.  */
1242   mode = TYPE_MODE (TREE_TYPE (ref->mem));
1243   if (mode == BLKmode)
1244     return false;
1245 
1246   code = optab_handler (storent_optab, mode);
1247   return code != CODE_FOR_nothing;
1248 }
1249 
1250 /* If REF is a nontemporal store, we mark the corresponding modify statement
1251    and return true.  Otherwise, we return false.  */
1252 
1253 static bool
1254 mark_nontemporal_store (struct mem_ref *ref)
1255 {
1256   if (!nontemporal_store_p (ref))
1257     return false;
1258 
1259   if (dump_file && (dump_flags & TDF_DETAILS))
1260     fprintf (dump_file, "Marked reference %p as a nontemporal store.\n",
1261 	     (void *) ref);
1262 
1263   gimple_assign_set_nontemporal_move (ref->stmt, true);
1264   ref->storent_p = true;
1265 
1266   return true;
1267 }
1268 
1269 /* Issue a memory fence instruction after LOOP.  */
1270 
1271 static void
1272 emit_mfence_after_loop (struct loop *loop)
1273 {
1274   vec<edge> exits = get_loop_exit_edges (loop);
1275   edge exit;
1276   gcall *call;
1277   gimple_stmt_iterator bsi;
1278   unsigned i;
1279 
1280   FOR_EACH_VEC_ELT (exits, i, exit)
1281     {
1282       call = gimple_build_call (FENCE_FOLLOWING_MOVNT, 0);
1283 
1284       if (!single_pred_p (exit->dest)
1285 	  /* If possible, we prefer not to insert the fence on other paths
1286 	     in the CFG.  */
1287 	  && !(exit->flags & EDGE_ABNORMAL))
1288 	split_loop_exit_edge (exit);
1289       bsi = gsi_after_labels (exit->dest);
1290 
1291       gsi_insert_before (&bsi, call, GSI_NEW_STMT);
1292     }
1293 
1294   exits.release ();
1295   update_ssa (TODO_update_ssa_only_virtuals);
1296 }
1297 
1298 /* Returns true if we can use storent in LOOP, false otherwise.  */
1299 
1300 static bool
1301 may_use_storent_in_loop_p (struct loop *loop)
1302 {
1303   bool ret = true;
1304 
1305   if (loop->inner != NULL)
1306     return false;
1307 
1308   /* If we must issue an mfence insn after using storent, check that there
1309      is a suitable place for it at each of the loop exits.  */
1310   if (FENCE_FOLLOWING_MOVNT != NULL_TREE)
1311     {
1312       vec<edge> exits = get_loop_exit_edges (loop);
1313       unsigned i;
1314       edge exit;
1315 
1316       FOR_EACH_VEC_ELT (exits, i, exit)
1317 	if ((exit->flags & EDGE_ABNORMAL)
1318 	    && exit->dest == EXIT_BLOCK_PTR_FOR_FN (cfun))
1319 	  ret = false;
1320 
1321       exits.release ();
1322     }
1323 
1324   return ret;
1325 }
1326 
1327 /* Marks nontemporal stores in LOOP.  GROUPS contains the description of memory
1328    references in the loop.  */
1329 
1330 static void
1331 mark_nontemporal_stores (struct loop *loop, struct mem_ref_group *groups)
1332 {
1333   struct mem_ref *ref;
1334   bool any = false;
1335 
1336   if (!may_use_storent_in_loop_p (loop))
1337     return;
1338 
1339   for (; groups; groups = groups->next)
1340     for (ref = groups->refs; ref; ref = ref->next)
1341       any |= mark_nontemporal_store (ref);
1342 
1343   if (any && FENCE_FOLLOWING_MOVNT != NULL_TREE)
1344     emit_mfence_after_loop (loop);
1345 }
1346 
1347 /* Determines whether we can profitably unroll LOOP FACTOR times, and if
1348    this is the case, fill in DESC with the description of the number of
1349    iterations.  */
1350 
1351 static bool
1352 should_unroll_loop_p (struct loop *loop, struct tree_niter_desc *desc,
1353 		      unsigned factor)
1354 {
1355   if (!can_unroll_loop_p (loop, factor, desc))
1356     return false;
1357 
1358   /* We only consider loops without control flow for unrolling.  This is not
1359      a hard restriction -- tree_unroll_loop works with arbitrary loops
1360      as well; but the unrolling/prefetching is usually more profitable for
1361      loops consisting of a single basic block, and we want to limit the
1362      code growth.  */
1363   if (loop->num_nodes > 2)
1364     return false;
1365 
1366   return true;
1367 }
1368 
1369 /* Determine the factor by which to unroll LOOP, from the information
1370    contained in the list of memory references REFS.  Description of the
1371    number of iterations of LOOP is stored to DESC.  NINSNS is the number of
1372    insns of the LOOP.  EST_NITER is the estimated number of iterations of
1373    the loop, or -1 if no estimate is available.  */
1374 
1375 static unsigned
1376 determine_unroll_factor (struct loop *loop, struct mem_ref_group *refs,
1377 			 unsigned ninsns, struct tree_niter_desc *desc,
1378 			 HOST_WIDE_INT est_niter)
1379 {
1380   unsigned upper_bound;
1381   unsigned nfactor, factor, mod_constraint;
1382   struct mem_ref_group *agp;
1383   struct mem_ref *ref;
1384 
1385   /* First check whether the loop is not too large to unroll.  We ignore
1386      PARAM_MAX_UNROLL_TIMES, because for small loops, it prevented us
1387      from unrolling them enough to make exactly one cache line covered by each
1388      iteration.  Also, the goal of PARAM_MAX_UNROLL_TIMES is to prevent
1389      us from unrolling the loops too many times in cases where we only expect
1390      gains from better scheduling and decreasing loop overhead, which is not
1391      the case here.  */
1392   upper_bound = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / ninsns;
1393 
1394   /* If we unrolled the loop more times than it iterates, the unrolled version
1395      of the loop would never be entered.  */
1396   if (est_niter >= 0 && est_niter < (HOST_WIDE_INT) upper_bound)
1397     upper_bound = est_niter;
1398 
1399   if (upper_bound <= 1)
1400     return 1;
1401 
1402   /* Choose the factor so that we may prefetch each cache line just once,
1403      but bound the unrolling by UPPER_BOUND.  */
1404   factor = 1;
1405   for (agp = refs; agp; agp = agp->next)
1406     for (ref = agp->refs; ref; ref = ref->next)
1407       if (should_issue_prefetch_p (ref))
1408 	{
1409 	  mod_constraint = ref->prefetch_mod;
1410 	  nfactor = least_common_multiple (mod_constraint, factor);
1411 	  if (nfactor <= upper_bound)
1412 	    factor = nfactor;
1413 	}
1414 
1415   if (!should_unroll_loop_p (loop, desc, factor))
1416     return 1;
1417 
1418   return factor;
1419 }
1420 
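/* For instance, if the references that should be prefetched have
   prefetch_mod values of 4 and 16, FACTOR grows as lcm (1, 4) = 4 and then
   lcm (4, 16) = 16, provided 16 does not exceed UPPER_BOUND.  Unrolling 16
   times then allows each reference to be prefetched on exactly the
   iterations that need it, without redundant prefetches.  If the loop body
   is too large or the estimated trip count too small, UPPER_BOUND caps the
   factor instead.  */
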
1421 /* Returns the total volume of the memory references REFS, taking into account
1422    reuses in the innermost loop and cache line size.  TODO -- we should also
1423    take into account reuses across the iterations of the loops in the loop
1424    nest.  */
1425 
1426 static unsigned
1427 volume_of_references (struct mem_ref_group *refs)
1428 {
1429   unsigned volume = 0;
1430   struct mem_ref_group *gr;
1431   struct mem_ref *ref;
1432 
1433   for (gr = refs; gr; gr = gr->next)
1434     for (ref = gr->refs; ref; ref = ref->next)
1435       {
1436 	/* Almost always reuses another value?  */
1437 	if (ref->prefetch_before != PREFETCH_ALL)
1438 	  continue;
1439 
1440 	/* If several iterations access the same cache line, use the size of
1441 	   the line divided by this number.  Otherwise, a cache line is
1442 	   accessed in each iteration.  TODO -- in the latter case, we should
1443 	   take the size of the reference into account, rounding it up to a
1444 	   multiple of the cache line size.  */
1445 	volume += L1_CACHE_LINE_SIZE / ref->prefetch_mod;
1446       }
1447   return volume;
1448 }
1449 
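/* For example, with a 64-byte L1 cache line, a reference that is
   prefetched in every iteration (prefetch_mod 1) contributes 64 bytes per
   iteration to the volume, while one with prefetch_mod 16 (16 iterations
   per cache line) contributes only 64 / 16 = 4 bytes.  */
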
1450 /* Returns the volume of memory references accessed across VEC iterations of
1451    loops, whose sizes are described in the LOOP_SIZES array.  N is the number
1452    of the loops in the nest (length of VEC and LOOP_SIZES vectors).  */
1453 
1454 static unsigned
1455 volume_of_dist_vector (lambda_vector vec, unsigned *loop_sizes, unsigned n)
1456 {
1457   unsigned i;
1458 
1459   for (i = 0; i < n; i++)
1460     if (vec[i] != 0)
1461       break;
1462 
1463   if (i == n)
1464     return 0;
1465 
1466   gcc_assert (vec[i] > 0);
1467 
1468   /* We ignore the parts of the distance vector in subloops, since usually
1469      the numbers of iterations are much smaller.  */
1470   return loop_sizes[i] * vec[i];
1471 }
1472 
1473 /* Add the steps of ACCESS_FN multiplied by STRIDE to the array STRIDES
1474    at the position corresponding to the loop of the step.  N is the depth
1475    of the considered loop nest, and LOOP is its innermost loop.  */
1476 
1477 static void
1478 add_subscript_strides (tree access_fn, unsigned stride,
1479 		       HOST_WIDE_INT *strides, unsigned n, struct loop *loop)
1480 {
1481   struct loop *aloop;
1482   tree step;
1483   HOST_WIDE_INT astep;
1484   unsigned min_depth = loop_depth (loop) - n;
1485 
1486   while (TREE_CODE (access_fn) == POLYNOMIAL_CHREC)
1487     {
1488       aloop = get_chrec_loop (access_fn);
1489       step = CHREC_RIGHT (access_fn);
1490       access_fn = CHREC_LEFT (access_fn);
1491 
1492       if ((unsigned) loop_depth (aloop) <= min_depth)
1493 	continue;
1494 
1495       if (tree_fits_shwi_p (step))
1496 	astep = tree_to_shwi (step);
1497       else
1498 	astep = L1_CACHE_LINE_SIZE;
1499 
1500       strides[n - 1 - loop_depth (loop) + loop_depth (aloop)] += astep * stride;
1501 
1502     }
1503 }
1504 
1505 /* Returns the volume of memory references accessed between two consecutive
1506    self-reuses of the reference DR.  We consider the subscripts of DR in N
1507    loops, and LOOP_SIZES contains the volumes of accesses in each of the
1508    loops.  LOOP is the innermost loop of the current loop nest.  */
1509 
1510 static unsigned
1511 self_reuse_distance (data_reference_p dr, unsigned *loop_sizes, unsigned n,
1512 		     struct loop *loop)
1513 {
1514   tree stride, access_fn;
1515   HOST_WIDE_INT *strides, astride;
1516   vec<tree> access_fns;
1517   tree ref = DR_REF (dr);
1518   unsigned i, ret = ~0u;
1519 
1520   /* In the following example:
1521 
1522      for (i = 0; i < N; i++)
1523        for (j = 0; j < N; j++)
1524          use (a[j][i]);
1525      the same cache line is accessed each N steps (except if the change from
1526      i to i + 1 crosses the boundary of the cache line).  Thus, for self-reuse,
1527      we cannot rely purely on the results of the data dependence analysis.
1528 
1529      Instead, we compute the stride of the reference in each loop, and consider
1530      the innermost loop in which the stride is less than the cache size.  */
1531 
1532   strides = XCNEWVEC (HOST_WIDE_INT, n);
1533   access_fns = DR_ACCESS_FNS (dr);
1534 
1535   FOR_EACH_VEC_ELT (access_fns, i, access_fn)
1536     {
1537       /* Keep track of the reference corresponding to the subscript, so that we
1538 	 know its stride.  */
1539       while (handled_component_p (ref) && TREE_CODE (ref) != ARRAY_REF)
1540 	ref = TREE_OPERAND (ref, 0);
1541 
1542       if (TREE_CODE (ref) == ARRAY_REF)
1543 	{
1544 	  stride = TYPE_SIZE_UNIT (TREE_TYPE (ref));
1545 	  if (tree_fits_uhwi_p (stride))
1546 	    astride = tree_to_uhwi (stride);
1547 	  else
1548 	    astride = L1_CACHE_LINE_SIZE;
1549 
1550 	  ref = TREE_OPERAND (ref, 0);
1551 	}
1552       else
1553 	astride = 1;
1554 
1555       add_subscript_strides (access_fn, astride, strides, n, loop);
1556     }
1557 
1558   for (i = n; i-- > 0; )
1559     {
1560       unsigned HOST_WIDE_INT s;
1561 
1562       s = strides[i] < 0 ?  -strides[i] : strides[i];
1563 
1564       if (s < (unsigned) L1_CACHE_LINE_SIZE
1565 	  && (loop_sizes[i]
1566 	      > (unsigned) (L1_CACHE_SIZE_BYTES / NONTEMPORAL_FRACTION)))
1567 	{
1568 	  ret = loop_sizes[i];
1569 	  break;
1570 	}
1571     }
1572 
1573   free (strides);
1574   return ret;
1575 }
1576 
1577 /* Determines the distance till the first reuse of each reference in REFS
1578    in the loop nest of LOOP.  NO_OTHER_REFS is true if there are no other
1579    memory references in the loop.  Return false if the analysis fails.  */
1580 
1581 static bool
1582 determine_loop_nest_reuse (struct loop *loop, struct mem_ref_group *refs,
1583 			   bool no_other_refs)
1584 {
1585   struct loop *nest, *aloop;
1586   vec<data_reference_p> datarefs = vNULL;
1587   vec<ddr_p> dependences = vNULL;
1588   struct mem_ref_group *gr;
1589   struct mem_ref *ref, *refb;
1590   vec<loop_p> vloops = vNULL;
1591   unsigned *loop_data_size;
1592   unsigned i, j, n;
1593   unsigned volume, dist, adist;
1594   HOST_WIDE_INT vol;
1595   data_reference_p dr;
1596   ddr_p dep;
1597 
1598   if (loop->inner)
1599     return true;
1600 
1601   /* Find the outermost loop of the loop nest of LOOP (we require that
1602      there are no sibling loops inside the nest).  */
1603   nest = loop;
1604   while (1)
1605     {
1606       aloop = loop_outer (nest);
1607 
1608       if (aloop == current_loops->tree_root
1609 	  || aloop->inner->next)
1610 	break;
1611 
1612       nest = aloop;
1613     }
1614 
1615   /* For each loop, determine the amount of data accessed in each iteration.
1616      We use this to estimate whether the reference is evicted from the
1617      cache before its reuse.  */
1618   find_loop_nest (nest, &vloops);
1619   n = vloops.length ();
1620   loop_data_size = XNEWVEC (unsigned, n);
1621   volume = volume_of_references (refs);
1622   i = n;
1623   while (i-- != 0)
1624     {
1625       loop_data_size[i] = volume;
1626       /* Bound the volume by the L2 cache size, since above this bound,
1627 	 all dependence distances are equivalent.  */
1628       if (volume > L2_CACHE_SIZE_BYTES)
1629 	continue;
1630 
1631       aloop = vloops[i];
1632       vol = estimated_stmt_executions_int (aloop);
1633       if (vol == -1)
1634 	vol = expected_loop_iterations (aloop);
1635       volume *= vol;
1636     }
1637 
1638   /* Prepare the references in a form suitable for data dependence
1639      analysis.  We ignore unanalyzable data references (the results
1640      are used just as a heuristic to estimate temporality of the
1641      references, hence we do not need to worry about correctness).  */
1642   for (gr = refs; gr; gr = gr->next)
1643     for (ref = gr->refs; ref; ref = ref->next)
1644       {
1645 	dr = create_data_ref (nest, loop_containing_stmt (ref->stmt),
1646 			      ref->mem, ref->stmt, !ref->write_p);
1647 
1648 	if (dr)
1649 	  {
1650 	    ref->reuse_distance = volume;
1651 	    dr->aux = ref;
1652 	    datarefs.safe_push (dr);
1653 	  }
1654 	else
1655 	  no_other_refs = false;
1656       }
1657 
1658   FOR_EACH_VEC_ELT (datarefs, i, dr)
1659     {
1660       dist = self_reuse_distance (dr, loop_data_size, n, loop);
1661       ref = (struct mem_ref *) dr->aux;
1662       if (ref->reuse_distance > dist)
1663 	ref->reuse_distance = dist;
1664 
1665       if (no_other_refs)
1666 	ref->independent_p = true;
1667     }
1668 
1669   if (!compute_all_dependences (datarefs, &dependences, vloops, true))
1670     return false;
1671 
1672   FOR_EACH_VEC_ELT (dependences, i, dep)
1673     {
1674       if (DDR_ARE_DEPENDENT (dep) == chrec_known)
1675 	continue;
1676 
1677       ref = (struct mem_ref *) DDR_A (dep)->aux;
1678       refb = (struct mem_ref *) DDR_B (dep)->aux;
1679 
1680       if (DDR_ARE_DEPENDENT (dep) == chrec_dont_know
1681 	  || DDR_NUM_DIST_VECTS (dep) == 0)
1682 	{
1683 	  /* If the dependence cannot be analyzed, assume that there might be
1684 	     a reuse.  */
1685 	  dist = 0;
1686 
1687 	  ref->independent_p = false;
1688 	  refb->independent_p = false;
1689 	}
1690       else
1691 	{
1692 	  /* The distance vectors are normalized to be always lexicographically
1693 	     positive, hence we cannot tell just from them whether DDR_A comes
1694 	     before DDR_B or vice versa.  However, it is not important,
1695 	     anyway -- if DDR_A is close to DDR_B, then it is either reused in
1696 	     DDR_B (and it is not nontemporal), or it reuses the value of DDR_B
1697 	     in cache (and marking it as nontemporal would not affect
1698 	     anything).  */
1699 
1700 	  dist = volume;
1701 	  for (j = 0; j < DDR_NUM_DIST_VECTS (dep); j++)
1702 	    {
1703 	      adist = volume_of_dist_vector (DDR_DIST_VECT (dep, j),
1704 					     loop_data_size, n);
1705 
1706 	      /* If this is a dependence in the innermost loop (i.e., the
1707 		 distances in all superloops are zero) and it is not
1708 		 the trivial self-dependence with distance zero, record that
1709 		 the references are not completely independent.  */
1710 	      if (lambda_vector_zerop (DDR_DIST_VECT (dep, j), n - 1)
1711 		  && (ref != refb
1712 		      || DDR_DIST_VECT (dep, j)[n-1] != 0))
1713 		{
1714 		  ref->independent_p = false;
1715 		  refb->independent_p = false;
1716 		}
1717 
1718 	      /* Ignore accesses closer than
1719 		 L1_CACHE_SIZE_BYTES / NONTEMPORAL_FRACTION,
1720 		 so that we use nontemporal prefetches e.g. if a single memory
1721 		 location is accessed several times in a single iteration of
1722 		 the loop.  */
1723 	      if (adist < L1_CACHE_SIZE_BYTES / NONTEMPORAL_FRACTION)
1724 		continue;
1725 
1726 	      if (adist < dist)
1727 		dist = adist;
1728 	    }
1729 	}
1730 
1731       if (ref->reuse_distance > dist)
1732 	ref->reuse_distance = dist;
1733       if (refb->reuse_distance > dist)
1734 	refb->reuse_distance = dist;
1735     }
1736 
1737   free_dependence_relations (dependences);
1738   free_data_refs (datarefs);
1739   free (loop_data_size);
1740 
1741   if (dump_file && (dump_flags & TDF_DETAILS))
1742     {
1743       fprintf (dump_file, "Reuse distances:\n");
1744       for (gr = refs; gr; gr = gr->next)
1745 	for (ref = gr->refs; ref; ref = ref->next)
1746 	  fprintf (dump_file, " ref %p distance %u\n",
1747 		   (void *) ref, ref->reuse_distance);
1748     }
1749 
1750   return true;
1751 }
1752 
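/* Editor's note: an illustrative, self-contained sketch of how the
   dependence loop above turns distance vectors into a reuse-distance
   estimate; it is not part of the pass.  sketch_volume_of_dist () is a
   simplified stand-in for volume_of_dist_vector: it assumes the reuse
   distance is dominated by the outermost loop with a nonzero distance,
   scaled by that distance.  All names here are hypothetical.  */

static unsigned
sketch_volume_of_dist (const int *dist, const unsigned *loop_data_size,
                       unsigned n)
{
  unsigned i;

  for (i = 0; i < n; i++)
    if (dist[i] != 0)
      return loop_data_size[i] * (unsigned) (dist[i] < 0 ? -dist[i] : dist[i]);

  /* Distance zero in every loop: the accesses touch the same data in the
     same iteration.  */
  return 0;
}

static unsigned
sketch_reuse_distance (const int *dist_vects, unsigned ndists,
                       const unsigned *loop_data_size, unsigned n,
                       unsigned volume, unsigned nontemporal_cutoff)
{
  unsigned j, dist = volume;

  for (j = 0; j < ndists; j++)
    {
      unsigned adist
        = sketch_volume_of_dist (dist_vects + j * n, loop_data_size, n);

      /* Very close reuses are skipped, as in the loop above, so that a
         location reused within a single iteration can still be treated
         as nontemporal.  */
      if (adist < nontemporal_cutoff)
        continue;

      if (adist < dist)
        dist = adist;
    }

  return dist;
}
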
1753 /* Determine whether or not the trip count to ahead ratio is too small based
1754    on profitability considerations.
1755    AHEAD: the iteration ahead distance,
1756    EST_NITER: the estimated trip count.  */
1757 
1758 static bool
1759 trip_count_to_ahead_ratio_too_small_p (unsigned ahead, HOST_WIDE_INT est_niter)
1760 {
1761   /* Assume trip count to ahead ratio is big enough if the trip count could not
1762      be estimated at compile time.  */
1763   if (est_niter < 0)
1764     return false;
1765 
1766   if (est_niter < (HOST_WIDE_INT) (TRIP_COUNT_TO_AHEAD_RATIO * ahead))
1767     {
1768       if (dump_file && (dump_flags & TDF_DETAILS))
1769 	fprintf (dump_file,
1770 		 "Not prefetching -- loop estimated to roll only %d times\n",
1771 		 (int) est_niter);
1772       return true;
1773     }
1774 
1775   return false;
1776 }
1777 
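/* Editor's note: a worked example for the check above, not part of the
   pass, with illustrative numbers standing in for PREFETCH_LATENCY and
   TRIP_COUNT_TO_AHEAD_RATIO.  With a prefetch latency of 200 time units
   and a loop body costing 10 units, ahead = (200 + 10 - 1) / 10 = 20;
   assuming a trip-count-to-ahead ratio of 4, any loop expected to roll
   fewer than 4 * 20 = 80 times is rejected, since most of its prefetches
   would target iterations that never execute.  */

static int
sketch_too_few_iterations (long est_niter, unsigned latency,
                           unsigned body_time, unsigned ratio)
{
  unsigned ahead = (latency + body_time - 1) / body_time;

  /* Unknown trip count: assume it is large enough, as above.  */
  if (est_niter < 0)
    return 0;

  return est_niter < (long) (ratio * ahead);
}
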
1778 /* Determine whether or not the number of memory references in the loop is
1779    reasonable based on profitability and compilation time considerations.
1780    NINSNS: estimated number of instructions in the loop,
1781    MEM_REF_COUNT: total number of memory references in the loop.  */
1782 
1783 static bool
1784 mem_ref_count_reasonable_p (unsigned ninsns, unsigned mem_ref_count)
1785 {
1786   int insn_to_mem_ratio;
1787 
1788   if (mem_ref_count == 0)
1789     return false;
1790 
1791   /* Miss rate computation (is_miss_rate_acceptable) and dependence analysis
1792      (compute_all_dependences) both have costs that are quadratic in the
1793      number of memory references.  To avoid excessive compilation time, we
1794      give up on prefetching if mem_ref_count is too large.  */
1795   if (mem_ref_count > PREFETCH_MAX_MEM_REFS_PER_LOOP)
1796     return false;
1797 
1798   /* Prefetching improves performance by overlapping cache-missing
1799      memory accesses with CPU operations.  If the loop does not have
1800      enough CPU operations to overlap with memory operations, prefetching
1801      won't give a significant benefit.  One approximate way of checking
1802      this is to require the ratio of instructions to memory references to
1803      be above a certain limit.  This approximation works well in practice.
1804      TODO: Implement a more precise computation by estimating the time
1805      for each CPU or memory op in the loop. Time estimates for memory ops
1806      should account for cache misses.  */
1807   insn_to_mem_ratio = ninsns / mem_ref_count;
1808 
1809   if (insn_to_mem_ratio < PREFETCH_MIN_INSN_TO_MEM_RATIO)
1810     {
1811       if (dump_file && (dump_flags & TDF_DETAILS))
1812         fprintf (dump_file,
1813 		 "Not prefetching -- instruction to memory reference ratio (%d) too small\n",
1814 		 insn_to_mem_ratio);
1815       return false;
1816     }
1817 
1818   return true;
1819 }
1820 
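/* Editor's note: a self-contained sketch of the two filters above, not
   part of the pass.  The cut-offs are illustrative assumptions standing in
   for PREFETCH_MAX_MEM_REFS_PER_LOOP and PREFETCH_MIN_INSN_TO_MEM_RATIO.
   For example, with a minimum ratio of 3, a loop with 40 insns and 8
   memory references (ratio 5) is kept, while one with 10 insns and 8
   references (ratio 1) is rejected: it has too little computation to
   overlap with the memory accesses.  */

static int
sketch_mem_ref_count_ok (unsigned ninsns, unsigned mem_ref_count,
                         unsigned max_refs, unsigned min_ratio)
{
  /* No memory references, or so many that the quadratic analyses above
     would dominate compilation time.  */
  if (mem_ref_count == 0 || mem_ref_count > max_refs)
    return 0;

  return ninsns / mem_ref_count >= min_ratio;
}
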
1821 /* Determine whether or not the instruction to prefetch ratio in the loop is
1822    too small based on profitability considerations.
1823    NINSNS: estimated number of instructions in the loop,
1824    PREFETCH_COUNT: an estimate of the number of prefetches,
1825    UNROLL_FACTOR:  the factor to unroll the loop if prefetching.  */
1826 
1827 static bool
1828 insn_to_prefetch_ratio_too_small_p (unsigned ninsns, unsigned prefetch_count,
1829                                      unsigned unroll_factor)
1830 {
1831   int insn_to_prefetch_ratio;
1832 
1833   /* Prefetching most likely causes performance degradation when the instruction
1834      to prefetch ratio is too small.  Too many prefetch instructions in a loop
1835      may reduce the I-cache performance.
1836      (unroll_factor * ninsns) is used to estimate the number of instructions in
1837      the unrolled loop.  This implementation is a bit simplistic -- the number
1838      of issued prefetch instructions is also affected by unrolling.  So,
1839      prefetch_mod and the unroll factor should be taken into account when
1840      determining prefetch_count.  Also, the number of insns of the unrolled
1841      loop will usually be significantly smaller than the number of insns of the
1842      original loop * unroll_factor (at least the induction variable increases
1843      and the exit branches will get eliminated), so it might be better to use
1844      tree_estimate_loop_size + estimated_unrolled_size.  */
1845   insn_to_prefetch_ratio = (unroll_factor * ninsns) / prefetch_count;
1846   if (insn_to_prefetch_ratio < MIN_INSN_TO_PREFETCH_RATIO)
1847     {
1848       if (dump_file && (dump_flags & TDF_DETAILS))
1849         fprintf (dump_file,
1850 		 "Not prefetching -- instruction to prefetch ratio (%d) too small\n",
1851 		 insn_to_prefetch_ratio);
1852       return true;
1853     }
1854 
1855   return false;
1856 }
1857 
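/* Editor's note: a worked example for the ratio above, not part of the
   pass.  With an unroll factor of 4, 20 insns in the original body and an
   estimated 10 prefetches, the ratio is (4 * 20) / 10 = 8; assuming a
   minimum insn-to-prefetch ratio of 9 (the pass takes the real value from
   MIN_INSN_TO_PREFETCH_RATIO), such a loop would be rejected as too
   prefetch-heavy.  */

static int
sketch_too_prefetch_heavy (unsigned ninsns, unsigned prefetch_count,
                           unsigned unroll_factor, unsigned min_ratio)
{
  /* The caller bails out earlier when there is nothing to prefetch.  */
  if (prefetch_count == 0)
    return 0;

  return (unroll_factor * ninsns) / prefetch_count < min_ratio;
}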
1858 
1859 /* Issue prefetch instructions for array references in LOOP.  Returns
1860    true if the LOOP was unrolled.  */
1861 
1862 static bool
1863 loop_prefetch_arrays (struct loop *loop)
1864 {
1865   struct mem_ref_group *refs;
1866   unsigned ahead, ninsns, time, unroll_factor;
1867   HOST_WIDE_INT est_niter;
1868   struct tree_niter_desc desc;
1869   bool unrolled = false, no_other_refs;
1870   unsigned prefetch_count;
1871   unsigned mem_ref_count;
1872 
1873   if (optimize_loop_nest_for_size_p (loop))
1874     {
1875       if (dump_file && (dump_flags & TDF_DETAILS))
1876 	fprintf (dump_file, "  ignored (cold area)\n");
1877       return false;
1878     }
1879 
1880   /* FIXME: the time should be weighted by the probabilities of the blocks in
1881      the loop body.  */
1882   time = tree_num_loop_insns (loop, &eni_time_weights);
1883   if (time == 0)
1884     return false;
1885 
1886   ahead = (PREFETCH_LATENCY + time - 1) / time;
1887   est_niter = estimated_stmt_executions_int (loop);
1888   if (est_niter == -1)
1889     est_niter = max_stmt_executions_int (loop);
1890 
1891   /* Prefetching is not likely to be profitable if the trip count to ahead
1892      ratio is too small.  */
1893   if (trip_count_to_ahead_ratio_too_small_p (ahead, est_niter))
1894     return false;
1895 
1896   ninsns = tree_num_loop_insns (loop, &eni_size_weights);
1897 
1898   /* Step 1: gather the memory references.  */
1899   refs = gather_memory_references (loop, &no_other_refs, &mem_ref_count);
1900 
1901   /* Give up prefetching if the number of memory references in the
1902      loop is not reasonable based on profitability and compilation time
1903      considerations.  */
1904   if (!mem_ref_count_reasonable_p (ninsns, mem_ref_count))
1905     goto fail;
1906 
1907   /* Step 2: estimate the reuse effects.  */
1908   prune_by_reuse (refs);
1909 
1910   if (nothing_to_prefetch_p (refs))
1911     goto fail;
1912 
1913   if (!determine_loop_nest_reuse (loop, refs, no_other_refs))
1914     goto fail;
1915 
1916   /* Step 3: determine unroll factor.  */
1917   unroll_factor = determine_unroll_factor (loop, refs, ninsns, &desc,
1918 					   est_niter);
1919 
1920   /* Estimate prefetch count for the unrolled loop.  */
1921   prefetch_count = estimate_prefetch_count (refs, unroll_factor);
1922   if (prefetch_count == 0)
1923     goto fail;
1924 
1925   if (dump_file && (dump_flags & TDF_DETAILS))
1926     fprintf (dump_file, "Ahead %d, unroll factor %d, trip count "
1927 	     HOST_WIDE_INT_PRINT_DEC "\n"
1928 	     "insn count %d, mem ref count %d, prefetch count %d\n",
1929 	     ahead, unroll_factor, est_niter,
1930 	     ninsns, mem_ref_count, prefetch_count);
1931 
1932   /* Prefetching is not likely to be profitable if the instruction to prefetch
1933      ratio is too small.  */
1934   if (insn_to_prefetch_ratio_too_small_p (ninsns, prefetch_count,
1935 					  unroll_factor))
1936     goto fail;
1937 
1938   mark_nontemporal_stores (loop, refs);
1939 
1940   /* Step 4: what to prefetch?  */
1941   if (!schedule_prefetches (refs, unroll_factor, ahead))
1942     goto fail;
1943 
1944   /* Step 5: unroll the loop.  TODO -- peel the first and last few
1945      iterations so that we do not issue superfluous prefetches.  */
1946   if (unroll_factor != 1)
1947     {
1948       tree_unroll_loop (loop, unroll_factor,
1949 			single_dom_exit (loop), &desc);
1950       unrolled = true;
1951     }
1952 
1953   /* Step 6: issue the prefetches.  */
1954   issue_prefetches (refs, unroll_factor, ahead);
1955 
1956 fail:
1957   release_mem_refs (refs);
1958   return unrolled;
1959 }
1960 
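/* Editor's note: a conceptual before/after sketch of what the steps above
   produce for a simple reduction loop; it is illustrative only.  The
   unroll factor (4) and the prefetch distance (64 elements ahead) are
   assumptions -- the real values come from determine_unroll_factor and
   from AHEAD as scheduled by schedule_prefetches.  */

static long
sketch_before (const long *a, unsigned n)
{
  long sum = 0;
  unsigned i;

  for (i = 0; i < n; i++)
    sum += a[i];
  return sum;
}

static long
sketch_after (const long *a, unsigned n)
{
  long sum = 0;
  unsigned i;

  /* Main loop, unrolled by 4, with one prefetch per unrolled body.  The
     prefetches issued near the end of the array are superfluous but
     harmless at run time, which is what the peeling TODO in step 5
     refers to.  */
  for (i = 0; i + 4 <= n; i += 4)
    {
      __builtin_prefetch (&a[i + 64]);
      sum += a[i] + a[i + 1] + a[i + 2] + a[i + 3];
    }

  /* Epilogue for the remaining iterations, as created by tree_unroll_loop.  */
  for (; i < n; i++)
    sum += a[i];
  return sum;
}
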
1961 /* Issue prefetch instructions for array references in loops.  */
1962 
1963 unsigned int
1964 tree_ssa_prefetch_arrays (void)
1965 {
1966   struct loop *loop;
1967   bool unrolled = false;
1968   int todo_flags = 0;
1969 
1970   if (!HAVE_prefetch
1971       /* It is possible to ask the compiler for, say, -mtune=i486
1972 	 -march=pentium4.  -mtune=i486 causes PREFETCH_BLOCK to be 0, since
1973 	 this is part of the processor costs and i486 does not have prefetch,
1974 	 but -march=pentium4 causes HAVE_prefetch to be true.  Ugh.  */
1975       || PREFETCH_BLOCK == 0)
1976     return 0;
1977 
1978   if (dump_file && (dump_flags & TDF_DETAILS))
1979     {
1980       fprintf (dump_file, "Prefetching parameters:\n");
1981       fprintf (dump_file, "    simultaneous prefetches: %d\n",
1982 	       SIMULTANEOUS_PREFETCHES);
1983       fprintf (dump_file, "    prefetch latency: %d\n", PREFETCH_LATENCY);
1984       fprintf (dump_file, "    prefetch block size: %d\n", PREFETCH_BLOCK);
1985       fprintf (dump_file, "    L1 cache size: %d lines, %d kB\n",
1986 	       L1_CACHE_SIZE_BYTES / L1_CACHE_LINE_SIZE, L1_CACHE_SIZE);
1987       fprintf (dump_file, "    L1 cache line size: %d\n", L1_CACHE_LINE_SIZE);
1988       fprintf (dump_file, "    L2 cache size: %d kB\n", L2_CACHE_SIZE);
1989       fprintf (dump_file, "    min insn-to-prefetch ratio: %d \n",
1990 	       MIN_INSN_TO_PREFETCH_RATIO);
1991       fprintf (dump_file, "    min insn-to-mem ratio: %d \n",
1992 	       PREFETCH_MIN_INSN_TO_MEM_RATIO);
1993       fprintf (dump_file, "\n");
1994     }
1995 
1996   initialize_original_copy_tables ();
1997 
1998   if (!builtin_decl_explicit_p (BUILT_IN_PREFETCH))
1999     {
2000       tree type = build_function_type_list (void_type_node,
2001 					    const_ptr_type_node, NULL_TREE);
2002       tree decl = add_builtin_function ("__builtin_prefetch", type,
2003 					BUILT_IN_PREFETCH, BUILT_IN_NORMAL,
2004 					NULL, NULL_TREE);
2005       DECL_IS_NOVOPS (decl) = true;
2006       set_builtin_decl (BUILT_IN_PREFETCH, decl, false);
2007     }
2008 
2009   FOR_EACH_LOOP (loop, LI_FROM_INNERMOST)
2010     {
2011       if (dump_file && (dump_flags & TDF_DETAILS))
2012 	fprintf (dump_file, "Processing loop %d:\n", loop->num);
2013 
2014       unrolled |= loop_prefetch_arrays (loop);
2015 
2016       if (dump_file && (dump_flags & TDF_DETAILS))
2017 	fprintf (dump_file, "\n\n");
2018     }
2019 
2020   if (unrolled)
2021     {
2022       scev_reset ();
2023       todo_flags |= TODO_cleanup_cfg;
2024     }
2025 
2026   free_original_copy_tables ();
2027   return todo_flags;
2028 }
2029 
2030 /* Prefetching.  */
2031 
2032 namespace {
2033 
2034 const pass_data pass_data_loop_prefetch =
2035 {
2036   GIMPLE_PASS, /* type */
2037   "aprefetch", /* name */
2038   OPTGROUP_LOOP, /* optinfo_flags */
2039   TV_TREE_PREFETCH, /* tv_id */
2040   ( PROP_cfg | PROP_ssa ), /* properties_required */
2041   0, /* properties_provided */
2042   0, /* properties_destroyed */
2043   0, /* todo_flags_start */
2044   0, /* todo_flags_finish */
2045 };
2046 
2047 class pass_loop_prefetch : public gimple_opt_pass
2048 {
2049 public:
2050   pass_loop_prefetch (gcc::context *ctxt)
2051     : gimple_opt_pass (pass_data_loop_prefetch, ctxt)
2052   {}
2053 
2054   /* opt_pass methods: */
2055   virtual bool gate (function *) { return flag_prefetch_loop_arrays > 0; }
2056   virtual unsigned int execute (function *);
2057 
2058 }; // class pass_loop_prefetch
2059 
2060 unsigned int
2061 pass_loop_prefetch::execute (function *fun)
2062 {
2063   if (number_of_loops (fun) <= 1)
2064     return 0;
2065 
2066   if ((PREFETCH_BLOCK & (PREFETCH_BLOCK - 1)) != 0)
2067     {
2068       static bool warned = false;
2069 
2070       if (!warned)
2071 	{
2072 	  warning (OPT_Wdisabled_optimization,
2073 		   "%<l1-cache-size%> parameter is not a power of two %d",
2074 		   PREFETCH_BLOCK);
2075 	  warned = true;
2076 	}
2077       return 0;
2078     }
2079 
2080   return tree_ssa_prefetch_arrays ();
2081 }
2082 
2083 } // anon namespace
2084 
2085 gimple_opt_pass *
2086 make_pass_loop_prefetch (gcc::context *ctxt)
2087 {
2088   return new pass_loop_prefetch (ctxt);
2089 }
2090 
2091 
2092