/* Array prefetching.
   Copyright (C) 2005 Free Software Foundation, Inc.

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it
under the terms of the GNU General Public License as published by the
Free Software Foundation; either version 2, or (at your option) any
later version.

GCC is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING.  If not, write to the Free
Software Foundation, 59 Temple Place - Suite 330, Boston, MA
02111-1307, USA.  */

#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "tm.h"
#include "tree.h"
#include "rtl.h"
#include "tm_p.h"
#include "hard-reg-set.h"
#include "basic-block.h"
#include "output.h"
#include "diagnostic.h"
#include "tree-flow.h"
#include "tree-dump.h"
#include "timevar.h"
#include "cfgloop.h"
#include "varray.h"
#include "expr.h"
#include "tree-pass.h"
#include "ggc.h"
#include "insn-config.h"
#include "recog.h"
#include "hashtab.h"
#include "tree-chrec.h"
#include "tree-scalar-evolution.h"
#include "toplev.h"
#include "params.h"
#include "langhooks.h"

/* This pass inserts prefetch instructions to optimize cache usage during
   accesses to arrays in loops.  It processes loops sequentially and:

   1) Gathers all memory references in the single loop.
   2) For each of the references it decides when it is profitable to prefetch
      it.  To do it, we evaluate the reuse among the accesses, and determine
      two values: PREFETCH_BEFORE (meaning that it only makes sense to do
      prefetching in the first PREFETCH_BEFORE iterations of the loop) and
      PREFETCH_MOD (meaning that it only makes sense to prefetch in the
      iterations of the loop that are zero modulo PREFETCH_MOD).  For example
      (assuming cache line size is 64 bytes, char has size 1 byte and there
      is no hardware sequential prefetch):

      char *a;
      for (i = 0; i < max; i++)
	{
	  a[255] = ...;		(0)
	  a[i] = ...;		(1)
	  a[i + 64] = ...;	(2)
	  a[16*i] = ...;	(3)
	  a[187*i] = ...;	(4)
	  a[187*i + 50] = ...;	(5)
	}

       (0) obviously has PREFETCH_BEFORE 1
       (1) has PREFETCH_BEFORE 64, since (2) accesses the same memory
	   location 64 iterations before it, and PREFETCH_MOD 64 (since
	   it hits the same cache line otherwise).
       (2) has PREFETCH_MOD 64
       (3) has PREFETCH_MOD 4
       (4) has PREFETCH_MOD 1.  We do not set PREFETCH_BEFORE here, since
	   the cache line accessed by (4) is the same with probability only
	   7/32.
       (5) has PREFETCH_MOD 1 as well.

   3) We determine how far ahead we need to prefetch.  The number of
      iterations needed is time to fetch / time spent in one iteration of
      the loop.  The problem is that we do not know either of these values,
      so we just make a heuristic guess based on a magic (possibly)
      target-specific constant and the size of the loop.

   4) Determine which of the references we prefetch.  We take into account
      that there is a maximum number of simultaneous prefetches (provided
      by the machine description).  We issue as many prefetches as possible
      while still within this bound (starting with those with the lowest
      prefetch_mod, since they are responsible for most of the cache
      misses).

   5) We unroll and peel loops so that we are able to satisfy PREFETCH_MOD
      and PREFETCH_BEFORE requirements (within some bounds), and to avoid
      prefetching nonaccessed memory.
      TODO -- actually implement peeling.

   6) We actually emit the prefetch instructions.  ??? Perhaps emit the
      prefetch instructions with guards in cases where 5) was not sufficient
      to satisfy the constraints?

   Some other TODO:
      -- write and use more general reuse analysis (that could also be used
	 in other cache-aimed loop optimizations)
      -- make it behave sanely together with the prefetches given by the user
	 (now we just ignore them; at the very least we should avoid
	 optimizing loops in which the user put his own prefetches)
      -- we assume cache line size alignment of arrays; this could be
	 improved.  */

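/* Illustration only (not part of the pass): a rough sketch of the step 3)
   heuristic, using the default magic constants defined below.  With
   PREFETCH_LATENCY 200 and a hypothetical loop body of 50 insns, the pass
   would aim to prefetch data roughly

     ahead = (200 + 50 - 1) / 50 = 4

   iterations in advance; see the actual computation in
   loop_prefetch_arrays.  */
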
/* Magic constants follow.  These should be replaced by machine-specific
   numbers.  */

/* A number that should roughly correspond to the number of instructions
   executed before the prefetch is completed.  */

#ifndef PREFETCH_LATENCY
#define PREFETCH_LATENCY 200
#endif

/* Number of prefetches that can run at the same time.  */

#ifndef SIMULTANEOUS_PREFETCHES
#define SIMULTANEOUS_PREFETCHES 3
#endif

/* True if a write can be prefetched by a read prefetch.  */

#ifndef WRITE_CAN_USE_READ_PREFETCH
#define WRITE_CAN_USE_READ_PREFETCH 1
#endif

/* True if a read can be prefetched by a write prefetch.  */

#ifndef READ_CAN_USE_WRITE_PREFETCH
#define READ_CAN_USE_WRITE_PREFETCH 0
#endif

/* Cache line size.  Assumed to be a power of two.  */

#ifndef PREFETCH_BLOCK
#define PREFETCH_BLOCK 32
#endif

/* Do we have forward hardware sequential prefetching?  */

#ifndef HAVE_FORWARD_PREFETCH
#define HAVE_FORWARD_PREFETCH 0
#endif

/* Do we have backward hardware sequential prefetching?  */

#ifndef HAVE_BACKWARD_PREFETCH
#define HAVE_BACKWARD_PREFETCH 0
#endif

/* In some cases we are only able to determine that there is a certain
   probability that the two accesses hit the same cache line.  In this
   case, we issue the prefetches for both of them if this probability
   is less than (1000 - ACCEPTABLE_MISS_RATE) per mille.  */

#ifndef ACCEPTABLE_MISS_RATE
#define ACCEPTABLE_MISS_RATE 50
#endif

#ifndef HAVE_prefetch
#define HAVE_prefetch 0
#endif

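/* Illustration only: a target would normally override the defaults above
   from its machine description headers, for example (hypothetical values)

     #define PREFETCH_BLOCK 64
     #define SIMULTANEOUS_PREFETCHES 8

   so the #ifndef guards above merely supply fallbacks for targets that do
   not define their own numbers.  */
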
/* A group of references between which reuse may occur.  */

struct mem_ref_group
{
  tree base;			/* Base of the reference.  */
  HOST_WIDE_INT step;		/* Step of the reference.  */
  struct mem_ref *refs;		/* References in the group.  */
  struct mem_ref_group *next;	/* Next group of references.  */
};

/* Assigned to PREFETCH_BEFORE when all iterations are to be prefetched.  */

#define PREFETCH_ALL		(~(unsigned HOST_WIDE_INT) 0)

/* The memory reference.  */

struct mem_ref
{
  tree stmt;			/* Statement in which the reference appears.  */
  tree mem;			/* The reference.  */
  HOST_WIDE_INT delta;		/* Constant offset of the reference.  */
  bool write_p;			/* Is it a write?  */
  struct mem_ref_group *group;	/* The group of references it belongs to.  */
  unsigned HOST_WIDE_INT prefetch_mod;
				/* Prefetch only each PREFETCH_MOD-th
				   iteration.  */
  unsigned HOST_WIDE_INT prefetch_before;
				/* Prefetch only the first PREFETCH_BEFORE
				   iterations.  */
  bool issue_prefetch_p;	/* Should we really issue the prefetch?  */
  struct mem_ref *next;		/* The next reference in the group.  */
};

/* Dumps information about reference REF to FILE.  */

static void
dump_mem_ref (FILE *file, struct mem_ref *ref)
{
  fprintf (file, "Reference %p:\n", (void *) ref);

  fprintf (file, "  group %p (base ", (void *) ref->group);
  print_generic_expr (file, ref->group->base, TDF_SLIM);
  fprintf (file, ", step ");
  fprintf (file, HOST_WIDE_INT_PRINT_DEC, ref->group->step);
  fprintf (file, ")\n");

  fprintf (file, "  delta ");
  fprintf (file, HOST_WIDE_INT_PRINT_DEC, ref->delta);
  fprintf (file, "\n");

  fprintf (file, "  %s\n", ref->write_p ? "write" : "read");

  fprintf (file, "\n");
}

/* Finds a group with BASE and STEP in GROUPS, or creates one if it does not
   exist.  */

static struct mem_ref_group *
find_or_create_group (struct mem_ref_group **groups, tree base,
		      HOST_WIDE_INT step)
{
  struct mem_ref_group *group;

  for (; *groups; groups = &(*groups)->next)
    {
      if ((*groups)->step == step
	  && operand_equal_p ((*groups)->base, base, 0))
	return *groups;

      /* Keep the list of groups sorted by decreasing step.  */
      if ((*groups)->step < step)
	break;
    }

  group = xcalloc (1, sizeof (struct mem_ref_group));
  group->base = base;
  group->step = step;
  group->refs = NULL;
  group->next = *groups;
  *groups = group;

  return group;
}

/* Records a memory reference MEM in GROUP with offset DELTA and write status
   WRITE_P.  The reference occurs in statement STMT.  */

static void
record_ref (struct mem_ref_group *group, tree stmt, tree mem,
	    HOST_WIDE_INT delta, bool write_p)
{
  struct mem_ref **aref;

  /* Do not record the same address twice.  */
  for (aref = &group->refs; *aref; aref = &(*aref)->next)
    {
      /* It may not be possible for a write reference to reuse the read
	 prefetch, or vice versa.  */
      if (!WRITE_CAN_USE_READ_PREFETCH
	  && write_p
	  && !(*aref)->write_p)
	continue;
      if (!READ_CAN_USE_WRITE_PREFETCH
	  && !write_p
	  && (*aref)->write_p)
	continue;

      if ((*aref)->delta == delta)
	return;
    }

  (*aref) = xcalloc (1, sizeof (struct mem_ref));
  (*aref)->stmt = stmt;
  (*aref)->mem = mem;
  (*aref)->delta = delta;
  (*aref)->write_p = write_p;
  (*aref)->prefetch_before = PREFETCH_ALL;
  (*aref)->prefetch_mod = 1;
  (*aref)->issue_prefetch_p = false;
  (*aref)->group = group;
  (*aref)->next = NULL;

  if (dump_file && (dump_flags & TDF_DETAILS))
    dump_mem_ref (dump_file, *aref);
}

/* Release memory references in GROUPS.  */

static void
release_mem_refs (struct mem_ref_group *groups)
{
  struct mem_ref_group *next_g;
  struct mem_ref *ref, *next_r;

  for (; groups; groups = next_g)
    {
      next_g = groups->next;
      for (ref = groups->refs; ref; ref = next_r)
	{
	  next_r = ref->next;
	  free (ref);
	}
      free (groups);
    }
}

/* A structure used to pass arguments to idx_analyze_ref.  */

struct ar_data
{
  struct loop *loop;			/* Loop of the reference.  */
  tree stmt;				/* Statement of the reference.  */
  HOST_WIDE_INT *step;			/* Step of the memory reference.  */
  HOST_WIDE_INT *delta;			/* Offset of the memory reference.  */
};

/* Analyzes a single INDEX of a memory reference to obtain the information
   described at analyze_ref.  Callback for for_each_index.  */

static bool
idx_analyze_ref (tree base, tree *index, void *data)
{
  struct ar_data *ar_data = data;
  tree ibase, step, stepsize;
  HOST_WIDE_INT istep, idelta = 0, imult = 1;
  affine_iv iv;

  if (TREE_CODE (base) == MISALIGNED_INDIRECT_REF
      || TREE_CODE (base) == ALIGN_INDIRECT_REF)
    return false;

  if (!simple_iv (ar_data->loop, ar_data->stmt, *index, &iv, false))
    return false;
  ibase = iv.base;
  step = iv.step;

  if (zero_p (step))
    istep = 0;
  else
    {
      if (!cst_and_fits_in_hwi (step))
	return false;
      istep = int_cst_value (step);
    }

  if (TREE_CODE (ibase) == PLUS_EXPR
      && cst_and_fits_in_hwi (TREE_OPERAND (ibase, 1)))
    {
      idelta = int_cst_value (TREE_OPERAND (ibase, 1));
      ibase = TREE_OPERAND (ibase, 0);
    }
  if (cst_and_fits_in_hwi (ibase))
    {
      idelta += int_cst_value (ibase);
      ibase = build_int_cst (TREE_TYPE (ibase), 0);
    }

  if (TREE_CODE (base) == ARRAY_REF)
    {
      stepsize = array_ref_element_size (base);
      if (!cst_and_fits_in_hwi (stepsize))
	return false;
      imult = int_cst_value (stepsize);

      istep *= imult;
      idelta *= imult;
    }

  *ar_data->step += istep;
  *ar_data->delta += idelta;
  *index = ibase;

  return true;
}

/* Tries to express REF_P in the shape &BASE + STEP * iter + DELTA, where
   DELTA and STEP are integer constants and iter is the number of iterations
   of LOOP.  The reference occurs in statement STMT.  Strips nonaddressable
   component references from REF_P.  */

static bool
analyze_ref (struct loop *loop, tree *ref_p, tree *base,
	     HOST_WIDE_INT *step, HOST_WIDE_INT *delta,
	     tree stmt)
{
  struct ar_data ar_data;
  tree off;
  HOST_WIDE_INT bit_offset;
  tree ref = *ref_p;

  *step = 0;
  *delta = 0;

  /* First strip off the component references.  Ignore bitfields.  */
  if (TREE_CODE (ref) == COMPONENT_REF
      && DECL_NONADDRESSABLE_P (TREE_OPERAND (ref, 1)))
    ref = TREE_OPERAND (ref, 0);

  *ref_p = ref;

  for (; TREE_CODE (ref) == COMPONENT_REF; ref = TREE_OPERAND (ref, 0))
    {
      off = DECL_FIELD_BIT_OFFSET (TREE_OPERAND (ref, 1));
      bit_offset = TREE_INT_CST_LOW (off);
      gcc_assert (bit_offset % BITS_PER_UNIT == 0);

      *delta += bit_offset / BITS_PER_UNIT;
    }

  *base = unshare_expr (ref);
  ar_data.loop = loop;
  ar_data.stmt = stmt;
  ar_data.step = step;
  ar_data.delta = delta;
  return for_each_index (base, idx_analyze_ref, &ar_data);
}

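/* Illustration only (hypothetical input, not part of the pass): for a
   reference a[i + 3] with 4-byte array elements, where i is an induction
   variable starting at 0 and incremented by 1 each iteration,
   analyze_ref/idx_analyze_ref would roughly produce

     BASE = a, STEP = 4, DELTA = 12,

   i.e. the address &a + 4 * iter + 12 in the &BASE + STEP * iter + DELTA
   form described above.  */
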
/* Record a memory reference REF to the list REFS.  The reference occurs in
   LOOP in statement STMT and it is a write if WRITE_P.  */

static void
gather_memory_references_ref (struct loop *loop, struct mem_ref_group **refs,
			      tree ref, bool write_p, tree stmt)
{
  tree base;
  HOST_WIDE_INT step, delta;
  struct mem_ref_group *agrp;

  if (!analyze_ref (loop, &ref, &base, &step, &delta, stmt))
    return;

  /* Now we know that REF = &BASE + STEP * iter + DELTA, where DELTA and STEP
     are integer constants.  */
  agrp = find_or_create_group (refs, base, step);
  record_ref (agrp, stmt, ref, delta, write_p);
}

/* Record the suitable memory references in LOOP.  */

static struct mem_ref_group *
gather_memory_references (struct loop *loop)
{
  basic_block *body = get_loop_body_in_dom_order (loop);
  basic_block bb;
  unsigned i;
  block_stmt_iterator bsi;
  tree stmt, lhs, rhs;
  struct mem_ref_group *refs = NULL;

  /* Scan the loop body in order, so that the earlier references precede
     the later ones.  */
  for (i = 0; i < loop->num_nodes; i++)
    {
      bb = body[i];
      if (bb->loop_father != loop)
	continue;

      for (bsi = bsi_start (bb); !bsi_end_p (bsi); bsi_next (&bsi))
	{
	  stmt = bsi_stmt (bsi);
	  if (TREE_CODE (stmt) != MODIFY_EXPR)
	    continue;

	  lhs = TREE_OPERAND (stmt, 0);
	  rhs = TREE_OPERAND (stmt, 1);

	  if (REFERENCE_CLASS_P (rhs))
	    gather_memory_references_ref (loop, &refs, rhs, false, stmt);
	  if (REFERENCE_CLASS_P (lhs))
	    gather_memory_references_ref (loop, &refs, lhs, true, stmt);
	}
    }
  free (body);

  return refs;
}

/* Prune the prefetch candidate REF using the self-reuse.  */

static void
prune_ref_by_self_reuse (struct mem_ref *ref)
{
  HOST_WIDE_INT step = ref->group->step;
  bool backward = step < 0;

  if (step == 0)
    {
      /* Prefetch references to invariant address just once.  */
      ref->prefetch_before = 1;
      return;
    }

  if (backward)
    step = -step;

  if (step > PREFETCH_BLOCK)
    return;

  if ((backward && HAVE_BACKWARD_PREFETCH)
      || (!backward && HAVE_FORWARD_PREFETCH))
    {
      ref->prefetch_before = 1;
      return;
    }

  ref->prefetch_mod = PREFETCH_BLOCK / step;
}

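/* Illustration only: with the default PREFETCH_BLOCK of 32 and a reference
   whose group steps by 4 bytes per iteration (and no hardware sequential
   prefetch), prune_ref_by_self_reuse sets prefetch_mod = 32 / 4 = 8, so a
   prefetch is useful only in every 8th iteration -- the other iterations
   hit a cache line that was already fetched.  */
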
/* Divides X by BY, rounding down.  */

static HOST_WIDE_INT
ddown (HOST_WIDE_INT x, unsigned HOST_WIDE_INT by)
{
  gcc_assert (by > 0);

  if (x >= 0)
    return x / by;
  else
    return (x + by - 1) / by;
}

/* Prune the prefetch candidate REF using the reuse with BY.
   If BY_IS_BEFORE is true, BY is before REF in the loop.  */

static void
prune_ref_by_group_reuse (struct mem_ref *ref, struct mem_ref *by,
			  bool by_is_before)
{
  HOST_WIDE_INT step = ref->group->step;
  bool backward = step < 0;
  HOST_WIDE_INT delta_r = ref->delta, delta_b = by->delta;
  HOST_WIDE_INT delta = delta_b - delta_r;
  HOST_WIDE_INT hit_from;
  unsigned HOST_WIDE_INT prefetch_before, prefetch_block;

  if (delta == 0)
    {
      /* If the references have the same address, only prefetch the
	 former.  */
      if (by_is_before)
	ref->prefetch_before = 0;

      return;
    }

  if (!step)
    {
      /* If the reference addresses are invariant and fall into the
	 same cache line, prefetch just the first one.  */
      if (!by_is_before)
	return;

      if (ddown (ref->delta, PREFETCH_BLOCK)
	  != ddown (by->delta, PREFETCH_BLOCK))
	return;

      ref->prefetch_before = 0;
      return;
    }

  /* Only prune the reference that is behind in the array.  */
  if (backward)
    {
      if (delta > 0)
	return;

      /* Transform the data so that we may assume that the accesses
	 are forward.  */
      delta = - delta;
      step = -step;
      delta_r = PREFETCH_BLOCK - 1 - delta_r;
      delta_b = PREFETCH_BLOCK - 1 - delta_b;
    }
  else
    {
      if (delta < 0)
	return;
    }

  /* Check whether the two references are likely to hit the same cache
     line, and how distant the iterations in which that occurs are from
     each other.  */

  if (step <= PREFETCH_BLOCK)
    {
      /* The accesses are sure to meet.  Let us check when.  */
      hit_from = ddown (delta_b, PREFETCH_BLOCK) * PREFETCH_BLOCK;
      prefetch_before = (hit_from - delta_r + step - 1) / step;

      if (prefetch_before < ref->prefetch_before)
	ref->prefetch_before = prefetch_before;

      return;
    }

  /* A more complicated case.  First let us ensure that the size of the cache
     line and the step are coprime (here we assume that PREFETCH_BLOCK is a
     power of two).  */
  prefetch_block = PREFETCH_BLOCK;
  while ((step & 1) == 0
	 && prefetch_block > 1)
    {
      step >>= 1;
      prefetch_block >>= 1;
      delta >>= 1;
    }

  /* Now step > prefetch_block, and step and prefetch_block are coprime.
     Determine the probability that the accesses hit the same cache line.  */

  prefetch_before = delta / step;
  delta %= step;
  if ((unsigned HOST_WIDE_INT) delta
      <= (prefetch_block * ACCEPTABLE_MISS_RATE / 1000))
    {
      if (prefetch_before < ref->prefetch_before)
	ref->prefetch_before = prefetch_before;

      return;
    }

  /* Try also the following iteration.  */
  prefetch_before++;
  delta = step - delta;
  if ((unsigned HOST_WIDE_INT) delta
      <= (prefetch_block * ACCEPTABLE_MISS_RATE / 1000))
    {
      if (prefetch_before < ref->prefetch_before)
	ref->prefetch_before = prefetch_before;

      return;
    }

  /* REF probably does not reuse BY.  */
  return;
}

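/* Illustration only, reusing example (4)/(5) from the comment at the top of
   the file (step 187, 64-byte cache lines, delta 50): 187 is odd, so nothing
   is divided out; prefetch_before = 50 / 187 = 0 and the remaining delta 50
   exceeds the threshold 64 * ACCEPTABLE_MISS_RATE / 1000 = 3, as does
   187 - 50 = 137 for the following iteration.  Hence no PREFETCH_BEFORE is
   recorded -- the two references are too unlikely to share a cache line,
   matching the 7/32 probability remark above.  */
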
/* Prune the prefetch candidate REF using the reuses with other references
   in REFS.  */

static void
prune_ref_by_reuse (struct mem_ref *ref, struct mem_ref *refs)
{
  struct mem_ref *prune_by;
  bool before = true;

  prune_ref_by_self_reuse (ref);

  for (prune_by = refs; prune_by; prune_by = prune_by->next)
    {
      if (prune_by == ref)
	{
	  before = false;
	  continue;
	}

      if (!WRITE_CAN_USE_READ_PREFETCH
	  && ref->write_p
	  && !prune_by->write_p)
	continue;
      if (!READ_CAN_USE_WRITE_PREFETCH
	  && !ref->write_p
	  && prune_by->write_p)
	continue;

      prune_ref_by_group_reuse (ref, prune_by, before);
    }
}

/* Prune the prefetch candidates in GROUP using the reuse analysis.  */

static void
prune_group_by_reuse (struct mem_ref_group *group)
{
  struct mem_ref *ref_pruned;

  for (ref_pruned = group->refs; ref_pruned; ref_pruned = ref_pruned->next)
    {
      prune_ref_by_reuse (ref_pruned, group->refs);

      if (dump_file && (dump_flags & TDF_DETAILS))
	{
	  fprintf (dump_file, "Reference %p:", (void *) ref_pruned);

	  if (ref_pruned->prefetch_before == PREFETCH_ALL
	      && ref_pruned->prefetch_mod == 1)
	    fprintf (dump_file, " no restrictions");
	  else if (ref_pruned->prefetch_before == 0)
	    fprintf (dump_file, " do not prefetch");
	  else if (ref_pruned->prefetch_before <= ref_pruned->prefetch_mod)
	    fprintf (dump_file, " prefetch once");
	  else
	    {
	      if (ref_pruned->prefetch_before != PREFETCH_ALL)
		{
		  fprintf (dump_file, " prefetch before ");
		  fprintf (dump_file, HOST_WIDE_INT_PRINT_DEC,
			   ref_pruned->prefetch_before);
		}
	      if (ref_pruned->prefetch_mod != 1)
		{
		  fprintf (dump_file, " prefetch mod ");
		  fprintf (dump_file, HOST_WIDE_INT_PRINT_DEC,
			   ref_pruned->prefetch_mod);
		}
	    }
	  fprintf (dump_file, "\n");
	}
    }
}

/* Prune the list of prefetch candidates GROUPS using the reuse analysis.  */

static void
prune_by_reuse (struct mem_ref_group *groups)
{
  for (; groups; groups = groups->next)
    prune_group_by_reuse (groups);
}

/* Returns true if we should issue a prefetch for REF.  */

static bool
should_issue_prefetch_p (struct mem_ref *ref)
{
  /* For now do not issue prefetches that are useful only in the first few
     iterations.  */
  if (ref->prefetch_before != PREFETCH_ALL)
    return false;

  return true;
}

/* Decide which of the prefetch candidates in GROUPS to prefetch.
   AHEAD is the number of iterations to prefetch ahead (which corresponds
   to the number of simultaneous instances of one prefetch running at a
   time).  UNROLL_FACTOR is the factor by which the loop is going to be
   unrolled.  Returns true if there is anything to prefetch.  */

static bool
schedule_prefetches (struct mem_ref_group *groups, unsigned unroll_factor,
		     unsigned ahead)
{
  unsigned max_prefetches, n_prefetches;
  struct mem_ref *ref;
  bool any = false;

  max_prefetches = (SIMULTANEOUS_PREFETCHES * unroll_factor) / ahead;
  if (max_prefetches > (unsigned) SIMULTANEOUS_PREFETCHES)
    max_prefetches = SIMULTANEOUS_PREFETCHES;

  if (dump_file && (dump_flags & TDF_DETAILS))
    fprintf (dump_file, "Max prefetches to issue: %d.\n", max_prefetches);

  if (!max_prefetches)
    return false;

  /* For now we just take memory references one by one and issue
     prefetches for as many as possible.  The groups are sorted
     starting with the largest step, since the references with
     large step are more likely to cause many cache misses.  */

  for (; groups; groups = groups->next)
    for (ref = groups->refs; ref; ref = ref->next)
      {
	if (!should_issue_prefetch_p (ref))
	  continue;

	ref->issue_prefetch_p = true;

	/* If prefetch_mod is less than unroll_factor, we need to insert
	   several prefetches for the reference.  */
	n_prefetches = ((unroll_factor + ref->prefetch_mod - 1)
			/ ref->prefetch_mod);
	if (max_prefetches <= n_prefetches)
	  return true;

	max_prefetches -= n_prefetches;
	any = true;
      }

  return any;
}

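/* Illustration only (hypothetical numbers): with SIMULTANEOUS_PREFETCHES 3,
   unroll_factor 4 and ahead 2, schedule_prefetches starts from
   max_prefetches = 3 * 4 / 2 = 6, capped at 3.  A reference with
   prefetch_mod 1 then needs (4 + 1 - 1) / 1 = 4 prefetches in the unrolled
   body, which already exhausts the budget, so scheduling stops after
   marking that single reference.  */
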
/* Determine whether there is any reference suitable for prefetching
   in GROUPS.  */

static bool
anything_to_prefetch_p (struct mem_ref_group *groups)
{
  struct mem_ref *ref;

  for (; groups; groups = groups->next)
    for (ref = groups->refs; ref; ref = ref->next)
      if (should_issue_prefetch_p (ref))
	return true;

  return false;
}

/* Issue prefetches for the reference REF into the loop as decided before.
   AHEAD is the number of iterations to prefetch ahead.  UNROLL_FACTOR
   is the factor by which LOOP was unrolled.  */

static void
issue_prefetch_ref (struct mem_ref *ref, unsigned unroll_factor, unsigned ahead)
{
  HOST_WIDE_INT delta;
  tree addr, addr_base, prefetch, params, write_p;
  block_stmt_iterator bsi;
  unsigned n_prefetches, ap;

  if (dump_file && (dump_flags & TDF_DETAILS))
    fprintf (dump_file, "Issued prefetch for %p.\n", (void *) ref);

  bsi = bsi_for_stmt (ref->stmt);

  n_prefetches = ((unroll_factor + ref->prefetch_mod - 1)
		  / ref->prefetch_mod);
  addr_base = build_fold_addr_expr_with_type (ref->mem, ptr_type_node);
  addr_base = force_gimple_operand_bsi (&bsi, unshare_expr (addr_base), true, NULL);

  for (ap = 0; ap < n_prefetches; ap++)
    {
      /* Determine the address to prefetch.  */
      delta = (ahead + ap * ref->prefetch_mod) * ref->group->step;
      addr = fold_build2 (PLUS_EXPR, ptr_type_node,
			  addr_base, build_int_cst (ptr_type_node, delta));
      addr = force_gimple_operand_bsi (&bsi, unshare_expr (addr), true, NULL);

      /* Create the prefetch instruction.  */
      write_p = ref->write_p ? integer_one_node : integer_zero_node;
      params = tree_cons (NULL_TREE, addr,
			  tree_cons (NULL_TREE, write_p, NULL_TREE));

      prefetch = build_function_call_expr (built_in_decls[BUILT_IN_PREFETCH],
					   params);
      bsi_insert_before (&bsi, prefetch, BSI_SAME_STMT);
    }
}

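/* Illustration only (hypothetical reference): for a group with step 16,
   ahead 4, unroll_factor 4 and prefetch_mod 2, issue_prefetch_ref emits
   n_prefetches = (4 + 2 - 1) / 2 = 2 calls before the statement, roughly

     __builtin_prefetch (&MEM + (4 + 0 * 2) * 16, write_p);   i.e. +64
     __builtin_prefetch (&MEM + (4 + 1 * 2) * 16, write_p);   i.e. +96

   so that the lines the unrolled body will touch AHEAD iterations later
   are requested early.  */
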
/* Issue prefetches for the references in GROUPS into the loop as decided
   before.  AHEAD is the number of iterations to prefetch ahead.
   UNROLL_FACTOR is the factor by which LOOP was unrolled.  */

static void
issue_prefetches (struct mem_ref_group *groups,
		  unsigned unroll_factor, unsigned ahead)
{
  struct mem_ref *ref;

  for (; groups; groups = groups->next)
    for (ref = groups->refs; ref; ref = ref->next)
      if (ref->issue_prefetch_p)
	issue_prefetch_ref (ref, unroll_factor, ahead);
}

/* Determines whether we can profitably unroll LOOP FACTOR times, and if
   this is the case, fills in DESC with the description of the number of
   iterations.  */

static bool
should_unroll_loop_p (struct loop *loop, struct tree_niter_desc *desc,
		      unsigned factor)
{
  if (!can_unroll_loop_p (loop, factor, desc))
    return false;

  /* We only consider loops without control flow for unrolling.  This is not
     a hard restriction -- tree_unroll_loop works with arbitrary loops
     as well; but the unrolling/prefetching is usually more profitable for
     loops consisting of a single basic block, and we want to limit the
     code growth.  */
  if (loop->num_nodes > 2)
    return false;

  return true;
}

/* Determine the factor by which to unroll LOOP, from the information
   contained in the list of memory references REFS.  The description of the
   number of iterations of LOOP is stored to DESC.  AHEAD is the number
   of iterations ahead that we need to prefetch.  NINSNS is the number of
   insns of the LOOP.  */

static unsigned
determine_unroll_factor (struct loop *loop, struct mem_ref_group *refs,
			 unsigned ahead, unsigned ninsns,
			 struct tree_niter_desc *desc)
{
  unsigned upper_bound, size_factor, constraint_factor;
  unsigned factor, max_mod_constraint, ahead_factor;
  struct mem_ref_group *agp;
  struct mem_ref *ref;

  upper_bound = PARAM_VALUE (PARAM_MAX_UNROLL_TIMES);

  /* First check whether the loop is not too large to unroll.  */
  size_factor = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / ninsns;
  if (size_factor <= 1)
    return 1;

  if (size_factor < upper_bound)
    upper_bound = size_factor;

  max_mod_constraint = 1;
  for (agp = refs; agp; agp = agp->next)
    for (ref = agp->refs; ref; ref = ref->next)
      if (should_issue_prefetch_p (ref)
	  && ref->prefetch_mod > max_mod_constraint)
	max_mod_constraint = ref->prefetch_mod;

  /* Set constraint_factor as large as needed to be able to satisfy the
     largest modulo constraint.  */
  constraint_factor = max_mod_constraint;

  /* If ahead is too large in comparison with the number of available
     prefetches, unroll the loop as much as needed to be able to prefetch
     at least partially some of the references in the loop.  */
  ahead_factor = ((ahead + SIMULTANEOUS_PREFETCHES - 1)
		  / SIMULTANEOUS_PREFETCHES);

  /* Unroll as much as useful, but bound the code size growth.  */
  if (constraint_factor < ahead_factor)
    factor = ahead_factor;
  else
    factor = constraint_factor;
  if (factor > upper_bound)
    factor = upper_bound;

  if (!should_unroll_loop_p (loop, desc, factor))
    return 1;

  return factor;
}

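/* Illustration only (assumed parameter values, not necessarily the real
   defaults): with ninsns 20 and PARAM_MAX_UNROLLED_INSNS taken to be 200,
   size_factor is 10; if PARAM_MAX_UNROLL_TIMES is taken to be 8,
   upper_bound stays 8.  If the largest prefetch_mod among the candidate
   references is 4 and ahead is 7 with SIMULTANEOUS_PREFETCHES 3, then
   constraint_factor = 4 and ahead_factor = (7 + 3 - 1) / 3 = 3, so
   determine_unroll_factor picks factor = 4 (subject to
   should_unroll_loop_p).  */
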
/* Issue prefetch instructions for array references in LOOP.  Returns
   true if the LOOP was unrolled.  LOOPS is the array containing all
   loops.  */

static bool
loop_prefetch_arrays (struct loops *loops, struct loop *loop)
{
  struct mem_ref_group *refs;
  unsigned ahead, ninsns, unroll_factor;
  struct tree_niter_desc desc;
  bool unrolled = false;

  /* Step 1: gather the memory references.  */
  refs = gather_memory_references (loop);

  /* Step 2: estimate the reuse effects.  */
  prune_by_reuse (refs);

  if (!anything_to_prefetch_p (refs))
    goto fail;

  /* Step 3: determine the ahead and unroll factor.  */

  /* FIXME: We should use not the size of the loop, but the average number of
     instructions executed per iteration of the loop.  */
  ninsns = tree_num_loop_insns (loop);
  ahead = (PREFETCH_LATENCY + ninsns - 1) / ninsns;
  unroll_factor = determine_unroll_factor (loop, refs, ahead, ninsns,
					   &desc);
  if (dump_file && (dump_flags & TDF_DETAILS))
    fprintf (dump_file, "Ahead %d, unroll factor %d\n", ahead, unroll_factor);

  /* If the loop rolls less than the required unroll factor, prefetching
     is useless.  */
  if (unroll_factor > 1
      && cst_and_fits_in_hwi (desc.niter)
      && (unsigned HOST_WIDE_INT) int_cst_value (desc.niter) < unroll_factor)
    goto fail;

  /* Step 4: what to prefetch?  */
  if (!schedule_prefetches (refs, unroll_factor, ahead))
    goto fail;

  /* Step 5: unroll the loop.  TODO -- peeling of the first and last few
     iterations so that we do not issue superfluous prefetches.  */
  if (unroll_factor != 1)
    {
      tree_unroll_loop (loops, loop, unroll_factor,
			single_dom_exit (loop), &desc);
      unrolled = true;
    }

  /* Step 6: issue the prefetches.  */
  issue_prefetches (refs, unroll_factor, ahead);

fail:
  release_mem_refs (refs);
  return unrolled;
}

/* Issue prefetch instructions for array references in LOOPS.  */

unsigned int
tree_ssa_prefetch_arrays (struct loops *loops)
{
  unsigned i;
  struct loop *loop;
  bool unrolled = false;
  int todo_flags = 0;

  if (!HAVE_prefetch
      /* It is possible to ask the compiler for, say, -mtune=i486
	 -march=pentium4.  -mtune=i486 causes us to have PREFETCH_BLOCK 0,
	 since this is part of the processor costs and i486 does not have
	 prefetch, but -march=pentium4 causes HAVE_prefetch to be true.
	 Ugh.  */
      || PREFETCH_BLOCK == 0)
    return 0;

  initialize_original_copy_tables ();

  if (!built_in_decls[BUILT_IN_PREFETCH])
    {
      tree type = build_function_type (void_type_node,
				       tree_cons (NULL_TREE,
						  const_ptr_type_node,
						  NULL_TREE));
      tree decl = lang_hooks.builtin_function ("__builtin_prefetch", type,
			BUILT_IN_PREFETCH, BUILT_IN_NORMAL,
			NULL, NULL_TREE);
      DECL_IS_NOVOPS (decl) = true;
      built_in_decls[BUILT_IN_PREFETCH] = decl;
    }

  /* We assume that the size of a cache line is a power of two, so verify
     this here.  */
  gcc_assert ((PREFETCH_BLOCK & (PREFETCH_BLOCK - 1)) == 0);

  for (i = loops->num - 1; i > 0; i--)
    {
      loop = loops->parray[i];
      if (!loop)
	continue;

      if (dump_file && (dump_flags & TDF_DETAILS))
	fprintf (dump_file, "Processing loop %d:\n", loop->num);

      unrolled |= loop_prefetch_arrays (loops, loop);

      if (dump_file && (dump_flags & TDF_DETAILS))
	fprintf (dump_file, "\n\n");
    }

  if (unrolled)
    {
      scev_reset ();
      todo_flags |= TODO_cleanup_cfg;
    }

  free_original_copy_tables ();
  return todo_flags;
}