xref: /netbsd-src/external/gpl3/gcc.old/dist/gcc/tree-vect-data-refs.c (revision cef8759bd76c1b621f8eab8faa6f208faabc2e15)
1 /* Data References Analysis and Manipulation Utilities for Vectorization.
2    Copyright (C) 2003-2017 Free Software Foundation, Inc.
3    Contributed by Dorit Naishlos <dorit@il.ibm.com>
4    and Ira Rosen <irar@il.ibm.com>
5 
6 This file is part of GCC.
7 
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
12 
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
16 for more details.
17 
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3.  If not see
20 <http://www.gnu.org/licenses/>.  */
21 
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "predict.h"
31 #include "memmodel.h"
32 #include "tm_p.h"
33 #include "ssa.h"
34 #include "optabs-tree.h"
35 #include "cgraph.h"
36 #include "dumpfile.h"
37 #include "alias.h"
38 #include "fold-const.h"
39 #include "stor-layout.h"
40 #include "tree-eh.h"
41 #include "gimplify.h"
42 #include "gimple-iterator.h"
43 #include "gimplify-me.h"
44 #include "tree-ssa-loop-ivopts.h"
45 #include "tree-ssa-loop-manip.h"
46 #include "tree-ssa-loop.h"
47 #include "cfgloop.h"
48 #include "tree-scalar-evolution.h"
49 #include "tree-vectorizer.h"
50 #include "expr.h"
51 #include "builtins.h"
52 #include "params.h"
53 #include "internal-fn.h"
54 
55 /* Return true if load- or store-lanes optab OPTAB is implemented for
56    COUNT vectors of type VECTYPE.  NAME is the name of OPTAB.  */
57 
58 static bool
59 vect_lanes_optab_supported_p (const char *name, convert_optab optab,
60 			      tree vectype, unsigned HOST_WIDE_INT count)
61 {
62   machine_mode mode, array_mode;
63   bool limit_p;
64 
65   mode = TYPE_MODE (vectype);
66   limit_p = !targetm.array_mode_supported_p (mode, count);
67   array_mode = mode_for_size (count * GET_MODE_BITSIZE (mode),
68 			      MODE_INT, limit_p);
69 
70   if (array_mode == BLKmode)
71     {
72       if (dump_enabled_p ())
73 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
74                          "no array mode for %s[" HOST_WIDE_INT_PRINT_DEC "]\n",
75                          GET_MODE_NAME (mode), count);
76       return false;
77     }
78 
79   if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
80     {
81       if (dump_enabled_p ())
82 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
83                          "cannot use %s<%s><%s>\n", name,
84                          GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
85       return false;
86     }
87 
88   if (dump_enabled_p ())
89     dump_printf_loc (MSG_NOTE, vect_location,
90                      "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode),
91                      GET_MODE_NAME (mode));
92 
93   return true;
94 }
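
/* For example, with a 128-bit V4SI vector type and COUNT == 2,
   vect_lanes_optab_supported_p looks for a 256-bit integer array mode and
   for a vec_load_lanes (resp. vec_store_lanes) pattern converting between
   that array mode and V4SImode.  If either is missing it returns false and
   the caller typically falls back to ordinary vector loads or stores
   combined with permutes.  */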
95 
96 
97 /* Return the smallest scalar part of STMT.
98    This is used to determine the vectype of the stmt.  We generally set the
99    vectype according to the type of the result (lhs).  For stmts whose
100    result-type is different than the type of the arguments (e.g., demotion,
101    promotion), vectype will be reset appropriately (later).  Note that we have
102    to visit the smallest datatype in this function, because that determines the
103    VF.  If the smallest datatype in the loop is present only as the rhs of a
104    promotion operation - we'd miss it.
105    Such a case, where a variable of this datatype does not appear in the lhs
106    anywhere in the loop, can only occur if it's an invariant: e.g.:
107    'int_x = (int) short_inv', which we'd expect to have been optimized away by
108    invariant motion.  However, we cannot rely on invariant motion to always
109    take invariants out of the loop, and so in the case of promotion we also
110    have to check the rhs.
111    LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
112    types.  */
113 
114 tree
115 vect_get_smallest_scalar_type (gimple *stmt, HOST_WIDE_INT *lhs_size_unit,
116                                HOST_WIDE_INT *rhs_size_unit)
117 {
118   tree scalar_type = gimple_expr_type (stmt);
119   HOST_WIDE_INT lhs, rhs;
120 
121   lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
122 
123   if (is_gimple_assign (stmt)
124       && (gimple_assign_cast_p (stmt)
125           || gimple_assign_rhs_code (stmt) == WIDEN_MULT_EXPR
126           || gimple_assign_rhs_code (stmt) == WIDEN_LSHIFT_EXPR
127           || gimple_assign_rhs_code (stmt) == FLOAT_EXPR))
128     {
129       tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
130 
131       rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
132       if (rhs < lhs)
133         scalar_type = rhs_type;
134     }
135   else if (gcall *call = dyn_cast <gcall *> (stmt))
136     {
137       unsigned int i = 0;
138       if (gimple_call_internal_p (call))
139 	{
140 	  internal_fn ifn = gimple_call_internal_fn (call);
141 	  if (internal_load_fn_p (ifn) || internal_store_fn_p (ifn))
142 	    /* gimple_expr_type already picked the type of the loaded
143 	       or stored data.  */
144 	    i = ~0U;
145 	  else if (internal_fn_mask_index (ifn) == 0)
146 	    i = 1;
147 	}
148       if (i < gimple_call_num_args (call))
149 	{
150 	  tree rhs_type = TREE_TYPE (gimple_call_arg (call, i));
151 	  if (tree_fits_uhwi_p (TYPE_SIZE_UNIT (rhs_type)))
152 	    {
153 	      rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
154 	      if (rhs < lhs)
155 		scalar_type = rhs_type;
156 	    }
157 	}
158     }
159 
160   *lhs_size_unit = lhs;
161   *rhs_size_unit = rhs;
162   return scalar_type;
163 }
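
/* For example, for the promotion in

     short *s;  int *d;
     for (i = 0; i < n; i++)
       d[i] = (int) s[i];

   the statement result has type 'int' but the smallest scalar type involved
   is 'short', so vect_get_smallest_scalar_type returns 'short' with
   *LHS_SIZE_UNIT == 4 and *RHS_SIZE_UNIT == 2, and the vectorization factor
   is computed from the 2-byte type.  */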
164 
165 
166 /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
167    tested at run-time.  Return TRUE if DDR was successfully inserted.
168    Return false if versioning is not supported.  */
169 
170 static bool
171 vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
172 {
173   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
174 
175   if ((unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS) == 0)
176     return false;
177 
178   if (dump_enabled_p ())
179     {
180       dump_printf_loc (MSG_NOTE, vect_location,
181                        "mark for run-time aliasing test between ");
182       dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_A (ddr)));
183       dump_printf (MSG_NOTE,  " and ");
184       dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_B (ddr)));
185       dump_printf (MSG_NOTE, "\n");
186     }
187 
188   if (optimize_loop_nest_for_size_p (loop))
189     {
190       if (dump_enabled_p ())
191 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
192                          "versioning not supported when optimizing"
193                          " for size.\n");
194       return false;
195     }
196 
197   /* FORNOW: We don't support versioning with outer-loop vectorization.  */
198   if (loop->inner)
199     {
200       if (dump_enabled_p ())
201 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
202                          "versioning not yet supported for outer-loops.\n");
203       return false;
204     }
205 
206   /* FORNOW: We don't support creating runtime alias tests for non-constant
207      step.  */
208   if (TREE_CODE (DR_STEP (DDR_A (ddr))) != INTEGER_CST
209       || TREE_CODE (DR_STEP (DDR_B (ddr))) != INTEGER_CST)
210     {
211       if (dump_enabled_p ())
212 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
213                          "versioning not yet supported for non-constant "
214                          "step\n");
215       return false;
216     }
217 
218   LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
219   return true;
220 }
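
/* The DDRs recorded above are later turned into a run-time alias check that
   guards the vectorized loop version, conceptually of the form

     if (&a[0] + segment_len_a <= &b[0] || &b[0] + segment_len_b <= &a[0])
       ... vectorized loop ...
     else
       ... scalar loop ...

   where the segment lengths are derived from DR_STEP and the iteration
   count; a compile-time-unknown step therefore cannot be handled yet.  */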
221 
222 
223 /* Return true if we know that the order of vectorized STMT_A and
224    vectorized STMT_B will be the same as the order of STMT_A and STMT_B.
225    At least one of the statements is a write.  */
226 
227 static bool
228 vect_preserves_scalar_order_p (gimple *stmt_a, gimple *stmt_b)
229 {
230   stmt_vec_info stmtinfo_a = vinfo_for_stmt (stmt_a);
231   stmt_vec_info stmtinfo_b = vinfo_for_stmt (stmt_b);
232 
233   /* Single statements are always kept in their original order.  */
234   if (!STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
235       && !STMT_VINFO_GROUPED_ACCESS (stmtinfo_b))
236     return true;
237 
238   /* STMT_A and STMT_B belong to overlapping groups.  All loads in a
239      group are emitted at the position of the last scalar load and all
240      stores in a group are emitted at the position of the last scalar store.
241      Compute that position and check whether the resulting order matches
242      the current one.  */
243   gimple *last_a = GROUP_FIRST_ELEMENT (stmtinfo_a);
244   if (last_a)
245     for (gimple *s = GROUP_NEXT_ELEMENT (vinfo_for_stmt (last_a)); s;
246 	 s = GROUP_NEXT_ELEMENT (vinfo_for_stmt (s)))
247       last_a = get_later_stmt (last_a, s);
248   else
249     last_a = stmt_a;
250   gimple *last_b = GROUP_FIRST_ELEMENT (stmtinfo_b);
251   if (last_b)
252     for (gimple *s = GROUP_NEXT_ELEMENT (vinfo_for_stmt (last_b)); s;
253 	 s = GROUP_NEXT_ELEMENT (vinfo_for_stmt (s)))
254       last_b = get_later_stmt (last_b, s);
255   else
256     last_b = stmt_b;
257   return ((get_later_stmt (last_a, last_b) == last_a)
258 	  == (get_later_stmt (stmt_a, stmt_b) == stmt_a));
259 }
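
/* For example, given the grouped stores and the intervening load

     a[i]   = x;
     ...    = a[i];
     a[i+1] = y;

   the vectorized store of { a[i], a[i+1] } is emitted at the position of
   the last scalar store, i.e. after the load, while in the scalar code the
   load reads the value just stored to a[i].  vect_preserves_scalar_order_p
   detects that the relative order of the load and the store changes and
   returns false.  */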
260 
261 
262 /* Function vect_analyze_data_ref_dependence.
263 
264    Return TRUE if there (might) exist a dependence between a memory-reference
265    DRA and a memory-reference DRB.  When the dependence can instead be
266    checked at run time by versioning for alias, return FALSE.  Adjust
267    *MAX_VF according to the data dependence.  */
268 
269 static bool
270 vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
271                                   loop_vec_info loop_vinfo, int *max_vf)
272 {
273   unsigned int i;
274   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
275   struct data_reference *dra = DDR_A (ddr);
276   struct data_reference *drb = DDR_B (ddr);
277   stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
278   stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
279   lambda_vector dist_v;
280   unsigned int loop_depth;
281 
282   /* In loop analysis all data references should be vectorizable.  */
283   if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
284       || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
285     gcc_unreachable ();
286 
287   /* Independent data accesses.  */
288   if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
289     return false;
290 
291   if (dra == drb
292       || (DR_IS_READ (dra) && DR_IS_READ (drb)))
293     return false;
294 
295   /* We do not have to consider dependences between accesses that belong
296      to the same group.  */
297   if (GROUP_FIRST_ELEMENT (stmtinfo_a)
298       && GROUP_FIRST_ELEMENT (stmtinfo_a) == GROUP_FIRST_ELEMENT (stmtinfo_b))
299     return false;
300 
301   /* Even if we have an anti-dependence then, as the vectorized loop covers at
302      least two scalar iterations, there is always also a true dependence.
303      As the vectorizer does not re-order loads and stores we can ignore
304      the anti-dependence if TBAA can disambiguate both DRs similar to the
305      case with known negative distance anti-dependences (positive
306      distance anti-dependences would violate TBAA constraints).  */
307   if (((DR_IS_READ (dra) && DR_IS_WRITE (drb))
308        || (DR_IS_WRITE (dra) && DR_IS_READ (drb)))
309       && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)),
310 				 get_alias_set (DR_REF (drb))))
311     return false;
312 
313   /* Unknown data dependence.  */
314   if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
315     {
316       /* If user asserted safelen consecutive iterations can be
317 	 executed concurrently, assume independence.  */
318       if (loop->safelen >= 2)
319 	{
320 	  if (loop->safelen < *max_vf)
321 	    *max_vf = loop->safelen;
322 	  LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
323 	  return false;
324 	}
325 
326       if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
327 	  || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
328 	{
329 	  if (dump_enabled_p ())
330 	    {
331 	      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
332 			       "versioning for alias not supported for: "
333 			       "can't determine dependence between ");
334 	      dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
335 				 DR_REF (dra));
336 	      dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
337 	      dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
338 				 DR_REF (drb));
339 	      dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
340 	    }
341 	  return true;
342 	}
343 
344       if (dump_enabled_p ())
345 	{
346 	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
347 			   "versioning for alias required: "
348 			   "can't determine dependence between ");
349 	  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
350 			     DR_REF (dra));
351 	  dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
352 	  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
353 			     DR_REF (drb));
354 	  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
355 	}
356 
357       /* Add to list of ddrs that need to be tested at run-time.  */
358       return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
359     }
360 
361   /* Known data dependence.  */
362   if (DDR_NUM_DIST_VECTS (ddr) == 0)
363     {
364       /* If user asserted safelen consecutive iterations can be
365 	 executed concurrently, assume independence.  */
366       if (loop->safelen >= 2)
367 	{
368 	  if (loop->safelen < *max_vf)
369 	    *max_vf = loop->safelen;
370 	  LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
371 	  return false;
372 	}
373 
374       if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
375 	  || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
376 	{
377 	  if (dump_enabled_p ())
378 	    {
379 	      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
380 			       "versioning for alias not supported for: "
381 			       "bad dist vector for ");
382 	      dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
383 				 DR_REF (dra));
384 	      dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
385 	      dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
386 				 DR_REF (drb));
387 	      dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
388 	    }
389 	  return true;
390 	}
391 
392       if (dump_enabled_p ())
393         {
394           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
395                            "versioning for alias required: "
396                            "bad dist vector for ");
397           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
398           dump_printf (MSG_MISSED_OPTIMIZATION,  " and ");
399           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
400           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
401         }
402       /* Add to list of ddrs that need to be tested at run-time.  */
403       return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
404     }
405 
406   loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
407   FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
408     {
409       int dist = dist_v[loop_depth];
410 
411       if (dump_enabled_p ())
412 	dump_printf_loc (MSG_NOTE, vect_location,
413                          "dependence distance  = %d.\n", dist);
414 
415       if (dist == 0)
416 	{
417 	  if (dump_enabled_p ())
418 	    {
419 	      dump_printf_loc (MSG_NOTE, vect_location,
420 	                       "dependence distance == 0 between ");
421 	      dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
422 	      dump_printf (MSG_NOTE, " and ");
423 	      dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
424 	      dump_printf (MSG_NOTE, "\n");
425 	    }
426 
427 	  /* When we perform grouped accesses and perform implicit CSE
428 	     by detecting equal accesses and doing disambiguation with
429 	     runtime alias tests like for
430 	        .. = a[i];
431 		.. = a[i+1];
432 		a[i] = ..;
433 		a[i+1] = ..;
434 		*p = ..;
435 		.. = a[i];
436 		.. = a[i+1];
437 	     where we will end up loading { a[i], a[i+1] } once, make
438 	     sure that inserting group loads before the first load and
439 	     stores after the last store will do the right thing.
440 	     Similar for groups like
441 	        a[i] = ...;
442 		... = a[i];
443 		a[i+1] = ...;
444 	     where loads from the group interleave with the store.  */
445 	  if (!vect_preserves_scalar_order_p (DR_STMT (dra), DR_STMT (drb)))
446 	    {
447 	      if (dump_enabled_p ())
448 		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
449 				 "READ_WRITE dependence in interleaving."
450 				 "\n");
451 	      return true;
452 	    }
453 
454 	  unsigned int step_prec = TYPE_PRECISION (TREE_TYPE (DR_STEP (dra)));
455 	  if (loop->safelen < 2
456 	      && !expr_not_equal_to (DR_STEP (dra), wi::zero (step_prec)))
457 	    {
458 	      if (dump_enabled_p ())
459 		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
460 				 "step could be zero.\n");
461 	      return true;
462 	    }
463 
464 	  continue;
465 	}
466 
467       if (dist > 0 && DDR_REVERSED_P (ddr))
468 	{
469 	  /* If DDR_REVERSED_P the order of the data-refs in DDR was
470 	     reversed (to make distance vector positive), and the actual
471 	     distance is negative.  */
472 	  if (dump_enabled_p ())
473 	    dump_printf_loc (MSG_NOTE, vect_location,
474 	                     "dependence distance negative.\n");
475 	  /* When doing outer loop vectorization, we need to check if there is
476 	     a backward dependence at the inner loop level if the dependence
477 	     at the outer loop is reversed.  See PR81740.  */
478 	  if (nested_in_vect_loop_p (loop, DR_STMT (dra))
479 	      || nested_in_vect_loop_p (loop, DR_STMT (drb)))
480 	    {
481 	      unsigned inner_depth = index_in_loop_nest (loop->inner->num,
482 							 DDR_LOOP_NEST (ddr));
483 	      if (dist_v[inner_depth] < 0)
484 		return true;
485 	    }
486 	  /* Record a negative dependence distance to later limit the
487 	     amount of stmt copying / unrolling we can perform.
488 	     Only need to handle read-after-write dependence.  */
489 	  if (DR_IS_READ (drb)
490 	      && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0
491 		  || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist))
492 	    STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist;
493 	  continue;
494 	}
495 
496       if (abs (dist) >= 2
497 	  && abs (dist) < *max_vf)
498 	{
499 	  /* The dependence distance requires reduction of the maximal
500 	     vectorization factor.  */
501 	  *max_vf = abs (dist);
502 	  if (dump_enabled_p ())
503 	    dump_printf_loc (MSG_NOTE, vect_location,
504 	                     "adjusting maximal vectorization factor to %i\n",
505 	                     *max_vf);
506 	}
507 
508       if (abs (dist) >= *max_vf)
509 	{
510 	  /* Dependence distance does not create dependence, as far as
511 	     vectorization is concerned, in this case.  */
512 	  if (dump_enabled_p ())
513 	    dump_printf_loc (MSG_NOTE, vect_location,
514 	                     "dependence distance >= VF.\n");
515 	  continue;
516 	}
517 
518       if (dump_enabled_p ())
519 	{
520 	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
521 	               "not vectorized, possible dependence "
522 	               "between data-refs ");
523 	  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
524 	  dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
525 	  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
526 	  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
527 	}
528 
529       return true;
530     }
531 
532   return false;
533 }
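
/* For example, for

     for (i = 0; i < n; i++)
       a[i + 3] = a[i] + 1;

   the read-after-write dependence has distance 3.  Any VF <= 3 still loads
   each a[i] before the store that produces it, so instead of rejecting the
   loop vect_analyze_data_ref_dependence merely caps *MAX_VF at 3.  */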
534 
535 /* Function vect_analyze_data_ref_dependences.
536 
537    Examine all the data references in the loop, and make sure there do not
538    exist any data dependences between them.  Set *MAX_VF according to
539    the maximum vectorization factor the data dependences allow.  */
540 
541 bool
542 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo, int *max_vf)
543 {
544   unsigned int i;
545   struct data_dependence_relation *ddr;
546 
547   if (dump_enabled_p ())
548     dump_printf_loc (MSG_NOTE, vect_location,
549                      "=== vect_analyze_data_ref_dependences ===\n");
550 
551   LOOP_VINFO_DDRS (loop_vinfo)
552     .create (LOOP_VINFO_DATAREFS (loop_vinfo).length ()
553 	     * LOOP_VINFO_DATAREFS (loop_vinfo).length ());
554   LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true;
555   /* We need read-read dependences to compute STMT_VINFO_SAME_ALIGN_REFS.  */
556   if (!compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
557 				&LOOP_VINFO_DDRS (loop_vinfo),
558 				LOOP_VINFO_LOOP_NEST (loop_vinfo), true))
559     return false;
560 
561   /* For epilogues we either have no aliases or alias versioning
562      was applied to the original loop.  Therefore we may just get max_vf
563      using VF of original loop.  */
564   if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
565     *max_vf = LOOP_VINFO_ORIG_VECT_FACTOR (loop_vinfo);
566   else
567     FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
568       if (vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf))
569 	return false;
570 
571   return true;
572 }
573 
574 
575 /* Function vect_slp_analyze_data_ref_dependence.
576 
577    Return TRUE if there (might) exist a dependence between a memory-reference
578    DRA and a memory-reference DRB.  Unlike the loop case there is no
579    run-time alias versioning fallback here, so any possible dependence
580    makes the SLP instance unvectorizable.  */
581 
582 static bool
583 vect_slp_analyze_data_ref_dependence (struct data_dependence_relation *ddr)
584 {
585   struct data_reference *dra = DDR_A (ddr);
586   struct data_reference *drb = DDR_B (ddr);
587 
588   /* We need to check dependences of statements marked as unvectorizable
589      as well, they still can prohibit vectorization.  */
590 
591   /* Independent data accesses.  */
592   if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
593     return false;
594 
595   if (dra == drb)
596     return false;
597 
598   /* Read-read is OK.  */
599   if (DR_IS_READ (dra) && DR_IS_READ (drb))
600     return false;
601 
602   /* If dra and drb are part of the same interleaving chain consider
603      them independent.  */
604   if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (DR_STMT (dra)))
605       && (GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dra)))
606 	  == GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (drb)))))
607     return false;
608 
609   /* Unknown data dependence.  */
610   if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
611     {
612       if  (dump_enabled_p ())
613 	{
614 	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
615 			   "can't determine dependence between ");
616 	  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
617 	  dump_printf (MSG_MISSED_OPTIMIZATION,  " and ");
618 	  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
619 	  dump_printf (MSG_MISSED_OPTIMIZATION,  "\n");
620 	}
621     }
622   else if (dump_enabled_p ())
623     {
624       dump_printf_loc (MSG_NOTE, vect_location,
625 		       "determined dependence between ");
626       dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
627       dump_printf (MSG_NOTE, " and ");
628       dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
629       dump_printf (MSG_NOTE,  "\n");
630     }
631 
632   return true;
633 }
634 
635 
636 /* Analyze dependences involved in the transform of SLP NODE.  STORES
637    contains the vector of scalar stores of this instance if we are
638    disambiguating the loads.  */
639 
640 static bool
641 vect_slp_analyze_node_dependences (slp_instance instance, slp_tree node,
642 				   vec<gimple *> stores, gimple *last_store)
643 {
644   /* This walks over all stmts involved in the SLP load/store done
645      in NODE verifying we can sink them up to the last stmt in the
646      group.  */
647   gimple *last_access = vect_find_last_scalar_stmt_in_slp (node);
648   for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
649     {
650       gimple *access = SLP_TREE_SCALAR_STMTS (node)[k];
651       if (access == last_access)
652 	continue;
653       data_reference *dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (access));
654       for (gimple_stmt_iterator gsi = gsi_for_stmt (access);
655 	   gsi_stmt (gsi) != last_access; gsi_next (&gsi))
656 	{
657 	  gimple *stmt = gsi_stmt (gsi);
658 	  if (! gimple_vuse (stmt)
659 	      || (DR_IS_READ (dr_a) && ! gimple_vdef (stmt)))
660 	    continue;
661 
662 	  /* If we couldn't record a (single) data reference for this
663 	     stmt we have to give up.  */
664 	  /* ???  Here and below if dependence analysis fails we can resort
665 	     to the alias oracle which can handle more kinds of stmts.  */
666 	  data_reference *dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt));
667 	  if (!dr_b)
668 	    return false;
669 
670 	  bool dependent = false;
671 	  /* If we run into a store of this same instance (we've just
672 	     marked those) then delay dependence checking until we run
673 	     into the last store because this is where it will have
674 	     been sunk to (and we verify if we can do that as well).  */
675 	  if (gimple_visited_p (stmt))
676 	    {
677 	      if (stmt != last_store)
678 		continue;
679 	      unsigned i;
680 	      gimple *store;
681 	      FOR_EACH_VEC_ELT (stores, i, store)
682 		{
683 		  data_reference *store_dr
684 		    = STMT_VINFO_DATA_REF (vinfo_for_stmt (store));
685 		  ddr_p ddr = initialize_data_dependence_relation
686 				(dr_a, store_dr, vNULL);
687 		  dependent = vect_slp_analyze_data_ref_dependence (ddr);
688 		  free_dependence_relation (ddr);
689 		  if (dependent)
690 		    break;
691 		}
692 	    }
693 	  else
694 	    {
695 	      ddr_p ddr = initialize_data_dependence_relation (dr_a,
696 							       dr_b, vNULL);
697 	      dependent = vect_slp_analyze_data_ref_dependence (ddr);
698 	      free_dependence_relation (ddr);
699 	    }
700 	  if (dependent)
701 	    return false;
702 	}
703     }
704   return true;
705 }
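
/* For example, for the SLP store group { a[0], a[1] } in

     a[0] = x;
     *p   = 1;
     a[1] = y;

   the vector store is emitted at the position of the last scalar store
   a[1] = y, so vect_slp_analyze_node_dependences has to verify that
   a[0] = x can be sunk across the intervening *p = 1, i.e. that a[0] and
   *p cannot alias.  */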
706 
707 
708 /* Function vect_slp_analyze_instance_dependence.
709 
710    Examine the data references in the SLP instance INSTANCE and make sure
711    there do not exist any data dependences that would prevent the stores
712    and loads from being sunk to their vectorized insert locations.  */
713 
714 bool
715 vect_slp_analyze_instance_dependence (slp_instance instance)
716 {
717   if (dump_enabled_p ())
718     dump_printf_loc (MSG_NOTE, vect_location,
719                      "=== vect_slp_analyze_instance_dependence ===\n");
720 
721   /* The stores of this instance are at the root of the SLP tree.  */
722   slp_tree store = SLP_INSTANCE_TREE (instance);
723   if (! STMT_VINFO_DATA_REF (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (store)[0])))
724     store = NULL;
725 
726   /* Verify we can sink stores to the vectorized stmt insert location.  */
727   gimple *last_store = NULL;
728   if (store)
729     {
730       if (! vect_slp_analyze_node_dependences (instance, store, vNULL, NULL))
731 	return false;
732 
733       /* Mark stores in this instance and remember the last one.  */
734       last_store = vect_find_last_scalar_stmt_in_slp (store);
735       for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
736 	gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k], true);
737     }
738 
739   bool res = true;
740 
741   /* Verify we can sink loads to the vectorized stmt insert location,
742      special-casing stores of this instance.  */
743   slp_tree load;
744   unsigned int i;
745   FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load)
746     if (! vect_slp_analyze_node_dependences (instance, load,
747 					     store
748 					     ? SLP_TREE_SCALAR_STMTS (store)
749 					     : vNULL, last_store))
750       {
751 	res = false;
752 	break;
753       }
754 
755   /* Unset the visited flag.  */
756   if (store)
757     for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
758       gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k], false);
759 
760   return res;
761 }
762 
763 /* Function vect_compute_data_ref_alignment
764 
765    Compute the misalignment of the data reference DR.
766 
767    Output:
768    1. If during the misalignment computation it is found that the data reference
769       cannot be vectorized then false is returned.
770    2. DR_MISALIGNMENT (DR) is defined.
771 
772    FOR NOW: No analysis is actually performed. Misalignment is calculated
773    only for trivial cases. TODO.  */
774 
775 bool
776 vect_compute_data_ref_alignment (struct data_reference *dr)
777 {
778   gimple *stmt = DR_STMT (dr);
779   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
780   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
781   struct loop *loop = NULL;
782   tree ref = DR_REF (dr);
783   tree vectype;
784   tree base, base_addr;
785   tree misalign = NULL_TREE;
786   tree aligned_to;
787   tree step;
788   unsigned HOST_WIDE_INT alignment;
789 
790   if (dump_enabled_p ())
791     dump_printf_loc (MSG_NOTE, vect_location,
792                      "vect_compute_data_ref_alignment:\n");
793 
794   if (loop_vinfo)
795     loop = LOOP_VINFO_LOOP (loop_vinfo);
796 
797   /* Initialize misalignment to unknown.  */
798   SET_DR_MISALIGNMENT (dr, -1);
799 
800   if (tree_fits_shwi_p (DR_STEP (dr)))
801     misalign = DR_INIT (dr);
802   aligned_to = DR_ALIGNED_TO (dr);
803   base_addr = DR_BASE_ADDRESS (dr);
804   vectype = STMT_VINFO_VECTYPE (stmt_info);
805 
806   /* In case the dataref is in an inner-loop of the loop that is being
807      vectorized (LOOP), we use the base and misalignment information
808      relative to the outer-loop (LOOP).  This is ok only if the misalignment
809      stays the same throughout the execution of the inner-loop, which is why
810      we have to check that the stride of the dataref in the inner-loop is a
811      multiple of the vector size.  */
812   if (loop && nested_in_vect_loop_p (loop, stmt))
813     {
814       tree step = DR_STEP (dr);
815 
816       if (tree_fits_shwi_p (step)
817 	  && tree_to_shwi (step) % GET_MODE_SIZE (TYPE_MODE (vectype)) == 0)
818         {
819           if (dump_enabled_p ())
820             dump_printf_loc (MSG_NOTE, vect_location,
821                              "inner step divides the vector-size.\n");
822 	  misalign = STMT_VINFO_DR_INIT (stmt_info);
823 	  aligned_to = STMT_VINFO_DR_ALIGNED_TO (stmt_info);
824 	  base_addr = STMT_VINFO_DR_BASE_ADDRESS (stmt_info);
825         }
826       else
827 	{
828 	  if (dump_enabled_p ())
829 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
830 	                     "inner step doesn't divide the vector-size.\n");
831 	  misalign = NULL_TREE;
832 	}
833     }
834 
835   /* Similarly we can only use base and misalignment information relative to
836      an innermost loop if the misalignment stays the same throughout the
837      execution of the loop.  This is the case if the stride of the dataref
838      times the vectorization factor is a multiple of the vector size.  */
839   else
840     {
841       tree step = DR_STEP (dr);
842       unsigned vf = loop ? LOOP_VINFO_VECT_FACTOR (loop_vinfo) : 1;
843 
844       if (tree_fits_shwi_p (step)
845 	  && ((tree_to_shwi (step) * vf)
846 	      % GET_MODE_SIZE (TYPE_MODE (vectype)) != 0))
847 	{
848 	  if (dump_enabled_p ())
849 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
850 	                     "step doesn't divide the vector-size.\n");
851 	  misalign = NULL_TREE;
852 	}
853     }
854 
855   /* To look at alignment of the base we have to preserve an inner MEM_REF
856      as that carries alignment information of the actual access.  */
857   base = ref;
858   while (handled_component_p (base))
859     base = TREE_OPERAND (base, 0);
860   unsigned int base_alignment = 0;
861   unsigned HOST_WIDE_INT base_bitpos;
862   get_object_alignment_1 (base, &base_alignment, &base_bitpos);
863   /* As data-ref analysis strips the MEM_REF down to its base operand
864      to form DR_BASE_ADDRESS and adds the offset to DR_INIT we have to
865      adjust things to make base_alignment valid as the alignment of
866      DR_BASE_ADDRESS.  */
867   if (TREE_CODE (base) == MEM_REF)
868     {
869       /* Note all this only works if DR_BASE_ADDRESS is the same as
870 	 MEM_REF operand zero, otherwise DR/SCEV analysis might have factored
871 	 in other offsets.  We need to rework DR to compute the alignment
872 	 of DR_BASE_ADDRESS as long as all information is still available.  */
873       if (operand_equal_p (TREE_OPERAND (base, 0), base_addr, 0))
874 	{
875 	  base_bitpos -= mem_ref_offset (base).to_short_addr () * BITS_PER_UNIT;
876 	  base_bitpos &= (base_alignment - 1);
877 	}
878       else
879 	base_bitpos = BITS_PER_UNIT;
880     }
881   if (base_bitpos != 0)
882     base_alignment = base_bitpos & -base_bitpos;
883   /* Also look at the alignment of the base address DR analysis
884      computed.  */
885   unsigned int base_addr_alignment = get_pointer_alignment (base_addr);
886   if (base_addr_alignment > base_alignment)
887     base_alignment = base_addr_alignment;
888 
889   if (base_alignment >= TYPE_ALIGN (TREE_TYPE (vectype)))
890     DR_VECT_AUX (dr)->base_element_aligned = true;
891 
892   alignment = TYPE_ALIGN_UNIT (vectype);
893 
894   if ((compare_tree_int (aligned_to, alignment) < 0)
895       || !misalign)
896     {
897       if (dump_enabled_p ())
898 	{
899 	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
900 	                   "Unknown alignment for access: ");
901 	  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
902 	  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
903 	}
904       return true;
905     }
906 
907   if (base_alignment < TYPE_ALIGN (vectype))
908     {
909       base = base_addr;
910       if (TREE_CODE (base) == ADDR_EXPR)
911 	base = TREE_OPERAND (base, 0);
912       if (!vect_can_force_dr_alignment_p (base, TYPE_ALIGN (vectype)))
913 	{
914 	  if (dump_enabled_p ())
915 	    {
916 	      dump_printf_loc (MSG_NOTE, vect_location,
917 	                       "can't force alignment of ref: ");
918 	      dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
919 	      dump_printf (MSG_NOTE, "\n");
920 	    }
921 	  return true;
922 	}
923 
924       if (DECL_USER_ALIGN (base))
925 	{
926 	  if (dump_enabled_p ())
927 	    {
928 	      dump_printf_loc (MSG_NOTE, vect_location,
929 			       "not forcing alignment of user-aligned "
930 			       "variable: ");
931 	      dump_generic_expr (MSG_NOTE, TDF_SLIM, base);
932 	      dump_printf (MSG_NOTE, "\n");
933 	    }
934 	  return true;
935 	}
936 
937       /* Force the alignment of the decl.
938 	 NOTE: This is the only change to the code we make during
939 	 the analysis phase, before deciding to vectorize the loop.  */
940       if (dump_enabled_p ())
941         {
942           dump_printf_loc (MSG_NOTE, vect_location, "force alignment of ");
943           dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
944           dump_printf (MSG_NOTE, "\n");
945         }
946 
947       DR_VECT_AUX (dr)->base_decl = base;
948       DR_VECT_AUX (dr)->base_misaligned = true;
949       DR_VECT_AUX (dr)->base_element_aligned = true;
950     }
951 
952   if (loop && nested_in_vect_loop_p (loop, stmt))
953     step = STMT_VINFO_DR_STEP (stmt_info);
954   else
955     step = DR_STEP (dr);
956   /* If this is a backward running DR then first access in the larger
957      vectype actually is N-1 elements before the address in the DR.
958      Adjust misalign accordingly.  */
959   if (tree_int_cst_sgn (step) < 0)
960     {
961       tree offset = ssize_int (TYPE_VECTOR_SUBPARTS (vectype) - 1);
962       /* DR_STEP(dr) is the same as -TYPE_SIZE of the scalar type,
963 	 otherwise we wouldn't be here.  */
964       offset = fold_build2 (MULT_EXPR, ssizetype, offset, step);
965       /* PLUS because STEP was negative.  */
966       misalign = size_binop (PLUS_EXPR, misalign, offset);
967     }
968 
969   SET_DR_MISALIGNMENT (dr,
970 		       wi::mod_floor (misalign, alignment, SIGNED).to_uhwi ());
971 
972   if (dump_enabled_p ())
973     {
974       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
975                        "misalign = %d bytes of ref ", DR_MISALIGNMENT (dr));
976       dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
977       dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
978     }
979 
980   return true;
981 }
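
/* For example, with a V4SI vectype (16-byte vector alignment), a data-ref
   whose base is known to be 16-byte aligned, whose DR_ALIGNED_TO is at
   least 16 and whose DR_INIT is 4 ends up with DR_MISALIGNMENT (dr) == 4
   in vect_compute_data_ref_alignment; if DR_ALIGNED_TO is smaller than the
   vector alignment, the misalignment stays at the "unknown" value -1 set
   at the start of the function.  */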
982 
983 
984 /* Function vect_update_misalignment_for_peel
985 
986    DR - the data reference whose misalignment is to be adjusted.
987    DR_PEEL - the data reference whose misalignment is being made
988              zero in the vector loop by the peel.
989    NPEEL - the number of iterations in the peel loop if the misalignment
990            of DR_PEEL is known at compile time.  */
991 
992 static void
993 vect_update_misalignment_for_peel (struct data_reference *dr,
994                                    struct data_reference *dr_peel, int npeel)
995 {
996   unsigned int i;
997   vec<dr_p> same_align_drs;
998   struct data_reference *current_dr;
999   int dr_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr))));
1000   int dr_peel_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr_peel))));
1001   stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
1002   stmt_vec_info peel_stmt_info = vinfo_for_stmt (DR_STMT (dr_peel));
1003 
1004   /* For interleaved data accesses the step in the loop must be multiplied by
1005      the size of the interleaving group.  */
1006   if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1007     dr_size *= GROUP_SIZE (vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info)));
1008   if (STMT_VINFO_GROUPED_ACCESS (peel_stmt_info))
1009     dr_peel_size *= GROUP_SIZE (peel_stmt_info);
1010 
1011   /* It can be assumed that the data refs with the same alignment as dr_peel
1012      are aligned in the vector loop.  */
1013   same_align_drs
1014     = STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (DR_STMT (dr_peel)));
1015   FOR_EACH_VEC_ELT (same_align_drs, i, current_dr)
1016     {
1017       if (current_dr != dr)
1018         continue;
1019       gcc_assert (DR_MISALIGNMENT (dr) / dr_size ==
1020                   DR_MISALIGNMENT (dr_peel) / dr_peel_size);
1021       SET_DR_MISALIGNMENT (dr, 0);
1022       return;
1023     }
1024 
1025   if (known_alignment_for_access_p (dr)
1026       && known_alignment_for_access_p (dr_peel))
1027     {
1028       bool negative = tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0;
1029       int misal = DR_MISALIGNMENT (dr);
1030       tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1031       misal += negative ? -npeel * dr_size : npeel * dr_size;
1032       misal &= (TYPE_ALIGN (vectype) / BITS_PER_UNIT) - 1;
1033       SET_DR_MISALIGNMENT (dr, misal);
1034       return;
1035     }
1036 
1037   if (dump_enabled_p ())
1038     dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment to -1.\n");
1039   SET_DR_MISALIGNMENT (dr, -1);
1040 }
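
/* For example, with a 16-byte vectype, an element size of 4 and NPEEL == 1,
   vect_update_misalignment_for_peel updates a data-ref with known
   misalignment 12 to (12 + 1 * 4) & 15 == 0 for a positive step and to
   (12 - 1 * 4) & 15 == 8 for a negative step.  */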
1041 
1042 
1043 /* Function verify_data_ref_alignment
1044 
1045    Return TRUE if DR can be handled with respect to alignment.  */
1046 
1047 static bool
1048 verify_data_ref_alignment (data_reference_p dr)
1049 {
1050   enum dr_alignment_support supportable_dr_alignment
1051     = vect_supportable_dr_alignment (dr, false);
1052   if (!supportable_dr_alignment)
1053     {
1054       if (dump_enabled_p ())
1055 	{
1056 	  if (DR_IS_READ (dr))
1057 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1058 			     "not vectorized: unsupported unaligned load.");
1059 	  else
1060 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1061 			     "not vectorized: unsupported unaligned "
1062 			     "store.");
1063 
1064 	  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
1065 			     DR_REF (dr));
1066 	  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1067 	}
1068       return false;
1069     }
1070 
1071   if (supportable_dr_alignment != dr_aligned && dump_enabled_p ())
1072     dump_printf_loc (MSG_NOTE, vect_location,
1073 		     "Vectorizing an unaligned access.\n");
1074 
1075   return true;
1076 }
1077 
1078 /* Function vect_verify_datarefs_alignment
1079 
1080    Return TRUE if all data references in the loop can be
1081    handled with respect to alignment.  */
1082 
1083 bool
1084 vect_verify_datarefs_alignment (loop_vec_info vinfo)
1085 {
1086   vec<data_reference_p> datarefs = vinfo->datarefs;
1087   struct data_reference *dr;
1088   unsigned int i;
1089 
1090   FOR_EACH_VEC_ELT (datarefs, i, dr)
1091     {
1092       gimple *stmt = DR_STMT (dr);
1093       stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1094 
1095       if (!STMT_VINFO_RELEVANT_P (stmt_info))
1096 	continue;
1097 
1098       /* For interleaving, only the alignment of the first access matters.   */
1099       if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1100 	  && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1101 	continue;
1102 
1103       /* Strided accesses perform only component accesses, alignment is
1104 	 irrelevant for them.  */
1105       if (STMT_VINFO_STRIDED_P (stmt_info)
1106 	  && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1107 	continue;
1108 
1109       if (! verify_data_ref_alignment (dr))
1110 	return false;
1111     }
1112 
1113   return true;
1114 }
1115 
1116 /* Given a memory reference EXP, return whether its alignment is less
1117    than its size.  */
1118 
1119 static bool
1120 not_size_aligned (tree exp)
1121 {
1122   if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp))))
1123     return true;
1124 
1125   return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp)))
1126 	  > get_object_alignment (exp));
1127 }
1128 
1129 /* Function vector_alignment_reachable_p
1130 
1131    Return true if vector alignment for DR is reachable by peeling
1132    a few loop iterations.  Return false otherwise.  */
1133 
1134 static bool
1135 vector_alignment_reachable_p (struct data_reference *dr)
1136 {
1137   gimple *stmt = DR_STMT (dr);
1138   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1139   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1140 
1141   if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1142     {
1143       /* For interleaved accesses we peel only if the number of iterations
1144 	 in the prolog loop (VF - misalignment) is a multiple of the
1145 	 number of interleaved accesses.  */
1146       int elem_size, mis_in_elements;
1147       int nelements = TYPE_VECTOR_SUBPARTS (vectype);
1148 
1149       /* FORNOW: handle only known alignment.  */
1150       if (!known_alignment_for_access_p (dr))
1151 	return false;
1152 
1153       elem_size = GET_MODE_SIZE (TYPE_MODE (vectype)) / nelements;
1154       mis_in_elements = DR_MISALIGNMENT (dr) / elem_size;
1155 
1156       if ((nelements - mis_in_elements) % GROUP_SIZE (stmt_info))
1157 	return false;
1158     }
1159 
1160   /* If misalignment is known at the compile time then allow peeling
1161      only if natural alignment is reachable through peeling.  */
1162   if (known_alignment_for_access_p (dr) && !aligned_access_p (dr))
1163     {
1164       HOST_WIDE_INT elmsize =
1165 		int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1166       if (dump_enabled_p ())
1167 	{
1168 	  dump_printf_loc (MSG_NOTE, vect_location,
1169 	                   "data size =" HOST_WIDE_INT_PRINT_DEC, elmsize);
1170 	  dump_printf (MSG_NOTE,
1171 	               ". misalignment = %d.\n", DR_MISALIGNMENT (dr));
1172 	}
1173       if (DR_MISALIGNMENT (dr) % elmsize)
1174 	{
1175 	  if (dump_enabled_p ())
1176 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1177 	                     "data size does not divide the misalignment.\n");
1178 	  return false;
1179 	}
1180     }
1181 
1182   if (!known_alignment_for_access_p (dr))
1183     {
1184       tree type = TREE_TYPE (DR_REF (dr));
1185       bool is_packed = not_size_aligned (DR_REF (dr));
1186       if (dump_enabled_p ())
1187 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1188 	                 "Unknown misalignment, %snaturally aligned\n",
1189 			 is_packed ? "not " : "");
1190       return targetm.vectorize.vector_alignment_reachable (type, is_packed);
1191     }
1192 
1193   return true;
1194 }
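
/* For example, with 4-byte elements in a V4SI vectype, a known misalignment
   of 8 bytes is removed by peeling two scalar iterations, whereas a known
   misalignment of 2 bytes can never be removed by peeling whole iterations
   (2 is not a multiple of the element size), so vector_alignment_reachable_p
   returns false for it.  */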
1195 
1196 
1197 /* Calculate the cost of the memory access represented by DR.  */
1198 
1199 static void
1200 vect_get_data_access_cost (struct data_reference *dr,
1201                            unsigned int *inside_cost,
1202                            unsigned int *outside_cost,
1203 			   stmt_vector_for_cost *body_cost_vec)
1204 {
1205   gimple *stmt = DR_STMT (dr);
1206   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1207   int nunits = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
1208   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1209   int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1210   int ncopies = vf / nunits;
1211 
1212   if (DR_IS_READ (dr))
1213     vect_get_load_cost (dr, ncopies, true, inside_cost, outside_cost,
1214 			NULL, body_cost_vec, false);
1215   else
1216     vect_get_store_cost (dr, ncopies, inside_cost, body_cost_vec);
1217 
1218   if (dump_enabled_p ())
1219     dump_printf_loc (MSG_NOTE, vect_location,
1220                      "vect_get_data_access_cost: inside_cost = %d, "
1221                      "outside_cost = %d.\n", *inside_cost, *outside_cost);
1222 }
1223 
1224 
1225 typedef struct _vect_peel_info
1226 {
1227   struct data_reference *dr;
1228   int npeel;
1229   unsigned int count;
1230 } *vect_peel_info;
1231 
1232 typedef struct _vect_peel_extended_info
1233 {
1234   struct _vect_peel_info peel_info;
1235   unsigned int inside_cost;
1236   unsigned int outside_cost;
1237   stmt_vector_for_cost body_cost_vec;
1238 } *vect_peel_extended_info;
1239 
1240 
1241 /* Peeling hashtable helpers.  */
1242 
1243 struct peel_info_hasher : free_ptr_hash <_vect_peel_info>
1244 {
1245   static inline hashval_t hash (const _vect_peel_info *);
1246   static inline bool equal (const _vect_peel_info *, const _vect_peel_info *);
1247 };
1248 
1249 inline hashval_t
1250 peel_info_hasher::hash (const _vect_peel_info *peel_info)
1251 {
1252   return (hashval_t) peel_info->npeel;
1253 }
1254 
1255 inline bool
1256 peel_info_hasher::equal (const _vect_peel_info *a, const _vect_peel_info *b)
1257 {
1258   return (a->npeel == b->npeel);
1259 }
1260 
1261 
1262 /* Insert DR into peeling hash table with NPEEL as key.  */
1263 
1264 static void
1265 vect_peeling_hash_insert (hash_table<peel_info_hasher> *peeling_htab,
1266 			  loop_vec_info loop_vinfo, struct data_reference *dr,
1267                           int npeel)
1268 {
1269   struct _vect_peel_info elem, *slot;
1270   _vect_peel_info **new_slot;
1271   bool supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1272 
1273   elem.npeel = npeel;
1274   slot = peeling_htab->find (&elem);
1275   if (slot)
1276     slot->count++;
1277   else
1278     {
1279       slot = XNEW (struct _vect_peel_info);
1280       slot->npeel = npeel;
1281       slot->dr = dr;
1282       slot->count = 1;
1283       new_slot = peeling_htab->find_slot (slot, INSERT);
1284       *new_slot = slot;
1285     }
1286 
1287   if (!supportable_dr_alignment
1288       && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1289     slot->count += VECT_MAX_COST;
1290 }
1291 
1292 
1293 /* Traverse peeling hash table to find peeling option that aligns maximum
1294    number of data accesses.  */
1295 
1296 int
1297 vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
1298 				     _vect_peel_extended_info *max)
1299 {
1300   vect_peel_info elem = *slot;
1301 
1302   if (elem->count > max->peel_info.count
1303       || (elem->count == max->peel_info.count
1304           && max->peel_info.npeel > elem->npeel))
1305     {
1306       max->peel_info.npeel = elem->npeel;
1307       max->peel_info.count = elem->count;
1308       max->peel_info.dr = elem->dr;
1309     }
1310 
1311   return 1;
1312 }
1313 
1314 
1315 /* Traverse peeling hash table and calculate cost for each peeling option.
1316    Find the one with the lowest cost.  */
1317 
1318 int
1319 vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
1320 				   _vect_peel_extended_info *min)
1321 {
1322   vect_peel_info elem = *slot;
1323   int save_misalignment, dummy;
1324   unsigned int inside_cost = 0, outside_cost = 0, i;
1325   gimple *stmt = DR_STMT (elem->dr);
1326   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1327   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1328   vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1329   struct data_reference *dr;
1330   stmt_vector_for_cost prologue_cost_vec, body_cost_vec, epilogue_cost_vec;
1331 
1332   prologue_cost_vec.create (2);
1333   body_cost_vec.create (2);
1334   epilogue_cost_vec.create (2);
1335 
1336   FOR_EACH_VEC_ELT (datarefs, i, dr)
1337     {
1338       stmt = DR_STMT (dr);
1339       stmt_info = vinfo_for_stmt (stmt);
1340       /* For interleaving, only the alignment of the first access
1341          matters.  */
1342       if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1343           && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1344         continue;
1345 
1346       /* Strided accesses perform only component accesses, alignment is
1347          irrelevant for them.  */
1348       if (STMT_VINFO_STRIDED_P (stmt_info)
1349 	  && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1350 	continue;
1351 
1352       save_misalignment = DR_MISALIGNMENT (dr);
1353       vect_update_misalignment_for_peel (dr, elem->dr, elem->npeel);
1354       vect_get_data_access_cost (dr, &inside_cost, &outside_cost,
1355 				 &body_cost_vec);
1356       SET_DR_MISALIGNMENT (dr, save_misalignment);
1357     }
1358 
1359   outside_cost += vect_get_known_peeling_cost
1360     (loop_vinfo, elem->npeel, &dummy,
1361      &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1362      &prologue_cost_vec, &epilogue_cost_vec);
1363 
1364   /* Prologue and epilogue costs are added to the target model later.
1365      These costs depend only on the scalar iteration cost, the
1366      number of peeling iterations finally chosen, and the number of
1367      misaligned statements.  So discard the information found here.  */
1368   prologue_cost_vec.release ();
1369   epilogue_cost_vec.release ();
1370 
1371   if (inside_cost < min->inside_cost
1372       || (inside_cost == min->inside_cost && outside_cost < min->outside_cost))
1373     {
1374       min->inside_cost = inside_cost;
1375       min->outside_cost = outside_cost;
1376       min->body_cost_vec.release ();
1377       min->body_cost_vec = body_cost_vec;
1378       min->peel_info.dr = elem->dr;
1379       min->peel_info.npeel = elem->npeel;
1380     }
1381   else
1382     body_cost_vec.release ();
1383 
1384   return 1;
1385 }
1386 
1387 
1388 /* Choose best peeling option by traversing peeling hash table and either
1389    choosing an option with the lowest cost (if cost model is enabled) or the
1390    option that aligns as many accesses as possible.  */
1391 
1392 static struct data_reference *
1393 vect_peeling_hash_choose_best_peeling (hash_table<peel_info_hasher> *peeling_htab,
1394 				       loop_vec_info loop_vinfo,
1395                                        unsigned int *npeel,
1396 				       stmt_vector_for_cost *body_cost_vec)
1397 {
1398    struct _vect_peel_extended_info res;
1399 
1400    res.peel_info.dr = NULL;
1401    res.body_cost_vec = stmt_vector_for_cost ();
1402 
1403    if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1404      {
1405        res.inside_cost = INT_MAX;
1406        res.outside_cost = INT_MAX;
1407        peeling_htab->traverse <_vect_peel_extended_info *,
1408 	   		       vect_peeling_hash_get_lowest_cost> (&res);
1409      }
1410    else
1411      {
1412        res.peel_info.count = 0;
1413        peeling_htab->traverse <_vect_peel_extended_info *,
1414 	   		       vect_peeling_hash_get_most_frequent> (&res);
1415      }
1416 
1417    *npeel = res.peel_info.npeel;
1418    *body_cost_vec = res.body_cost_vec;
1419    return res.peel_info.dr;
1420 }
1421 
1422 
1423 /* Function vect_enhance_data_refs_alignment
1424 
1425    This pass will use loop versioning and loop peeling in order to enhance
1426    the alignment of data references in the loop.
1427 
1428    FOR NOW: we assume that whatever versioning/peeling takes place, only the
1429    original loop is to be vectorized.  Any other loops that are created by
1430    the transformations performed in this pass are not supposed to be
1431    vectorized.  This restriction will be relaxed.
1432 
1433    This pass will require a cost model to guide it whether to apply peeling
1434    or versioning or a combination of the two.  For example, the scheme that
1435    Intel uses when given a loop with several memory accesses is as follows:
1436    choose one memory access ('p') whose alignment you want to force by doing
1437    peeling.  Then, either (1) generate a loop in which 'p' is aligned and all
1438    other accesses are not necessarily aligned, or (2) use loop versioning to
1439    generate one loop in which all accesses are aligned, and another loop in
1440    which only 'p' is necessarily aligned.
1441 
1442    ("Automatic Intra-Register Vectorization for the Intel Architecture",
1443    Aart J.C. Bik, Milind Girkar, Paul M. Grey and Xinmin Tian, International
1444    Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
1445 
1446    Devising a cost model is the most critical aspect of this work.  It will
1447    guide us on which access to peel for, whether to use loop versioning, how
1448    many versions to create, etc.  The cost model will probably consist of
1449    generic considerations as well as target specific considerations (on
1450    powerpc for example, misaligned stores are more painful than misaligned
1451    loads).
1452 
1453    Here are the general steps involved in alignment enhancements:
1454 
1455      -- original loop, before alignment analysis:
1456 	for (i=0; i<N; i++){
1457 	  x = q[i];			# DR_MISALIGNMENT(q) = unknown
1458 	  p[i] = y;			# DR_MISALIGNMENT(p) = unknown
1459 	}
1460 
1461      -- After vect_compute_data_refs_alignment:
1462 	for (i=0; i<N; i++){
1463 	  x = q[i];			# DR_MISALIGNMENT(q) = 3
1464 	  p[i] = y;			# DR_MISALIGNMENT(p) = unknown
1465 	}
1466 
1467      -- Possibility 1: we do loop versioning:
1468      if (p is aligned) {
1469 	for (i=0; i<N; i++){	# loop 1A
1470 	  x = q[i];			# DR_MISALIGNMENT(q) = 3
1471 	  p[i] = y;			# DR_MISALIGNMENT(p) = 0
1472 	}
1473      }
1474      else {
1475 	for (i=0; i<N; i++){	# loop 1B
1476 	  x = q[i];			# DR_MISALIGNMENT(q) = 3
1477 	  p[i] = y;			# DR_MISALIGNMENT(p) = unaligned
1478 	}
1479      }
1480 
1481      -- Possibility 2: we do loop peeling:
1482      for (i = 0; i < 3; i++){	# (scalar loop, not to be vectorized).
1483 	x = q[i];
1484 	p[i] = y;
1485      }
1486      for (i = 3; i < N; i++){	# loop 2A
1487 	x = q[i];			# DR_MISALIGNMENT(q) = 0
1488 	p[i] = y;			# DR_MISALIGNMENT(p) = unknown
1489      }
1490 
1491      -- Possibility 3: combination of loop peeling and versioning:
1492      for (i = 0; i < 3; i++){	# (scalar loop, not to be vectorized).
1493 	x = q[i];
1494 	p[i] = y;
1495      }
1496      if (p is aligned) {
1497 	for (i = 3; i<N; i++){	# loop 3A
1498 	  x = q[i];			# DR_MISALIGNMENT(q) = 0
1499 	  p[i] = y;			# DR_MISALIGNMENT(p) = 0
1500 	}
1501      }
1502      else {
1503 	for (i = 3; i<N; i++){	# loop 3B
1504 	  x = q[i];			# DR_MISALIGNMENT(q) = 0
1505 	  p[i] = y;			# DR_MISALIGNMENT(p) = unaligned
1506 	}
1507      }
1508 
1509      These loops are later passed to loop_transform to be vectorized.  The
1510      vectorizer will use the alignment information to guide the transformation
1511      (whether to generate regular loads/stores, or with special handling for
1512      misalignment).  */
1513 
1514 bool
1515 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
1516 {
1517   vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1518   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1519   enum dr_alignment_support supportable_dr_alignment;
1520   struct data_reference *dr0 = NULL, *first_store = NULL;
1521   struct data_reference *dr;
1522   unsigned int i, j;
1523   bool do_peeling = false;
1524   bool do_versioning = false;
1525   bool stat;
1526   gimple *stmt;
1527   stmt_vec_info stmt_info;
1528   unsigned int npeel = 0;
1529   bool all_misalignments_unknown = true;
1530   unsigned int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1531   unsigned possible_npeel_number = 1;
1532   tree vectype;
1533   unsigned int nelements, mis, same_align_drs_max = 0;
1534   stmt_vector_for_cost body_cost_vec = stmt_vector_for_cost ();
1535   hash_table<peel_info_hasher> peeling_htab (1);
1536 
1537   if (dump_enabled_p ())
1538     dump_printf_loc (MSG_NOTE, vect_location,
1539                      "=== vect_enhance_data_refs_alignment ===\n");
1540 
1541   /* Reset data so we can safely be called multiple times.  */
1542   LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
1543   LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = 0;
1544 
1545   /* While cost model enhancements are expected in the future, the high level
1546      view of the code at this time is as follows:
1547 
1548      A) If there is a misaligned access then see if peeling to align
1549         this access can make all data references satisfy
1550         vect_supportable_dr_alignment.  If so, update data structures
1551         as needed and return true.
1552 
1553      B) If peeling wasn't possible and there is a data reference with an
1554         unknown misalignment that does not satisfy vect_supportable_dr_alignment
1555         then see if loop versioning checks can be used to make all data
1556         references satisfy vect_supportable_dr_alignment.  If so, update
1557         data structures as needed and return true.
1558 
1559      C) If neither peeling nor versioning were successful then return false if
1560         any data reference does not satisfy vect_supportable_dr_alignment.
1561 
1562      D) Return true (all data references satisfy vect_supportable_dr_alignment).
1563 
1564      Note that Possibility 3 above (peeling and versioning together) is not
1565      being done at this time.  */
1566 
1567   /* (1) Peeling to force alignment.  */
1568 
1569   /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
1570      Considerations:
1571      + How many accesses will become aligned due to the peeling
1572      - How many accesses will become unaligned due to the peeling,
1573        and the cost of misaligned accesses.
1574      - The cost of peeling (the extra runtime checks, the increase
1575        in code size).  */
1576 
1577   FOR_EACH_VEC_ELT (datarefs, i, dr)
1578     {
1579       stmt = DR_STMT (dr);
1580       stmt_info = vinfo_for_stmt (stmt);
1581 
1582       if (!STMT_VINFO_RELEVANT_P (stmt_info))
1583 	continue;
1584 
1585       /* For interleaving, only the alignment of the first access
1586          matters.  */
1587       if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1588           && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1589         continue;
1590 
1591       /* For invariant accesses there is nothing to enhance.  */
1592       if (integer_zerop (DR_STEP (dr)))
1593 	continue;
1594 
1595       /* Strided accesses perform only component accesses, alignment is
1596 	 irrelevant for them.  */
1597       if (STMT_VINFO_STRIDED_P (stmt_info)
1598 	  && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1599 	continue;
1600 
1601       supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1602       do_peeling = vector_alignment_reachable_p (dr);
1603       if (do_peeling)
1604         {
1605           if (known_alignment_for_access_p (dr))
1606             {
1607               unsigned int npeel_tmp;
1608 	      bool negative = tree_int_cst_compare (DR_STEP (dr),
1609 						    size_zero_node) < 0;
1610 
1611               /* Save info about DR in the hash table.  */
1612               vectype = STMT_VINFO_VECTYPE (stmt_info);
1613               nelements = TYPE_VECTOR_SUBPARTS (vectype);
1614               mis = DR_MISALIGNMENT (dr) / GET_MODE_SIZE (TYPE_MODE (
1615                                                 TREE_TYPE (DR_REF (dr))));
1616               npeel_tmp = (negative
1617 			   ? (mis - nelements) : (nelements - mis))
1618 		  & (nelements - 1);
1619 
1620               /* For multiple types, it is possible that the bigger type access
1621                  will have more than one peeling option.  E.g., a loop with two
1622                  types: one of size (vector size / 4), and the other one of
1623                  size (vector size / 8).  The vectorization factor will be 8.
1624                  If both accesses are misaligned by 3 elements, the first one
1625                  needs one scalar iteration to be aligned, and the second one
1626                  needs 5.  But the first one will also be aligned by peeling 5
1627                  scalar iterations, and in that case both accesses will be
1628                  aligned.  Hence, in addition to the immediate peeling amount,
1629                  we also want to try adding full vector sizes, as long as we
1630                  don't exceed the vectorization factor.
1631                  We do this automatically when using the unlimited cost model,
1632                  since we then calculate the cost for every peeling option.  */
1633               if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1634 		{
1635 		  if (STMT_SLP_TYPE (stmt_info))
1636 		    possible_npeel_number
1637 		      = (vf * GROUP_SIZE (stmt_info)) / nelements;
1638 		  else
1639 		    possible_npeel_number = vf / nelements;
1640 		}
1641 
1642               /* Handle the aligned case. We may decide to align some other
1643                  access, making DR unaligned.  */
1644               if (DR_MISALIGNMENT (dr) == 0)
1645                 {
1646                   npeel_tmp = 0;
1647                   if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1648                     possible_npeel_number++;
1649                 }
1650 
1651               for (j = 0; j < possible_npeel_number; j++)
1652                 {
1653                   vect_peeling_hash_insert (&peeling_htab, loop_vinfo,
1654 					    dr, npeel_tmp);
1655                   npeel_tmp += nelements;
1656                 }
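              /* Illustration only, using the example from the comment above
                 and assuming the unlimited cost model: with vf = 8, a positive
                 step and two accesses that are both misaligned by 3 elements,

                   nelements = 4: npeel_tmp = (4 - 3) & 3 = 1,
                                  possible_npeel_number = 8 / 4 = 2,
                                  so peeling amounts 1 and 5 are recorded;
                   nelements = 8: npeel_tmp = (8 - 3) & 7 = 5,
                                  possible_npeel_number = 8 / 8 = 1,
                                  so only peeling amount 5 is recorded.

                 The cost model can then pick npeel = 5, which aligns both
                 accesses.  */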
1657 
1658               all_misalignments_unknown = false;
1659               /* Data-ref that was chosen for the case that all the
1660                  misalignments are unknown is not relevant anymore, since we
1661                  have a data-ref with known alignment.  */
1662               dr0 = NULL;
1663             }
1664           else
1665             {
1666               /* If we don't know any misalignment values, we prefer
1667                  peeling for the data-ref that has the maximum number of data-refs
1668                  with the same alignment, unless the target prefers to align
1669                  stores over loads.  */
1670               if (all_misalignments_unknown)
1671                 {
1672 		  unsigned same_align_drs
1673 		    = STMT_VINFO_SAME_ALIGN_REFS (stmt_info).length ();
1674                   if (!dr0
1675 		      || same_align_drs_max < same_align_drs)
1676                     {
1677                       same_align_drs_max = same_align_drs;
1678                       dr0 = dr;
1679                     }
1680 		  /* For data-refs with the same number of related
1681 		     accesses prefer the one where the misalign
1682 		     computation will be invariant in the outermost loop.  */
1683 		  else if (same_align_drs_max == same_align_drs)
1684 		    {
1685 		      struct loop *ivloop0, *ivloop;
1686 		      ivloop0 = outermost_invariant_loop_for_expr
1687 			  (loop, DR_BASE_ADDRESS (dr0));
1688 		      ivloop = outermost_invariant_loop_for_expr
1689 			  (loop, DR_BASE_ADDRESS (dr));
1690 		      if ((ivloop && !ivloop0)
1691 			  || (ivloop && ivloop0
1692 			      && flow_loop_nested_p (ivloop, ivloop0)))
1693 			dr0 = dr;
1694 		    }
1695 
1696                   if (!first_store && DR_IS_WRITE (dr))
1697                     first_store = dr;
1698                 }
1699 
1700               /* If there are both known and unknown misaligned accesses in the
1701                  loop, we choose the peeling amount according to the known
1702                  accesses.  */
1703               if (!supportable_dr_alignment)
1704                 {
1705                   dr0 = dr;
1706                   if (!first_store && DR_IS_WRITE (dr))
1707                     first_store = dr;
1708                 }
1709             }
1710         }
1711       else
1712         {
1713           if (!aligned_access_p (dr))
1714             {
1715               if (dump_enabled_p ())
1716                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1717                                  "vector alignment may not be reachable\n");
1718               break;
1719             }
1720         }
1721     }
1722 
1723   /* Check if we can possibly peel the loop.  */
1724   if (!vect_can_advance_ivs_p (loop_vinfo)
1725       || !slpeel_can_duplicate_loop_p (loop, single_exit (loop))
1726       || loop->inner)
1727     do_peeling = false;
1728 
1729   if (do_peeling
1730       && all_misalignments_unknown
1731       && vect_supportable_dr_alignment (dr0, false))
1732     {
1733       /* Check whether the target prefers to align stores over loads, i.e., if
1734          misaligned stores are more expensive than misaligned loads (taking
1735          drs with same alignment into account).  */
1736       if (first_store && DR_IS_READ (dr0))
1737         {
1738           unsigned int load_inside_cost = 0, load_outside_cost = 0;
1739           unsigned int store_inside_cost = 0, store_outside_cost = 0;
1740           unsigned int load_inside_penalty = 0, load_outside_penalty = 0;
1741           unsigned int store_inside_penalty = 0, store_outside_penalty = 0;
1742 	  stmt_vector_for_cost dummy;
1743 	  dummy.create (2);
1744 
1745           vect_get_data_access_cost (dr0, &load_inside_cost, &load_outside_cost,
1746 				     &dummy);
1747           vect_get_data_access_cost (first_store, &store_inside_cost,
1748 				     &store_outside_cost, &dummy);
1749 
1750 	  dummy.release ();
1751 
1752           /* Calculate the penalty for leaving FIRST_STORE unaligned (by
1753              aligning the load DR0).  */
1754           load_inside_penalty = store_inside_cost;
1755           load_outside_penalty = store_outside_cost;
1756           for (i = 0;
1757 	       STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (
1758 			  DR_STMT (first_store))).iterate (i, &dr);
1759                i++)
1760             if (DR_IS_READ (dr))
1761               {
1762                 load_inside_penalty += load_inside_cost;
1763                 load_outside_penalty += load_outside_cost;
1764               }
1765             else
1766               {
1767                 load_inside_penalty += store_inside_cost;
1768                 load_outside_penalty += store_outside_cost;
1769               }
1770 
1771           /* Calculate the penalty for leaving DR0 unaligned (by
1772              aligning the FIRST_STORE).  */
1773           store_inside_penalty = load_inside_cost;
1774           store_outside_penalty = load_outside_cost;
1775           for (i = 0;
1776 	       STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (
1777 		      DR_STMT (dr0))).iterate (i, &dr);
1778                i++)
1779             if (DR_IS_READ (dr))
1780               {
1781                 store_inside_penalty += load_inside_cost;
1782                 store_outside_penalty += load_outside_cost;
1783               }
1784             else
1785               {
1786                 store_inside_penalty += store_inside_cost;
1787                 store_outside_penalty += store_outside_cost;
1788               }
1789 
1790           if (load_inside_penalty > store_inside_penalty
1791               || (load_inside_penalty == store_inside_penalty
1792                   && load_outside_penalty > store_outside_penalty))
1793             dr0 = first_store;
1794         }
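      /* Illustration only, with assumed target costs: if a misaligned vector
	 store costs 3 and a misaligned vector load costs 1, and neither DR0
	 nor FIRST_STORE has same-align references, then above we get
	 load_inside_penalty = store_inside_cost = 3 (the price of leaving
	 FIRST_STORE unaligned) while store_inside_penalty = load_inside_cost
	 = 1 (the price of leaving DR0 unaligned).  Since 3 > 1, dr0 is
	 switched to FIRST_STORE, i.e. we peel to align the store.  */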
1795 
1796       /* In case there are only loads with different unknown misalignments, use
1797          peeling only if it may help to align other accesses in the loop or
1798	 if it may help improve load bandwidth when we'd end up using
1799 	 unaligned loads.  */
1800       tree dr0_vt = STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr0)));
1801       if (!first_store
1802 	  && !STMT_VINFO_SAME_ALIGN_REFS (
1803 		  vinfo_for_stmt (DR_STMT (dr0))).length ()
1804 	  && (vect_supportable_dr_alignment (dr0, false)
1805 	      != dr_unaligned_supported
1806 	      || (builtin_vectorization_cost (vector_load, dr0_vt, 0)
1807 		  == builtin_vectorization_cost (unaligned_load, dr0_vt, -1))))
1808         do_peeling = false;
1809     }
1810 
1811   if (do_peeling && !dr0)
1812     {
1813       /* Peeling is possible, but no data access requires alignment in order
1814          to be supported.  So we try to choose the best possible peeling.  */
1815 
1816       /* We should get here only if there are drs with known misalignment.  */
1817       gcc_assert (!all_misalignments_unknown);
1818 
1819       /* Choose the best peeling from the hash table.  */
1820       dr0 = vect_peeling_hash_choose_best_peeling (&peeling_htab,
1821 						   loop_vinfo, &npeel,
1822 						   &body_cost_vec);
1823       if (!dr0 || !npeel)
1824         do_peeling = false;
1825     }
1826 
1827   if (do_peeling)
1828     {
1829       stmt = DR_STMT (dr0);
1830       stmt_info = vinfo_for_stmt (stmt);
1831       vectype = STMT_VINFO_VECTYPE (stmt_info);
1832       nelements = TYPE_VECTOR_SUBPARTS (vectype);
1833 
1834       if (known_alignment_for_access_p (dr0))
1835         {
1836 	  bool negative = tree_int_cst_compare (DR_STEP (dr0),
1837 						size_zero_node) < 0;
1838           if (!npeel)
1839             {
1840               /* Since it's known at compile time, compute the number of
1841                  iterations in the peeled loop (the peeling factor) for use in
1842                  updating DR_MISALIGNMENT values.  The peeling factor is the
1843                  number of vector elements minus the misalignment as an element
1844                  count (modulo the number of vector elements).  */
1845               mis = DR_MISALIGNMENT (dr0);
1846               mis /= GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr0))));
1847               npeel = ((negative ? mis - nelements : nelements - mis)
1848 		       & (nelements - 1));
1849             }
1850 
1851	  /* For interleaved data accesses, every iteration accesses all the
1852	     members of the group; therefore we divide the number of iterations
1853	     by the group size.  */
1854 	  stmt_info = vinfo_for_stmt (DR_STMT (dr0));
1855 	  if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1856 	    npeel /= GROUP_SIZE (stmt_info);
1857 
1858           if (dump_enabled_p ())
1859             dump_printf_loc (MSG_NOTE, vect_location,
1860                              "Try peeling by %d\n", npeel);
1861         }
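      /* A worked example (values assumed): for a V4SI access (nelements = 4,
	 element size 4 bytes) with DR_MISALIGNMENT (dr0) = 8 and a positive
	 step, mis = 8 / 4 = 2 elements and npeel = (4 - 2) & 3 = 2, i.e.
	 peeling two scalar iterations brings dr0 to a vector-aligned
	 address.  For a grouped access with GROUP_SIZE 2 the amount is then
	 divided down to npeel = 1.  */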
1862 
1863       /* Ensure that all data refs can be vectorized after the peel.  */
1864       FOR_EACH_VEC_ELT (datarefs, i, dr)
1865         {
1866           int save_misalignment;
1867 
1868 	  if (dr == dr0)
1869 	    continue;
1870 
1871 	  stmt = DR_STMT (dr);
1872 	  stmt_info = vinfo_for_stmt (stmt);
1873 	  /* For interleaving, only the alignment of the first access
1874             matters.  */
1875 	  if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1876 	      && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1877 	    continue;
1878 
1879 	  /* Strided accesses perform only component accesses, alignment is
1880 	     irrelevant for them.  */
1881 	  if (STMT_VINFO_STRIDED_P (stmt_info)
1882 	      && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1883 	    continue;
1884 
1885 	  save_misalignment = DR_MISALIGNMENT (dr);
1886 	  vect_update_misalignment_for_peel (dr, dr0, npeel);
1887 	  supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
1888 	  SET_DR_MISALIGNMENT (dr, save_misalignment);
1889 
1890 	  if (!supportable_dr_alignment)
1891 	    {
1892 	      do_peeling = false;
1893 	      break;
1894 	    }
1895 	}
1896 
1897       if (do_peeling && known_alignment_for_access_p (dr0) && npeel == 0)
1898         {
1899           stat = vect_verify_datarefs_alignment (loop_vinfo);
1900           if (!stat)
1901             do_peeling = false;
1902           else
1903 	    {
1904 	      body_cost_vec.release ();
1905 	      return stat;
1906 	    }
1907         }
1908 
1909       /* Cost model #1 - honor --param vect-max-peeling-for-alignment.  */
1910       if (do_peeling)
1911         {
1912           unsigned max_allowed_peel
1913             = PARAM_VALUE (PARAM_VECT_MAX_PEELING_FOR_ALIGNMENT);
1914           if (max_allowed_peel != (unsigned)-1)
1915             {
1916               unsigned max_peel = npeel;
1917               if (max_peel == 0)
1918                 {
1919 		  gimple *dr_stmt = DR_STMT (dr0);
1920                   stmt_vec_info vinfo = vinfo_for_stmt (dr_stmt);
1921                   tree vtype = STMT_VINFO_VECTYPE (vinfo);
1922                   max_peel = TYPE_VECTOR_SUBPARTS (vtype) - 1;
1923                 }
1924               if (max_peel > max_allowed_peel)
1925                 {
1926                   do_peeling = false;
1927                   if (dump_enabled_p ())
1928                     dump_printf_loc (MSG_NOTE, vect_location,
1929                         "Disable peeling, max peels reached: %d\n", max_peel);
1930                 }
1931             }
1932         }
1933 
1934       /* Cost model #2 - if peeling may result in a remaining loop not
1935 	 iterating enough to be vectorized then do not peel.  */
1936       if (do_peeling
1937 	  && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1938 	{
1939 	  unsigned max_peel
1940 	    = npeel == 0 ? LOOP_VINFO_VECT_FACTOR (loop_vinfo) - 1 : npeel;
1941 	  if (LOOP_VINFO_INT_NITERS (loop_vinfo)
1942 	      < LOOP_VINFO_VECT_FACTOR (loop_vinfo) + max_peel)
1943 	    do_peeling = false;
1944 	}
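      /* Illustration of the two caps above (parameter values assumed): with
	 an unknown compile-time misalignment (npeel == 0) and a V4SI dr0,
	 max_peel = 4 - 1 = 3; if --param vect-max-peeling-for-alignment=2,
	 peeling is disabled.  Likewise, with vf = 4 and a known trip count
	 of 6, 6 < 4 + 3, so the remaining loop might not iterate enough and
	 peeling is disabled as well.  */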
1945 
1946       if (do_peeling)
1947         {
1948           /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
1949              If the misalignment of DR_i is identical to that of dr0 then set
1950              DR_MISALIGNMENT (DR_i) to zero.  If the misalignment of DR_i and
1951              dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
1952              by the peeling factor times the element size of DR_i (MOD the
1953              vectorization factor times the size).  Otherwise, the
1954              misalignment of DR_i must be set to unknown.  */
1955 	  FOR_EACH_VEC_ELT (datarefs, i, dr)
1956 	    if (dr != dr0)
1957 	      {
1958 		/* Strided accesses perform only component accesses, alignment
1959 		   is irrelevant for them.  */
1960 		stmt_info = vinfo_for_stmt (DR_STMT (dr));
1961 		if (STMT_VINFO_STRIDED_P (stmt_info)
1962 		    && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1963 		  continue;
1964 
1965 		vect_update_misalignment_for_peel (dr, dr0, npeel);
1966 	      }
1967 
1968           LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0;
1969           if (npeel)
1970             LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
1971           else
1972             LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1973 	      = DR_MISALIGNMENT (dr0);
1974 	  SET_DR_MISALIGNMENT (dr0, 0);
1975 	  if (dump_enabled_p ())
1976             {
1977               dump_printf_loc (MSG_NOTE, vect_location,
1978                                "Alignment of access forced using peeling.\n");
1979               dump_printf_loc (MSG_NOTE, vect_location,
1980                                "Peeling for alignment will be applied.\n");
1981             }
1982 	  /* The inside-loop cost will be accounted for in vectorizable_load
1983 	     and vectorizable_store correctly with adjusted alignments.
1984	     Drop the body_cost_vec on the floor here.  */
1985 	  body_cost_vec.release ();
1986 
1987 	  stat = vect_verify_datarefs_alignment (loop_vinfo);
1988 	  gcc_assert (stat);
1989           return stat;
1990         }
1991     }
1992 
1993   body_cost_vec.release ();
1994 
1995   /* (2) Versioning to force alignment.  */
1996 
1997   /* Try versioning if:
1998      1) the loop is optimized for speed, and
1999      2) there is at least one unsupported misaligned data ref with an unknown
2000         misalignment, and
2001      3) all misaligned data refs with a known misalignment are supported, and
2002      4) the number of runtime alignment checks is within reason.  */
2003 
2004   do_versioning =
2005 	optimize_loop_nest_for_speed_p (loop)
2006 	&& (!loop->inner); /* FORNOW */
2007 
2008   if (do_versioning)
2009     {
2010       FOR_EACH_VEC_ELT (datarefs, i, dr)
2011         {
2012 	  stmt = DR_STMT (dr);
2013 	  stmt_info = vinfo_for_stmt (stmt);
2014 
2015 	  /* For interleaving, only the alignment of the first access
2016 	     matters.  */
2017 	  if (aligned_access_p (dr)
2018 	      || (STMT_VINFO_GROUPED_ACCESS (stmt_info)
2019 		  && GROUP_FIRST_ELEMENT (stmt_info) != stmt))
2020 	    continue;
2021 
2022 	  if (STMT_VINFO_STRIDED_P (stmt_info))
2023 	    {
2024 	      /* Strided loads perform only component accesses, alignment is
2025 		 irrelevant for them.  */
2026 	      if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
2027 		continue;
2028 	      do_versioning = false;
2029 	      break;
2030 	    }
2031 
2032 	  supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
2033 
2034           if (!supportable_dr_alignment)
2035             {
2036 	      gimple *stmt;
2037               int mask;
2038               tree vectype;
2039 
2040               if (known_alignment_for_access_p (dr)
2041                   || LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
2042                      >= (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS))
2043                 {
2044                   do_versioning = false;
2045                   break;
2046                 }
2047 
2048               stmt = DR_STMT (dr);
2049               vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
2050               gcc_assert (vectype);
2051 
2052               /* The rightmost bits of an aligned address must be zeros.
2053                  Construct the mask needed for this test.  For example,
2054                  GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
2055                  mask must be 15 = 0xf. */
2056               mask = GET_MODE_SIZE (TYPE_MODE (vectype)) - 1;
2057 
2058               /* FORNOW: use the same mask to test all potentially unaligned
2059                  references in the loop.  The vectorizer currently supports
2060                  a single vector size, see the reference to
2061                  GET_MODE_NUNITS (TYPE_MODE (vectype)) where the
2062                  vectorization factor is computed.  */
2063               gcc_assert (!LOOP_VINFO_PTR_MASK (loop_vinfo)
2064                           || LOOP_VINFO_PTR_MASK (loop_vinfo) == mask);
2065               LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
2066               LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (
2067 		      DR_STMT (dr));
2068             }
2069         }
2070 
2071       /* Versioning requires at least one misaligned data reference.  */
2072       if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2073         do_versioning = false;
2074       else if (!do_versioning)
2075         LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
2076     }
2077 
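  /* A sketch (not the exact code generated later) of the runtime test that
     versioning derives from LOOP_VINFO_PTR_MASK and
     LOOP_VINFO_MAY_MISALIGN_STMTS, assuming two recorded pointers p and q
     and a 16-byte vector size (mask = 15):

	 if ((((uintptr_t) p | (uintptr_t) q) & 15) == 0)
	   ... vectorized loop, all accesses assumed aligned ...
	 else
	   ... scalar loop ...  */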
2078   if (do_versioning)
2079     {
2080       vec<gimple *> may_misalign_stmts
2081         = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
2082       gimple *stmt;
2083 
2084       /* It can now be assumed that the data references in the statements
2085          in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
2086          of the loop being vectorized.  */
2087       FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt)
2088         {
2089           stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2090           dr = STMT_VINFO_DATA_REF (stmt_info);
2091 	  SET_DR_MISALIGNMENT (dr, 0);
2092 	  if (dump_enabled_p ())
2093             dump_printf_loc (MSG_NOTE, vect_location,
2094                              "Alignment of access forced using versioning.\n");
2095         }
2096 
2097       if (dump_enabled_p ())
2098         dump_printf_loc (MSG_NOTE, vect_location,
2099                          "Versioning for alignment will be applied.\n");
2100 
2101       /* Peeling and versioning can't be done together at this time.  */
2102       gcc_assert (! (do_peeling && do_versioning));
2103 
2104       stat = vect_verify_datarefs_alignment (loop_vinfo);
2105       gcc_assert (stat);
2106       return stat;
2107     }
2108 
2109   /* This point is reached if neither peeling nor versioning is being done.  */
2110   gcc_assert (! (do_peeling || do_versioning));
2111 
2112   stat = vect_verify_datarefs_alignment (loop_vinfo);
2113   return stat;
2114 }
2115 
2116 
2117 /* Function vect_find_same_alignment_drs.
2118 
2119    Update group and alignment relations according to the chosen
2120    vectorization factor.  */
2121 
2122 static void
2123 vect_find_same_alignment_drs (struct data_dependence_relation *ddr,
2124 			      loop_vec_info loop_vinfo)
2125 {
2126   unsigned int i;
2127   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2128   int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2129   struct data_reference *dra = DDR_A (ddr);
2130   struct data_reference *drb = DDR_B (ddr);
2131   stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
2132   stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
2133   int dra_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dra))));
2134   int drb_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (drb))));
2135   lambda_vector dist_v;
2136   unsigned int loop_depth;
2137 
2138   if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
2139     return;
2140 
2141   if (dra == drb)
2142     return;
2143 
2144   if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
2145     return;
2146 
2147   /* Loop-based vectorization and known data dependence.  */
2148   if (DDR_NUM_DIST_VECTS (ddr) == 0)
2149     return;
2150 
2151   /* Data-dependence analysis reports a distance vector of zero
2152      for data-references that overlap only in the first iteration
2153      but have steps with different signs (see PR45764).
2154      So as a sanity check require equal DR_STEP.  */
2155   if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2156     return;
2157 
2158   loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
2159   FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
2160     {
2161       int dist = dist_v[loop_depth];
2162 
2163       if (dump_enabled_p ())
2164 	dump_printf_loc (MSG_NOTE, vect_location,
2165 	                 "dependence distance  = %d.\n", dist);
2166 
2167       /* Same loop iteration.  */
2168       if (dist == 0
2169 	  || (dist % vectorization_factor == 0 && dra_size == drb_size))
2170 	{
2171 	  /* Two references with distance zero have the same alignment.  */
2172 	  STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_a).safe_push (drb);
2173 	  STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_b).safe_push (dra);
2174 	  if (dump_enabled_p ())
2175 	    {
2176 	      dump_printf_loc (MSG_NOTE, vect_location,
2177 	                       "accesses have the same alignment.\n");
2178 	      dump_printf (MSG_NOTE,
2179 	                   "dependence distance modulo vf == 0 between ");
2180 	      dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
2181 	      dump_printf (MSG_NOTE,  " and ");
2182 	      dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
2183 	      dump_printf (MSG_NOTE, "\n");
2184 	    }
2185 	}
2186     }
2187 }
2188 
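/* For example (illustration only): with a vectorization factor of 4,
   4-byte elements and a 16-byte vector type, accesses a[i] and a[i+4] have
   dependence distance 4; since 4 % 4 == 0 and the element sizes match, the
   two references are exactly one vector apart and therefore always share
   the same misalignment.  */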
2189 
2190 /* Function vect_analyze_data_refs_alignment
2191 
2192    Analyze the alignment of the data-references in the loop.
2193    Return FALSE if a data reference is found that cannot be vectorized.  */
2194 
2195 bool
2196 vect_analyze_data_refs_alignment (loop_vec_info vinfo)
2197 {
2198   if (dump_enabled_p ())
2199     dump_printf_loc (MSG_NOTE, vect_location,
2200                      "=== vect_analyze_data_refs_alignment ===\n");
2201 
2202   /* Mark groups of data references with same alignment using
2203      data dependence information.  */
2204   vec<ddr_p> ddrs = vinfo->ddrs;
2205   struct data_dependence_relation *ddr;
2206   unsigned int i;
2207 
2208   FOR_EACH_VEC_ELT (ddrs, i, ddr)
2209     vect_find_same_alignment_drs (ddr, vinfo);
2210 
2211   vec<data_reference_p> datarefs = vinfo->datarefs;
2212   struct data_reference *dr;
2213 
2214   FOR_EACH_VEC_ELT (datarefs, i, dr)
2215     {
2216       stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
2217       if (STMT_VINFO_VECTORIZABLE (stmt_info)
2218 	  && !vect_compute_data_ref_alignment (dr))
2219 	{
2220 	  /* Strided accesses perform only component accesses, misalignment
2221 	     information is irrelevant for them.  */
2222 	  if (STMT_VINFO_STRIDED_P (stmt_info)
2223 	      && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
2224 	    continue;
2225 
2226 	  if (dump_enabled_p ())
2227 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2228 			     "not vectorized: can't calculate alignment "
2229 			     "for data ref.\n");
2230 
2231 	  return false;
2232 	}
2233     }
2234 
2235   return true;
2236 }
2237 
2238 
2239 /* Analyze alignment of DRs of stmts in NODE.  */
2240 
2241 static bool
2242 vect_slp_analyze_and_verify_node_alignment (slp_tree node)
2243 {
2244   /* We vectorize from the first scalar stmt in the node unless
2245      the node is permuted, in which case we start from the first
2246      element in the group.  */
2247   gimple *first_stmt = SLP_TREE_SCALAR_STMTS (node)[0];
2248   data_reference_p first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
2249   if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
2250     first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (first_stmt));
2251 
2252   data_reference_p dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
2253   if (! vect_compute_data_ref_alignment (dr)
2254       /* For creating the data-ref pointer we need alignment of the
2255 	 first element anyway.  */
2256       || (dr != first_dr
2257 	  && ! vect_compute_data_ref_alignment (first_dr))
2258       || ! verify_data_ref_alignment (dr))
2259     {
2260       if (dump_enabled_p ())
2261 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2262 			 "not vectorized: bad data alignment in basic "
2263 			 "block.\n");
2264       return false;
2265     }
2266 
2267   return true;
2268 }
2269 
2270 /* Function vect_slp_analyze_instance_alignment
2271 
2272    Analyze the alignment of the data-references in the SLP instance.
2273    Return FALSE if a data reference is found that cannot be vectorized.  */
2274 
2275 bool
2276 vect_slp_analyze_and_verify_instance_alignment (slp_instance instance)
2277 {
2278   if (dump_enabled_p ())
2279     dump_printf_loc (MSG_NOTE, vect_location,
2280                      "=== vect_slp_analyze_and_verify_instance_alignment ===\n");
2281 
2282   slp_tree node;
2283   unsigned i;
2284   FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, node)
2285     if (! vect_slp_analyze_and_verify_node_alignment (node))
2286       return false;
2287 
2288   node = SLP_INSTANCE_TREE (instance);
2289   if (STMT_VINFO_DATA_REF (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (node)[0]))
2290       && ! vect_slp_analyze_and_verify_node_alignment
2291 	     (SLP_INSTANCE_TREE (instance)))
2292     return false;
2293 
2294   return true;
2295 }
2296 
2297 
2298 /* Analyze groups of accesses: check that DR belongs to a group of
2299    accesses of legal size, step, etc.  Detect gaps, single element
2300    interleaving, and other special cases. Set grouped access info.
2301    Collect groups of strided stores for further use in SLP analysis.
2302    Worker for vect_analyze_group_access.  */
2303 
2304 static bool
2305 vect_analyze_group_access_1 (struct data_reference *dr)
2306 {
2307   tree step = DR_STEP (dr);
2308   tree scalar_type = TREE_TYPE (DR_REF (dr));
2309   HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
2310   gimple *stmt = DR_STMT (dr);
2311   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2312   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2313   bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
2314   HOST_WIDE_INT dr_step = -1;
2315   HOST_WIDE_INT groupsize, last_accessed_element = 1;
2316   bool slp_impossible = false;
2317 
2318   /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2319      size of the interleaving group (including gaps).  */
2320   if (tree_fits_shwi_p (step))
2321     {
2322       dr_step = tree_to_shwi (step);
2323       /* Check that STEP is a multiple of type size.  Otherwise there is
2324          a non-element-sized gap at the end of the group which we
2325 	 cannot represent in GROUP_GAP or GROUP_SIZE.
2326 	 ???  As we can handle non-constant step fine here we should
2327 	 simply remove uses of GROUP_GAP between the last and first
2328 	 element and instead rely on DR_STEP.  GROUP_SIZE then would
2329 	 simply not include that gap.  */
2330       if ((dr_step % type_size) != 0)
2331 	{
2332 	  if (dump_enabled_p ())
2333 	    {
2334 	      dump_printf_loc (MSG_NOTE, vect_location,
2335 	                       "Step ");
2336 	      dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2337 	      dump_printf (MSG_NOTE,
2338 			   " is not a multiple of the element size for ");
2339 	      dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2340 	      dump_printf (MSG_NOTE, "\n");
2341 	    }
2342 	  return false;
2343 	}
2344       groupsize = absu_hwi (dr_step) / type_size;
2345     }
2346   else
2347     groupsize = 0;
2348 
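  /* For example (illustration only): a load of a[4*i] with 4-byte int
     elements has DR_STEP 16 and type_size 4, so groupsize = 16 / 4 = 4,
     i.e. a single-element interleaving group with a gap of 3 elements,
     handled below.  */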
2349   /* A non-consecutive access is possible only as part of an interleaving group.  */
2350   if (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
2351     {
2352       /* Check whether this DR is part of an interleaving group of which it
2353	 is the only element accessed in the loop.  */
2354 
2355       /* Gaps are supported only for loads. STEP must be a multiple of the type
2356 	 size.  The size of the group must be a power of 2.  */
2357       if (DR_IS_READ (dr)
2358 	  && (dr_step % type_size) == 0
2359 	  && groupsize > 0
2360 	  && pow2p_hwi (groupsize))
2361 	{
2362 	  GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = stmt;
2363 	  GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2364 	  GROUP_GAP (stmt_info) = groupsize - 1;
2365 	  if (dump_enabled_p ())
2366 	    {
2367 	      dump_printf_loc (MSG_NOTE, vect_location,
2368 	                       "Detected single element interleaving ");
2369 	      dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2370 	      dump_printf (MSG_NOTE, " step ");
2371 	      dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2372 	      dump_printf (MSG_NOTE, "\n");
2373 	    }
2374 
2375 	  return true;
2376 	}
2377 
2378       if (dump_enabled_p ())
2379         {
2380  	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2381 	                   "not consecutive access ");
2382 	  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
2383         }
2384 
2385       if (bb_vinfo)
2386         {
2387           /* Mark the statement as unvectorizable.  */
2388           STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2389           return true;
2390         }
2391 
2392       if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location, "using strided accesses\n");
2393       STMT_VINFO_STRIDED_P (stmt_info) = true;
2394       return true;
2395     }
2396 
2397   if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) == stmt)
2398     {
2399       /* First stmt in the interleaving chain. Check the chain.  */
2400       gimple *next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
2401       struct data_reference *data_ref = dr;
2402       unsigned int count = 1;
2403       tree prev_init = DR_INIT (data_ref);
2404       gimple *prev = stmt;
2405       HOST_WIDE_INT diff, gaps = 0;
2406 
2407       while (next)
2408         {
2409           /* Skip same data-refs.  In case two or more stmts share a
2410              data-ref (supported only for loads), we vectorize only the first
2411              stmt, and the rest get their vectorized loads from the first
2412              one.  */
2413           if (!tree_int_cst_compare (DR_INIT (data_ref),
2414                                      DR_INIT (STMT_VINFO_DATA_REF (
2415 						   vinfo_for_stmt (next)))))
2416             {
2417               if (DR_IS_WRITE (data_ref))
2418                 {
2419                   if (dump_enabled_p ())
2420                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2421                                      "Two store stmts share the same dr.\n");
2422                   return false;
2423                 }
2424 
2425 	      if (dump_enabled_p ())
2426 		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2427 				 "Two or more load stmts share the same dr.\n");
2428 
2429               /* For load use the same data-ref load.  */
2430               GROUP_SAME_DR_STMT (vinfo_for_stmt (next)) = prev;
2431 
2432               prev = next;
2433               next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2434               continue;
2435             }
2436 
2437           prev = next;
2438           data_ref = STMT_VINFO_DATA_REF (vinfo_for_stmt (next));
2439 
2440 	  /* All group members have the same STEP by construction.  */
2441 	  gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0));
2442 
2443           /* Check that the distance between two accesses is equal to the type
2444              size. Otherwise, we have gaps.  */
2445           diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
2446                   - TREE_INT_CST_LOW (prev_init)) / type_size;
2447 	  if (diff != 1)
2448 	    {
2449 	      /* FORNOW: SLP of accesses with gaps is not supported.  */
2450 	      slp_impossible = true;
2451 	      if (DR_IS_WRITE (data_ref))
2452 		{
2453                   if (dump_enabled_p ())
2454                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2455                                      "interleaved store with gaps\n");
2456 		  return false;
2457 		}
2458 
2459               gaps += diff - 1;
2460 	    }
2461 
2462 	  last_accessed_element += diff;
2463 
2464           /* Store the gap from the previous member of the group. If there is no
2465              gap in the access, GROUP_GAP is always 1.  */
2466           GROUP_GAP (vinfo_for_stmt (next)) = diff;
2467 
2468           prev_init = DR_INIT (data_ref);
2469           next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2470           /* Count the number of data-refs in the chain.  */
2471           count++;
2472         }
2473 
2474       if (groupsize == 0)
2475         groupsize = count + gaps;
2476 
2477       /* This could be UINT_MAX but as we are generating code in a very
2478          inefficient way we have to cap earlier.  See PR78699 for example.  */
2479       if (groupsize > 4096)
2480 	{
2481 	  if (dump_enabled_p ())
2482 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2483 			     "group is too large\n");
2484 	  return false;
2485 	}
2486 
2487       /* Check that the size of the interleaving is equal to count for stores,
2488          i.e., that there are no gaps.  */
2489       if (groupsize != count
2490 	  && !DR_IS_READ (dr))
2491         {
2492 	  if (dump_enabled_p ())
2493 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2494 			     "interleaved store with gaps\n");
2495 	  return false;
2496 	}
2497 
2498       /* If there is a gap after the last load in the group it is the
2499 	 difference between the groupsize and the last accessed
2500 	 element.
2501 	 When there is no gap, this difference should be 0.  */
2502       GROUP_GAP (vinfo_for_stmt (stmt)) = groupsize - last_accessed_element;
2503 
2504       GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2505       if (dump_enabled_p ())
2506 	{
2507 	  dump_printf_loc (MSG_NOTE, vect_location,
2508 			   "Detected interleaving ");
2509 	  if (DR_IS_READ (dr))
2510 	    dump_printf (MSG_NOTE, "load ");
2511 	  else
2512 	    dump_printf (MSG_NOTE, "store ");
2513 	  dump_printf (MSG_NOTE, "of size %u starting with ",
2514 		       (unsigned)groupsize);
2515 	  dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
2516 	  if (GROUP_GAP (vinfo_for_stmt (stmt)) != 0)
2517 	    dump_printf_loc (MSG_NOTE, vect_location,
2518 			     "There is a gap of %u elements after the group\n",
2519 			     GROUP_GAP (vinfo_for_stmt (stmt)));
2520 	}
2521 
2522       /* SLP: create an SLP data structure for every interleaving group of
2523	 stores for further analysis in vect_analyze_slp.  */
2524       if (DR_IS_WRITE (dr) && !slp_impossible)
2525         {
2526           if (loop_vinfo)
2527             LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt);
2528           if (bb_vinfo)
2529             BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt);
2530         }
2531     }
2532 
2533   return true;
2534 }
2535 
2536 /* Analyze groups of accesses: check that DR belongs to a group of
2537    accesses of legal size, step, etc.  Detect gaps, single element
2538    interleaving, and other special cases. Set grouped access info.
2539    Collect groups of strided stores for further use in SLP analysis.  */
2540 
2541 static bool
2542 vect_analyze_group_access (struct data_reference *dr)
2543 {
2544   if (!vect_analyze_group_access_1 (dr))
2545     {
2546       /* Dissolve the group if present.  */
2547       gimple *next;
2548       gimple *stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dr)));
2549       while (stmt)
2550 	{
2551 	  stmt_vec_info vinfo = vinfo_for_stmt (stmt);
2552 	  next = GROUP_NEXT_ELEMENT (vinfo);
2553 	  GROUP_FIRST_ELEMENT (vinfo) = NULL;
2554 	  GROUP_NEXT_ELEMENT (vinfo) = NULL;
2555 	  stmt = next;
2556 	}
2557       return false;
2558     }
2559   return true;
2560 }
2561 
2562 /* Analyze the access pattern of the data-reference DR.
2563    In case of non-consecutive accesses call vect_analyze_group_access() to
2564    analyze groups of accesses.  */
2565 
2566 static bool
2567 vect_analyze_data_ref_access (struct data_reference *dr)
2568 {
2569   tree step = DR_STEP (dr);
2570   tree scalar_type = TREE_TYPE (DR_REF (dr));
2571   gimple *stmt = DR_STMT (dr);
2572   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2573   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2574   struct loop *loop = NULL;
2575 
2576   if (loop_vinfo)
2577     loop = LOOP_VINFO_LOOP (loop_vinfo);
2578 
2579   if (loop_vinfo && !step)
2580     {
2581       if (dump_enabled_p ())
2582 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2583 	                 "bad data-ref access in loop\n");
2584       return false;
2585     }
2586 
2587   /* Allow loads with zero step in inner-loop vectorization.  */
2588   if (loop_vinfo && integer_zerop (step))
2589     {
2590       GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2591       if (!nested_in_vect_loop_p (loop, stmt))
2592 	return DR_IS_READ (dr);
2593       /* Allow references with zero step for outer loops marked
2594 	 with pragma omp simd only - it guarantees absence of
2595 	 loop-carried dependencies between inner loop iterations.  */
2596       if (!loop->force_vectorize || loop->safelen < 2)
2597 	{
2598 	  if (dump_enabled_p ())
2599 	    dump_printf_loc (MSG_NOTE, vect_location,
2600 			     "zero step in inner loop of nest\n");
2601 	  return false;
2602 	}
2603     }
2604 
2605   if (loop && nested_in_vect_loop_p (loop, stmt))
2606     {
2607       /* Interleaved accesses are not yet supported within outer-loop
2608         vectorization for references in the inner-loop.  */
2609       GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2610 
2611       /* For the rest of the analysis we use the outer-loop step.  */
2612       step = STMT_VINFO_DR_STEP (stmt_info);
2613       if (integer_zerop (step))
2614 	{
2615 	  if (dump_enabled_p ())
2616 	    dump_printf_loc (MSG_NOTE, vect_location,
2617 	                     "zero step in outer loop.\n");
2618 	  return DR_IS_READ (dr);
2619 	}
2620     }
2621 
2622   /* Consecutive?  */
2623   if (TREE_CODE (step) == INTEGER_CST)
2624     {
2625       HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
2626       if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))
2627 	  || (dr_step < 0
2628 	      && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
2629 	{
2630 	  /* Mark that it is not interleaving.  */
2631 	  GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2632 	  return true;
2633 	}
2634     }
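  /* For example (illustration only): a[i] with double elements has DR_STEP 8,
     which equals TYPE_SIZE_UNIT of the scalar type, so the access is
     consecutive and not part of an interleaving group; for a[-i] the step is
     -8 and the negative branch of the test above applies.  */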
2635 
2636   if (loop && nested_in_vect_loop_p (loop, stmt))
2637     {
2638       if (dump_enabled_p ())
2639 	dump_printf_loc (MSG_NOTE, vect_location,
2640 	                 "grouped access in outer loop.\n");
2641       return false;
2642     }
2643 
2644 
2645   /* Assume this is a DR handled by non-constant strided load case.  */
2646   if (TREE_CODE (step) != INTEGER_CST)
2647     return (STMT_VINFO_STRIDED_P (stmt_info)
2648 	    && (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
2649 		|| vect_analyze_group_access (dr)));
2650 
2651   /* Not consecutive access - check if it's a part of interleaving group.  */
2652   return vect_analyze_group_access (dr);
2653 }
2654 
2655 
2656 
2657 /*  A helper function used in the comparator function to sort data
2658     references.  T1 and T2 are two data references to be compared.
2659     references.  T1 and T2 are the two trees to be compared.
2660 
2661 static int
2662 compare_tree (tree t1, tree t2)
2663 {
2664   int i, cmp;
2665   enum tree_code code;
2666   char tclass;
2667 
2668   if (t1 == t2)
2669     return 0;
2670   if (t1 == NULL)
2671     return -1;
2672   if (t2 == NULL)
2673     return 1;
2674 
2675   STRIP_NOPS (t1);
2676   STRIP_NOPS (t2);
2677 
2678   if (TREE_CODE (t1) != TREE_CODE (t2))
2679     return TREE_CODE (t1) < TREE_CODE (t2) ? -1 : 1;
2680 
2681   code = TREE_CODE (t1);
2682   switch (code)
2683     {
2684     /* For const values, we can just use hash values for comparisons.  */
2685     case INTEGER_CST:
2686     case REAL_CST:
2687     case FIXED_CST:
2688     case STRING_CST:
2689     case COMPLEX_CST:
2690     case VECTOR_CST:
2691       {
2692 	hashval_t h1 = iterative_hash_expr (t1, 0);
2693 	hashval_t h2 = iterative_hash_expr (t2, 0);
2694 	if (h1 != h2)
2695 	  return h1 < h2 ? -1 : 1;
2696 	break;
2697       }
2698 
2699     case SSA_NAME:
2700       cmp = compare_tree (SSA_NAME_VAR (t1), SSA_NAME_VAR (t2));
2701       if (cmp != 0)
2702 	return cmp;
2703 
2704       if (SSA_NAME_VERSION (t1) != SSA_NAME_VERSION (t2))
2705 	return SSA_NAME_VERSION (t1) < SSA_NAME_VERSION (t2) ? -1 : 1;
2706       break;
2707 
2708     default:
2709       tclass = TREE_CODE_CLASS (code);
2710 
2711       /* For declarations, we can compare their UIDs.  */
2712       if (tclass == tcc_declaration)
2713 	{
2714 	  if (DECL_UID (t1) != DECL_UID (t2))
2715 	    return DECL_UID (t1) < DECL_UID (t2) ? -1 : 1;
2716 	  break;
2717 	}
2718 
2719       /* For expressions with operands, compare their operands recursively.  */
2720       for (i = TREE_OPERAND_LENGTH (t1) - 1; i >= 0; --i)
2721 	{
2722 	  cmp = compare_tree (TREE_OPERAND (t1, i), TREE_OPERAND (t2, i));
2723 	  if (cmp != 0)
2724 	    return cmp;
2725 	}
2726     }
2727 
2728   return 0;
2729 }
2730 
2731 
2732 /* Compare two data-references DRA and DRB to group them into chunks
2733    suitable for grouping.  */
2734 
2735 static int
2736 dr_group_sort_cmp (const void *dra_, const void *drb_)
2737 {
2738   data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_);
2739   data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_);
2740   int cmp;
2741 
2742   /* Stabilize sort.  */
2743   if (dra == drb)
2744     return 0;
2745 
2746   /* DRs in different loops never belong to the same group.  */
2747   loop_p loopa = gimple_bb (DR_STMT (dra))->loop_father;
2748   loop_p loopb = gimple_bb (DR_STMT (drb))->loop_father;
2749   if (loopa != loopb)
2750     return loopa->num < loopb->num ? -1 : 1;
2751 
2752   /* Ordering of DRs according to base.  */
2753   if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0))
2754     {
2755       cmp = compare_tree (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb));
2756       if (cmp != 0)
2757         return cmp;
2758     }
2759 
2760   /* And according to DR_OFFSET.  */
2761   if (!dr_equal_offsets_p (dra, drb))
2762     {
2763       cmp = compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
2764       if (cmp != 0)
2765         return cmp;
2766     }
2767 
2768   /* Put reads before writes.  */
2769   if (DR_IS_READ (dra) != DR_IS_READ (drb))
2770     return DR_IS_READ (dra) ? -1 : 1;
2771 
2772   /* Then sort by access size.  */
2773   if (!operand_equal_p (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2774 			TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))), 0))
2775     {
2776       cmp = compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2777                           TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
2778       if (cmp != 0)
2779         return cmp;
2780     }
2781 
2782   /* And by step.  */
2783   if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2784     {
2785       cmp = compare_tree (DR_STEP (dra), DR_STEP (drb));
2786       if (cmp != 0)
2787         return cmp;
2788     }
2789 
2790   /* Then sort by DR_INIT.  In case of identical DRs sort by stmt UID.  */
2791   cmp = tree_int_cst_compare (DR_INIT (dra), DR_INIT (drb));
2792   if (cmp == 0)
2793     return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
2794   return cmp;
2795 }
2796 
2797 /* Function vect_analyze_data_ref_accesses.
2798 
2799    Analyze the access pattern of all the data references in the loop.
2800 
2801    FORNOW: the only access pattern that is considered vectorizable is a
2802 	   simple step 1 (consecutive) access.
2803 
2804    FORNOW: handle only arrays and pointer accesses.  */
2805 
2806 bool
2807 vect_analyze_data_ref_accesses (vec_info *vinfo)
2808 {
2809   unsigned int i;
2810   vec<data_reference_p> datarefs = vinfo->datarefs;
2811   struct data_reference *dr;
2812 
2813   if (dump_enabled_p ())
2814     dump_printf_loc (MSG_NOTE, vect_location,
2815                      "=== vect_analyze_data_ref_accesses ===\n");
2816 
2817   if (datarefs.is_empty ())
2818     return true;
2819 
2820   /* Sort the array of datarefs to make building the interleaving chains
2821      linear.  Don't modify the original vector's order, it is needed for
2822      determining what dependencies are reversed.  */
2823   vec<data_reference_p> datarefs_copy = datarefs.copy ();
2824   datarefs_copy.qsort (dr_group_sort_cmp);
2825 
2826   /* Build the interleaving chains.  */
2827   for (i = 0; i < datarefs_copy.length () - 1;)
2828     {
2829       data_reference_p dra = datarefs_copy[i];
2830       stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
2831       stmt_vec_info lastinfo = NULL;
2832       if (! STMT_VINFO_VECTORIZABLE (stmtinfo_a))
2833 	{
2834 	  ++i;
2835 	  continue;
2836 	}
2837       for (i = i + 1; i < datarefs_copy.length (); ++i)
2838 	{
2839 	  data_reference_p drb = datarefs_copy[i];
2840 	  stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
2841 	  if (! STMT_VINFO_VECTORIZABLE (stmtinfo_b))
2842 	    break;
2843 
2844 	  /* ???  Imperfect sorting (non-compatible types, non-modulo
2845	     accesses, same accesses) can lead to a group being artificially
2846 	     split here as we don't just skip over those.  If it really
2847 	     matters we can push those to a worklist and re-iterate
2848	     over them.  Then we can just skip ahead to the next DR here.  */
2849 
2850 	  /* DRs in a different loop should not be put into the same
2851 	     interleaving group.  */
2852 	  if (gimple_bb (DR_STMT (dra))->loop_father
2853 	      != gimple_bb (DR_STMT (drb))->loop_father)
2854 	    break;
2855 
2856	  /* Check that the data-refs have the same first location (except init)
2857	     and that they are both either stores or loads (not a load and a store,
2858	     and not masked loads or stores).  */
2859 	  if (DR_IS_READ (dra) != DR_IS_READ (drb)
2860 	      || !operand_equal_p (DR_BASE_ADDRESS (dra),
2861 				   DR_BASE_ADDRESS (drb), 0)
2862 	      || !dr_equal_offsets_p (dra, drb)
2863 	      || !gimple_assign_single_p (DR_STMT (dra))
2864 	      || !gimple_assign_single_p (DR_STMT (drb)))
2865 	    break;
2866 
2867 	  /* Check that the data-refs have the same constant size.  */
2868 	  tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
2869 	  tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
2870 	  if (!tree_fits_uhwi_p (sza)
2871 	      || !tree_fits_uhwi_p (szb)
2872 	      || !tree_int_cst_equal (sza, szb))
2873 	    break;
2874 
2875 	  /* Check that the data-refs have the same step.  */
2876 	  if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2877 	    break;
2878 
2879 	  /* Do not place the same access in the interleaving chain twice.  */
2880 	  if (tree_int_cst_compare (DR_INIT (dra), DR_INIT (drb)) == 0)
2881 	    break;
2882 
2883 	  /* Check the types are compatible.
2884 	     ???  We don't distinguish this during sorting.  */
2885 	  if (!types_compatible_p (TREE_TYPE (DR_REF (dra)),
2886 				   TREE_TYPE (DR_REF (drb))))
2887 	    break;
2888 
2889 	  /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb).  */
2890 	  HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
2891 	  HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
2892 	  gcc_assert (init_a <= init_b);
2893 
2894 	  /* If init_b == init_a + the size of the type * k, we have an
2895 	     interleaving, and DRA is accessed before DRB.  */
2896 	  HOST_WIDE_INT type_size_a = tree_to_uhwi (sza);
2897 	  if (type_size_a == 0
2898 	      || (init_b - init_a) % type_size_a != 0)
2899 	    break;
2900 
2901	  /* For a store, require that the accesses are adjacent.  This splits
2902 	     groups into chunks we support (we don't support vectorization
2903 	     of stores with gaps).  */
2904 	  if (!DR_IS_READ (dra)
2905 	      && (init_b - (HOST_WIDE_INT) TREE_INT_CST_LOW
2906 					     (DR_INIT (datarefs_copy[i-1]))
2907 		  != type_size_a))
2908 	    break;
2909 
2910	  /* If the step (when constant and nonzero) is not greater than the
2911	     difference between the data-refs' inits, split the group here;
2912	     this keeps groups at suitable sizes.  */
2913 	  if (tree_fits_shwi_p (DR_STEP (dra)))
2914 	    {
2915 	      HOST_WIDE_INT step = tree_to_shwi (DR_STEP (dra));
2916 	      if (step != 0 && step <= (init_b - init_a))
2917 		break;
2918 	    }
2919 
2920 	  if (dump_enabled_p ())
2921 	    {
2922 	      dump_printf_loc (MSG_NOTE, vect_location,
2923 			       "Detected interleaving ");
2924 	      if (DR_IS_READ (dra))
2925 		dump_printf (MSG_NOTE, "load ");
2926 	      else
2927 		dump_printf (MSG_NOTE, "store ");
2928 	      dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
2929 	      dump_printf (MSG_NOTE,  " and ");
2930 	      dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
2931 	      dump_printf (MSG_NOTE, "\n");
2932 	    }
2933 
2934 	  /* Link the found element into the group list.  */
2935 	  if (!GROUP_FIRST_ELEMENT (stmtinfo_a))
2936 	    {
2937 	      GROUP_FIRST_ELEMENT (stmtinfo_a) = DR_STMT (dra);
2938 	      lastinfo = stmtinfo_a;
2939 	    }
2940 	  GROUP_FIRST_ELEMENT (stmtinfo_b) = DR_STMT (dra);
2941 	  GROUP_NEXT_ELEMENT (lastinfo) = DR_STMT (drb);
2942 	  lastinfo = stmtinfo_b;
2943 	}
2944     }
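  /* For example (illustration only): 4-byte int loads a[2*i] and a[2*i+1]
     have the same base address, the same 8-byte step and inits 0 and 4; the
     init difference 4 is a multiple of the element size and smaller than the
     step, so the two DRs are linked into a single interleaving chain headed
     by the stmt for a[2*i].  */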
2945 
2946   FOR_EACH_VEC_ELT (datarefs_copy, i, dr)
2947     if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr)))
2948         && !vect_analyze_data_ref_access (dr))
2949       {
2950 	if (dump_enabled_p ())
2951 	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2952 	                   "not vectorized: complicated access pattern.\n");
2953 
2954         if (is_a <bb_vec_info> (vinfo))
2955           {
2956             /* Mark the statement as not vectorizable.  */
2957             STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2958             continue;
2959           }
2960         else
2961 	  {
2962 	    datarefs_copy.release ();
2963 	    return false;
2964 	  }
2965       }
2966 
2967   datarefs_copy.release ();
2968   return true;
2969 }
2970 
2971 
2972 /* Operator == between two dr_with_seg_len objects.
2973 
2974    This equality operator is used to make sure two data refs
2975    are the same, so that we will consider combining the
2976    aliasing checks of those two pairs of data-dependent data
2977    refs.  */
2978 
2979 static bool
2980 operator == (const dr_with_seg_len& d1,
2981 	     const dr_with_seg_len& d2)
2982 {
2983   return operand_equal_p (DR_BASE_ADDRESS (d1.dr),
2984 			  DR_BASE_ADDRESS (d2.dr), 0)
2985 	   && compare_tree (DR_OFFSET (d1.dr), DR_OFFSET (d2.dr)) == 0
2986 	   && compare_tree (DR_INIT (d1.dr), DR_INIT (d2.dr)) == 0
2987 	   && compare_tree (d1.seg_len, d2.seg_len) == 0;
2988 }
2989 
2990 /* Function comp_dr_with_seg_len_pair.
2991 
2992    Comparison function for sorting objects of dr_with_seg_len_pair_t
2993    so that we can combine aliasing checks in one scan.  */
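/* The sort keys used below are, in order, the base addresses of both pair
   members, then their steps, and finally their offsets and inits, so that
   pairs sharing a base address and step end up adjacent after sorting.  */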
2994 
2995 static int
2996 comp_dr_with_seg_len_pair (const void *pa_, const void *pb_)
2997 {
2998   const dr_with_seg_len_pair_t* pa = (const dr_with_seg_len_pair_t *) pa_;
2999   const dr_with_seg_len_pair_t* pb = (const dr_with_seg_len_pair_t *) pb_;
3000   const dr_with_seg_len &a1 = pa->first, &a2 = pa->second;
3001   const dr_with_seg_len &b1 = pb->first, &b2 = pb->second;
3002 
3003   /* For DR pairs (a, b) and (c, d), we only consider merging the alias checks
3004      if a and c have the same base address and step, and b and d have the same
3005      base address and step.  Therefore, if either a&c or b&d don't have the
3006      same address and step, we don't care about the order after sorting.  */
3007   int comp_res;
3008 
3009   if ((comp_res = compare_tree (DR_BASE_ADDRESS (a1.dr),
3010 				DR_BASE_ADDRESS (b1.dr))) != 0)
3011     return comp_res;
3012   if ((comp_res = compare_tree (DR_BASE_ADDRESS (a2.dr),
3013 				DR_BASE_ADDRESS (b2.dr))) != 0)
3014     return comp_res;
3015   if ((comp_res = compare_tree (DR_STEP (a1.dr), DR_STEP (b1.dr))) != 0)
3016     return comp_res;
3017   if ((comp_res = compare_tree (DR_STEP (a2.dr), DR_STEP (b2.dr))) != 0)
3018     return comp_res;
3019   if ((comp_res = compare_tree (DR_OFFSET (a1.dr), DR_OFFSET (b1.dr))) != 0)
3020     return comp_res;
3021   if ((comp_res = compare_tree (DR_INIT (a1.dr), DR_INIT (b1.dr))) != 0)
3022     return comp_res;
3023   if ((comp_res = compare_tree (DR_OFFSET (a2.dr), DR_OFFSET (b2.dr))) != 0)
3024     return comp_res;
3025   if ((comp_res = compare_tree (DR_INIT (a2.dr), DR_INIT (b2.dr))) != 0)
3026     return comp_res;
3027 
3028   return 0;
3029 }
3030 
3031 /* Function vect_vfa_segment_size.
3032 
3033    Create an expression that computes the size of the segment
3034    that will be accessed for a data reference.  The function takes into
3035    account that realignment loads may access one more vector.
3036 
3037    Input:
3038      DR: The data reference.
3039      LENGTH_FACTOR: segment length to consider.
3040 
3041    Return an expression whose value is the size of the segment that will be
3042    accessed by DR.  */
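/* Illustrative example (not from any particular testcase): for a data
   reference with a constant DR_STEP of 4 bytes and LENGTH_FACTOR equal to
   a vectorization factor of 8, the segment length is 32 bytes, and a
   dr_explicit_realign_optimized access adds one extra vector size.  */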
3043 
3044 static tree
3045 vect_vfa_segment_size (struct data_reference *dr, tree length_factor)
3046 {
3047   tree segment_length;
3048 
3049   if (integer_zerop (DR_STEP (dr)))
3050     segment_length = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
3051   else
3052     segment_length = size_binop (MULT_EXPR,
3053 				 fold_convert (sizetype, DR_STEP (dr)),
3054 				 fold_convert (sizetype, length_factor));
3055 
3056   if (vect_supportable_dr_alignment (dr, false)
3057 	== dr_explicit_realign_optimized)
3058     {
3059       tree vector_size = TYPE_SIZE_UNIT
3060 			  (STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr))));
3061 
3062       segment_length = size_binop (PLUS_EXPR, segment_length, vector_size);
3063     }
3064   return segment_length;
3065 }
3066 
3067 /* Function vect_no_alias_p.
3068 
3069    Given data references A and B with equal base and offset, whose alias
3070    relation can therefore be decided at compilation time, return TRUE if
3071    they do not alias each other; return FALSE otherwise.  SEGMENT_LENGTH_A
3072    and SEGMENT_LENGTH_B are the memory lengths accessed by A and B
3073    respectively.  */
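/* For example (illustrative numbers, assuming non-negative DR_STEPs):
   if DR_INIT (a) is 0 with SEGMENT_LENGTH_A 16 and DR_INIT (b) is 32 with
   SEGMENT_LENGTH_B 16, the segments are [0, 16) and [32, 48); since
   16 <= 32 the accesses cannot overlap and we return true.  */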
3074 
3075 static bool
3076 vect_no_alias_p (struct data_reference *a, struct data_reference *b,
3077                  tree segment_length_a, tree segment_length_b)
3078 {
3079   gcc_assert (TREE_CODE (DR_INIT (a)) == INTEGER_CST
3080 	      && TREE_CODE (DR_INIT (b)) == INTEGER_CST);
3081   if (tree_int_cst_equal (DR_INIT (a), DR_INIT (b)))
3082     return false;
3083 
3084   tree seg_a_min = DR_INIT (a);
3085   tree seg_a_max = fold_build2 (PLUS_EXPR, TREE_TYPE (seg_a_min),
3086 				seg_a_min, segment_length_a);
3087   /* For negative step, we need to adjust address range by TYPE_SIZE_UNIT
3088      bytes, e.g., int a[3] -> a[1] range is [a+4, a+16) instead of
3089      [a, a+12) */
3090   if (tree_int_cst_compare (DR_STEP (a), size_zero_node) < 0)
3091     {
3092       tree unit_size = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (a)));
3093       seg_a_min = fold_build2 (PLUS_EXPR, TREE_TYPE (seg_a_max),
3094 			       seg_a_max, unit_size);
3095       seg_a_max = fold_build2 (PLUS_EXPR, TREE_TYPE (DR_INIT (a)),
3096 			       DR_INIT (a), unit_size);
3097     }
3098   tree seg_b_min = DR_INIT (b);
3099   tree seg_b_max = fold_build2 (PLUS_EXPR, TREE_TYPE (seg_b_min),
3100 				seg_b_min, segment_length_b);
3101   if (tree_int_cst_compare (DR_STEP (b), size_zero_node) < 0)
3102     {
3103       tree unit_size = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (b)));
3104       seg_b_min = fold_build2 (PLUS_EXPR, TREE_TYPE (seg_b_max),
3105 			       seg_b_max, unit_size);
3106       seg_b_max = fold_build2 (PLUS_EXPR, TREE_TYPE (DR_INIT (b)),
3107 			       DR_INIT (b), unit_size);
3108     }
3109 
3110   if (tree_int_cst_le (seg_a_max, seg_b_min)
3111       || tree_int_cst_le (seg_b_max, seg_a_min))
3112     return true;
3113 
3114   return false;
3115 }
3116 
3117 /* Function vect_prune_runtime_alias_test_list.
3118 
3119    Prune a list of ddrs to be tested at run-time by versioning for alias.
3120    Merge several alias checks into one if possible.
3121    Return FALSE if the resulting list of ddrs is longer than allowed by
3122    PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE.  */
3123 
3124 bool
3125 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
3126 {
3127   vec<ddr_p> may_alias_ddrs =
3128     LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
3129   vec<dr_with_seg_len_pair_t>& comp_alias_ddrs =
3130     LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
3131   int vect_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3132   tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
3133 
3134   ddr_p ddr;
3135   unsigned int i;
3136   tree length_factor;
3137 
3138   if (dump_enabled_p ())
3139     dump_printf_loc (MSG_NOTE, vect_location,
3140                      "=== vect_prune_runtime_alias_test_list ===\n");
3141 
3142   if (may_alias_ddrs.is_empty ())
3143     return true;
3144 
3145   /* Basically, for each pair of dependent data refs store_ptr_0
3146      and load_ptr_0, we create an expression:
3147 
3148      ((store_ptr_0 + store_segment_length_0) <= load_ptr_0)
3149      || ((load_ptr_0 + load_segment_length_0) <= store_ptr_0)
3150 
3151      for aliasing checks.  However, in some cases we can decrease
3152      the number of checks by combining two checks into one.  For
3153      example, suppose we have another pair of data refs store_ptr_0
3154      and load_ptr_1, and the following condition is satisfied:
3155 
3156      load_ptr_0 < load_ptr_1  &&
3157      load_ptr_1 - load_ptr_0 - load_segment_length_0 < store_segment_length_0
3158 
3159      (this condition means that, in each iteration of the vectorized loop,
3160      the memory accessed through store_ptr_0 cannot lie between the memory
3161      of load_ptr_0 and load_ptr_1.)
3162 
3163      then we can use only the following expression to finish the
3164      aliasing checks between store_ptr_0 & load_ptr_0 and
3165      store_ptr_0 & load_ptr_1:
3166 
3167      ((store_ptr_0 + store_segment_length_0) <= load_ptr_0)
3168      || ((load_ptr_1 + load_segment_length_1) <= store_ptr_0)
3169 
3170      Note that we only consider the case in which load_ptr_0 and load_ptr_1
3171      have the same base address.  */
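  /* As an illustration (not taken from any particular testcase), for

	for (i = 0; i < n; i++)
	  a[i] = b[i] + b[i + 1];

     the store to a[i] would need runtime checks against both loads of b.
     Because the two load ranges differ by only one element, the condition
     above holds, and the two checks collapse into a single check of the
     store segment against the combined range of the two loads.  */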
3172 
3173   comp_alias_ddrs.create (may_alias_ddrs.length ());
3174 
3175   /* First, we collect all data ref pairs for aliasing checks.  */
3176   FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr)
3177     {
3178       int comp_res;
3179       struct data_reference *dr_a, *dr_b;
3180       gimple *dr_group_first_a, *dr_group_first_b;
3181       tree segment_length_a, segment_length_b;
3182       gimple *stmt_a, *stmt_b;
3183 
3184       dr_a = DDR_A (ddr);
3185       stmt_a = DR_STMT (DDR_A (ddr));
3186       dr_group_first_a = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_a));
3187       if (dr_group_first_a)
3188 	{
3189 	  stmt_a = dr_group_first_a;
3190 	  dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_a));
3191 	}
3192 
3193       dr_b = DDR_B (ddr);
3194       stmt_b = DR_STMT (DDR_B (ddr));
3195       dr_group_first_b = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_b));
3196       if (dr_group_first_b)
3197 	{
3198 	  stmt_b = dr_group_first_b;
3199 	  dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_b));
3200 	}
3201 
3202       if (!operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0))
3203 	length_factor = scalar_loop_iters;
3204       else
3205 	length_factor = size_int (vect_factor);
3206       segment_length_a = vect_vfa_segment_size (dr_a, length_factor);
3207       segment_length_b = vect_vfa_segment_size (dr_b, length_factor);
3208 
3209       comp_res = compare_tree (DR_BASE_ADDRESS (dr_a), DR_BASE_ADDRESS (dr_b));
3210       if (comp_res == 0)
3211 	comp_res = compare_tree (DR_OFFSET (dr_a), DR_OFFSET (dr_b));
3212 
3213       /* Alias is known at compilation time.  */
3214       if (comp_res == 0
3215 	  && TREE_CODE (DR_STEP (dr_a)) == INTEGER_CST
3216 	  && TREE_CODE (DR_STEP (dr_b)) == INTEGER_CST
3217 	  && TREE_CODE (segment_length_a) == INTEGER_CST
3218 	  && TREE_CODE (segment_length_b) == INTEGER_CST)
3219 	{
3220 	  if (vect_no_alias_p (dr_a, dr_b, segment_length_a, segment_length_b))
3221 	    continue;
3222 
3223 	  if (dump_enabled_p ())
3224 	    dump_printf_loc (MSG_NOTE, vect_location,
3225 			     "not vectorized: compilation time alias.\n");
3226 
3227 	  return false;
3228 	}
3229 
3230       dr_with_seg_len_pair_t dr_with_seg_len_pair
3231 	  (dr_with_seg_len (dr_a, segment_length_a),
3232 	   dr_with_seg_len (dr_b, segment_length_b));
3233 
3234       /* Canonicalize pairs by sorting the two DR members.  */
3235       if (comp_res > 0)
3236 	std::swap (dr_with_seg_len_pair.first, dr_with_seg_len_pair.second);
3237 
3238       comp_alias_ddrs.safe_push (dr_with_seg_len_pair);
3239     }
3240 
3241   /* Second, we sort the collected data ref pairs so that we can scan
3242      them once to combine all possible aliasing checks.  */
3243   comp_alias_ddrs.qsort (comp_dr_with_seg_len_pair);
3244 
3245   /* Third, we scan the sorted dr pairs and check if we can combine
3246      alias checks of two neighboring dr pairs.  */
3247   for (size_t i = 1; i < comp_alias_ddrs.length (); ++i)
3248     {
3249       /* Deal with two ddrs (dr_a1, dr_b1) and (dr_a2, dr_b2).  */
3250       dr_with_seg_len *dr_a1 = &comp_alias_ddrs[i-1].first,
3251 		      *dr_b1 = &comp_alias_ddrs[i-1].second,
3252 		      *dr_a2 = &comp_alias_ddrs[i].first,
3253 		      *dr_b2 = &comp_alias_ddrs[i].second;
3254 
3255       /* Remove duplicate data ref pairs.  */
3256       if (*dr_a1 == *dr_a2 && *dr_b1 == *dr_b2)
3257 	{
3258 	  if (dump_enabled_p ())
3259 	    {
3260 	      dump_printf_loc (MSG_NOTE, vect_location,
3261 			       "found equal ranges ");
3262 	      dump_generic_expr (MSG_NOTE, TDF_SLIM,
3263 				 DR_REF (dr_a1->dr));
3264 	      dump_printf (MSG_NOTE,  ", ");
3265 	      dump_generic_expr (MSG_NOTE, TDF_SLIM,
3266 				 DR_REF (dr_b1->dr));
3267 	      dump_printf (MSG_NOTE,  " and ");
3268 	      dump_generic_expr (MSG_NOTE, TDF_SLIM,
3269 				 DR_REF (dr_a2->dr));
3270 	      dump_printf (MSG_NOTE,  ", ");
3271 	      dump_generic_expr (MSG_NOTE, TDF_SLIM,
3272 				 DR_REF (dr_b2->dr));
3273 	      dump_printf (MSG_NOTE, "\n");
3274 	    }
3275 
3276 	  comp_alias_ddrs.ordered_remove (i--);
3277 	  continue;
3278 	}
3279 
3280       if (*dr_a1 == *dr_a2 || *dr_b1 == *dr_b2)
3281 	{
3282 	  /* We consider the case in which DR_B1 and DR_B2 are the same memref,
3283 	     and DR_A1 and DR_A2 are two consecutive memrefs.  */
3284 	  if (*dr_a1 == *dr_a2)
3285 	    {
3286 	      std::swap (dr_a1, dr_b1);
3287 	      std::swap (dr_a2, dr_b2);
3288 	    }
3289 
3290 	  if (!operand_equal_p (DR_BASE_ADDRESS (dr_a1->dr),
3291 				DR_BASE_ADDRESS (dr_a2->dr), 0)
3292 	      || !operand_equal_p (DR_OFFSET (dr_a1->dr),
3293 				   DR_OFFSET (dr_a2->dr), 0)
3294 	      || !tree_fits_shwi_p (DR_INIT (dr_a1->dr))
3295 	      || !tree_fits_shwi_p (DR_INIT (dr_a2->dr)))
3296 	    continue;
3297 
3298 	  /* Make sure dr_a1 starts left of dr_a2.  */
3299 	  if (tree_int_cst_lt (DR_INIT (dr_a2->dr), DR_INIT (dr_a1->dr)))
3300 	    std::swap (*dr_a1, *dr_a2);
3301 
3302 	  bool do_remove = false;
3303 	  unsigned HOST_WIDE_INT diff
3304 	    = (tree_to_shwi (DR_INIT (dr_a2->dr))
3305                - tree_to_shwi (DR_INIT (dr_a1->dr)));
3306 
3307 	  /* If the left segment does not extend beyond the start of the
3308 	     right segment, the new segment length is that of the right
3309 	     plus the segment distance.  */
3310 	  if (tree_fits_uhwi_p (dr_a1->seg_len)
3311 	      && compare_tree_int (dr_a1->seg_len, diff) <= 0)
3312 	    {
3313 	      dr_a1->seg_len = size_binop (PLUS_EXPR, dr_a2->seg_len,
3314 					   size_int (diff));
3315 	      do_remove = true;
3316 	    }
3317 	  /* Generally the new segment length is the maximum of the
3318 	     left segment size and the right segment size plus the distance.
3319 	     ???  We can also build tree MAX_EXPR here but it's not clear this
3320 	     is profitable.  */
3321 	  else if (tree_fits_uhwi_p (dr_a1->seg_len)
3322 		   && tree_fits_uhwi_p (dr_a2->seg_len))
3323 	    {
3324 	      unsigned HOST_WIDE_INT seg_len_a1 = tree_to_uhwi (dr_a1->seg_len);
3325 	      unsigned HOST_WIDE_INT seg_len_a2 = tree_to_uhwi (dr_a2->seg_len);
3326 	      dr_a1->seg_len = size_int (MAX (seg_len_a1, diff + seg_len_a2));
3327 	      do_remove = true;
3328 	    }
3329 	  /* Now we check if the following condition is satisfied:
3330 
3331 	     DIFF - SEGMENT_LENGTH_A < SEGMENT_LENGTH_B
3332 
3333 	     where DIFF = DR_A2_INIT - DR_A1_INIT.  However,
3334 	     SEGMENT_LENGTH_A or SEGMENT_LENGTH_B may not be constant, so we
3335 	     have to make a best estimate.  If we take the minimum value of
3336 	     SEGMENT_LENGTH_B as a constant, represented by MIN_SEG_LEN_B,
3337 	     then either of the following two conditions guarantees the
3338 	     one above:
3339 
3340 	     1: DIFF <= MIN_SEG_LEN_B
3341 	     2: DIFF - SEGMENT_LENGTH_A < MIN_SEG_LEN_B  */
3342 	  else
3343 	    {
3344 	      unsigned HOST_WIDE_INT min_seg_len_b
3345 		= (tree_fits_uhwi_p (dr_b1->seg_len)
3346 		   ? tree_to_uhwi (dr_b1->seg_len)
3347 		   : vect_factor);
3348 
3349 	      if (diff <= min_seg_len_b
3350 		  || (tree_fits_uhwi_p (dr_a1->seg_len)
3351 		      && diff - tree_to_uhwi (dr_a1->seg_len) < min_seg_len_b))
3352 		{
3353 		  dr_a1->seg_len = size_binop (PLUS_EXPR,
3354 					       dr_a2->seg_len, size_int (diff));
3355 		  do_remove = true;
3356 		}
3357 	    }
3358 
3359 	  if (do_remove)
3360 	    {
3361 	      if (dump_enabled_p ())
3362 		{
3363 		  dump_printf_loc (MSG_NOTE, vect_location,
3364 				   "merging ranges for ");
3365 		  dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a1->dr));
3366 		  dump_printf (MSG_NOTE,  ", ");
3367 		  dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b1->dr));
3368 		  dump_printf (MSG_NOTE,  " and ");
3369 		  dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a2->dr));
3370 		  dump_printf (MSG_NOTE,  ", ");
3371 		  dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b2->dr));
3372 		  dump_printf (MSG_NOTE, "\n");
3373 		}
3374 	      comp_alias_ddrs.ordered_remove (i--);
3375 	    }
3376 	}
3377     }
3378 
3379   dump_printf_loc (MSG_NOTE, vect_location,
3380 		   "improved number of alias checks from %d to %d\n",
3381 		   may_alias_ddrs.length (), comp_alias_ddrs.length ());
3382   if ((int) comp_alias_ddrs.length () >
3383       PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS))
3384     {
3385       if (dump_enabled_p ())
3386 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3387 			 "number of versioning for alias "
3388 			 "run-time tests exceeds %d "
3389 			 "(--param vect-max-version-for-alias-checks)\n",
3390 			 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
3391       return false;
3392     }
3393 
3394   /* All alias checks have been resolved at compilation time.  */
3395   if (!comp_alias_ddrs.length ())
3396     LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).truncate (0);
3397 
3398   return true;
3399 }
3400 
3401 /* Return true if a non-affine read or write in STMT is suitable for a
3402    gather load or scatter store.  Describe the operation in *INFO if so.  */
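/* A typical source pattern (purely illustrative) is

	for (i = 0; i < n; i++)
	  sum += data[idx[i]];

   where the base &data[0] is loop invariant, the offset comes from the
   non-affine value idx[i], and the scale is derived from the element size,
   provided the target advertises a matching gather builtin.  */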
3403 
3404 bool
3405 vect_check_gather_scatter (gimple *stmt, loop_vec_info loop_vinfo,
3406 			   gather_scatter_info *info)
3407 {
3408   HOST_WIDE_INT scale = 1, pbitpos, pbitsize;
3409   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3410   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3411   struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3412   tree offtype = NULL_TREE;
3413   tree decl, base, off;
3414   machine_mode pmode;
3415   int punsignedp, reversep, pvolatilep = 0;
3416 
3417   base = DR_REF (dr);
3418   /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
3419      see if we can use the def stmt of the address.  */
3420   if (is_gimple_call (stmt)
3421       && gimple_call_internal_p (stmt)
3422       && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
3423 	  || gimple_call_internal_fn (stmt) == IFN_MASK_STORE)
3424       && TREE_CODE (base) == MEM_REF
3425       && TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME
3426       && integer_zerop (TREE_OPERAND (base, 1))
3427       && !expr_invariant_in_loop_p (loop, TREE_OPERAND (base, 0)))
3428     {
3429       gimple *def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (base, 0));
3430       if (is_gimple_assign (def_stmt)
3431 	  && gimple_assign_rhs_code (def_stmt) == ADDR_EXPR)
3432 	base = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
3433     }
3434 
3435   /* The gather and scatter builtins need an address of the form
3436      loop_invariant + vector * {1, 2, 4, 8}
3437      or
3438      loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
3439      Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
3440      of loop invariants/SSA_NAMEs defined in the loop, with casts,
3441      multiplications and additions in it.  To get a vector, we need
3442      a single SSA_NAME that will be defined in the loop and will
3443      contain everything that is not loop invariant and that can be
3444      vectorized.  The following code attempts to find such a preexisting
3445      SSA_NAME OFF and put the loop invariants into a tree BASE
3446      that can be gimplified before the loop.  */
3447   base = get_inner_reference (base, &pbitsize, &pbitpos, &off, &pmode,
3448 			      &punsignedp, &reversep, &pvolatilep);
3449   gcc_assert (base && (pbitpos % BITS_PER_UNIT) == 0 && !reversep);
3450 
3451   if (TREE_CODE (base) == MEM_REF)
3452     {
3453       if (!integer_zerop (TREE_OPERAND (base, 1)))
3454 	{
3455 	  if (off == NULL_TREE)
3456 	    {
3457 	      offset_int moff = mem_ref_offset (base);
3458 	      off = wide_int_to_tree (sizetype, moff);
3459 	    }
3460 	  else
3461 	    off = size_binop (PLUS_EXPR, off,
3462 			      fold_convert (sizetype, TREE_OPERAND (base, 1)));
3463 	}
3464       base = TREE_OPERAND (base, 0);
3465     }
3466   else
3467     base = build_fold_addr_expr (base);
3468 
3469   if (off == NULL_TREE)
3470     off = size_zero_node;
3471 
3472   /* If base is not loop invariant, then either off is 0 and we start with
3473      just the constant offset in the loop invariant BASE and continue with
3474      base as OFF, or we give up.
3475      We could handle the latter case by gimplifying the addition of base + off
3476      into some SSA_NAME and using that as OFF, but for now punt.  */
3477   if (!expr_invariant_in_loop_p (loop, base))
3478     {
3479       if (!integer_zerop (off))
3480 	return false;
3481       off = base;
3482       base = size_int (pbitpos / BITS_PER_UNIT);
3483     }
3484   /* Otherwise put base + constant offset into the loop invariant BASE
3485      and continue with OFF.  */
3486   else
3487     {
3488       base = fold_convert (sizetype, base);
3489       base = size_binop (PLUS_EXPR, base, size_int (pbitpos / BITS_PER_UNIT));
3490     }
3491 
3492   /* OFF at this point may be either an SSA_NAME or some tree expression
3493      from get_inner_reference.  Try to peel off loop invariants from it
3494      into BASE as long as possible.  */
3495   STRIP_NOPS (off);
3496   while (offtype == NULL_TREE)
3497     {
3498       enum tree_code code;
3499       tree op0, op1, add = NULL_TREE;
3500 
3501       if (TREE_CODE (off) == SSA_NAME)
3502 	{
3503 	  gimple *def_stmt = SSA_NAME_DEF_STMT (off);
3504 
3505 	  if (expr_invariant_in_loop_p (loop, off))
3506 	    return false;
3507 
3508 	  if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
3509 	    break;
3510 
3511 	  op0 = gimple_assign_rhs1 (def_stmt);
3512 	  code = gimple_assign_rhs_code (def_stmt);
3513 	  op1 = gimple_assign_rhs2 (def_stmt);
3514 	}
3515       else
3516 	{
3517 	  if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
3518 	    return false;
3519 	  code = TREE_CODE (off);
3520 	  extract_ops_from_tree (off, &code, &op0, &op1);
3521 	}
3522       switch (code)
3523 	{
3524 	case POINTER_PLUS_EXPR:
3525 	case PLUS_EXPR:
3526 	  if (expr_invariant_in_loop_p (loop, op0))
3527 	    {
3528 	      add = op0;
3529 	      off = op1;
3530 	    do_add:
3531 	      add = fold_convert (sizetype, add);
3532 	      if (scale != 1)
3533 		add = size_binop (MULT_EXPR, add, size_int (scale));
3534 	      base = size_binop (PLUS_EXPR, base, add);
3535 	      continue;
3536 	    }
3537 	  if (expr_invariant_in_loop_p (loop, op1))
3538 	    {
3539 	      add = op1;
3540 	      off = op0;
3541 	      goto do_add;
3542 	    }
3543 	  break;
3544 	case MINUS_EXPR:
3545 	  if (expr_invariant_in_loop_p (loop, op1))
3546 	    {
3547 	      add = fold_convert (sizetype, op1);
3548 	      add = size_binop (MINUS_EXPR, size_zero_node, add);
3549 	      off = op0;
3550 	      goto do_add;
3551 	    }
3552 	  break;
3553 	case MULT_EXPR:
3554 	  if (scale == 1 && tree_fits_shwi_p (op1))
3555 	    {
3556 	      scale = tree_to_shwi (op1);
3557 	      off = op0;
3558 	      continue;
3559 	    }
3560 	  break;
3561 	case SSA_NAME:
3562 	  off = op0;
3563 	  continue;
3564 	CASE_CONVERT:
3565 	  if (!POINTER_TYPE_P (TREE_TYPE (op0))
3566 	      && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
3567 	    break;
3568 	  if (TYPE_PRECISION (TREE_TYPE (op0))
3569 	      == TYPE_PRECISION (TREE_TYPE (off)))
3570 	    {
3571 	      off = op0;
3572 	      continue;
3573 	    }
3574 	  if (TYPE_PRECISION (TREE_TYPE (op0))
3575 	      < TYPE_PRECISION (TREE_TYPE (off)))
3576 	    {
3577 	      off = op0;
3578 	      offtype = TREE_TYPE (off);
3579 	      STRIP_NOPS (off);
3580 	      continue;
3581 	    }
3582 	  break;
3583 	default:
3584 	  break;
3585 	}
3586       break;
3587     }
3588 
3589   /* If at the end OFF still isn't an SSA_NAME or isn't
3590      defined in the loop, punt.  */
3591   if (TREE_CODE (off) != SSA_NAME
3592       || expr_invariant_in_loop_p (loop, off))
3593     return false;
3594 
3595   if (offtype == NULL_TREE)
3596     offtype = TREE_TYPE (off);
3597 
3598   if (DR_IS_READ (dr))
3599     decl = targetm.vectorize.builtin_gather (STMT_VINFO_VECTYPE (stmt_info),
3600 					     offtype, scale);
3601   else
3602     decl = targetm.vectorize.builtin_scatter (STMT_VINFO_VECTYPE (stmt_info),
3603 					      offtype, scale);
3604 
3605   if (decl == NULL_TREE)
3606     return false;
3607 
3608   info->decl = decl;
3609   info->base = base;
3610   info->offset = off;
3611   info->offset_dt = vect_unknown_def_type;
3612   info->offset_vectype = NULL_TREE;
3613   info->scale = scale;
3614   return true;
3615 }
3616 
3617 /* Function vect_analyze_data_refs.
3618 
3619   Find all the data references in the loop or basic block.
3620 
3621    The general structure of the analysis of data refs in the vectorizer is as
3622    follows:
3623    1- vect_analyze_data_refs(loop/bb): call
3624       compute_data_dependences_for_loop/bb to find and analyze all data-refs
3625       in the loop/bb and their dependences.
3626    2- vect_analyze_dependences(): apply dependence testing using ddrs.
3627    3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
3628    4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
3629 
3630 */
3631 
3632 bool
3633 vect_analyze_data_refs (vec_info *vinfo, int *min_vf)
3634 {
3635   struct loop *loop = NULL;
3636   unsigned int i;
3637   struct data_reference *dr;
3638   tree scalar_type;
3639 
3640   if (dump_enabled_p ())
3641     dump_printf_loc (MSG_NOTE, vect_location,
3642 		     "=== vect_analyze_data_refs ===\n");
3643 
3644   if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
3645     loop = LOOP_VINFO_LOOP (loop_vinfo);
3646 
3647   /* Go through the data-refs, check that the analysis succeeded.  Update
3648      pointer from stmt_vec_info struct to DR and vectype.  */
3649 
3650   vec<data_reference_p> datarefs = vinfo->datarefs;
3651   FOR_EACH_VEC_ELT (datarefs, i, dr)
3652     {
3653       gimple *stmt;
3654       stmt_vec_info stmt_info;
3655       tree base, offset, init;
3656       enum { SG_NONE, GATHER, SCATTER } gatherscatter = SG_NONE;
3657       bool simd_lane_access = false;
3658       int vf;
3659 
3660 again:
3661       if (!dr || !DR_REF (dr))
3662         {
3663           if (dump_enabled_p ())
3664 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3665 	                     "not vectorized: unhandled data-ref\n");
3666           return false;
3667         }
3668 
3669       stmt = DR_STMT (dr);
3670       stmt_info = vinfo_for_stmt (stmt);
3671 
3672       /* Discard clobbers from the dataref vector.  We will remove
3673          clobber stmts during vectorization.  */
3674       if (gimple_clobber_p (stmt))
3675 	{
3676 	  free_data_ref (dr);
3677 	  if (i == datarefs.length () - 1)
3678 	    {
3679 	      datarefs.pop ();
3680 	      break;
3681 	    }
3682 	  datarefs.ordered_remove (i);
3683 	  dr = datarefs[i];
3684 	  goto again;
3685 	}
3686 
3687       /* Check that analysis of the data-ref succeeded.  */
3688       if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
3689 	  || !DR_STEP (dr))
3690         {
3691 	  bool maybe_gather
3692 	    = DR_IS_READ (dr)
3693 	      && !TREE_THIS_VOLATILE (DR_REF (dr))
3694 	      && targetm.vectorize.builtin_gather != NULL;
3695 	  bool maybe_scatter
3696 	    = DR_IS_WRITE (dr)
3697 	      && !TREE_THIS_VOLATILE (DR_REF (dr))
3698 	      && targetm.vectorize.builtin_scatter != NULL;
3699 	  bool maybe_simd_lane_access
3700 	    = is_a <loop_vec_info> (vinfo) && loop->simduid;
3701 
3702 	  /* If the target supports vector gather loads or scatter stores, or if
3703 	     this might be a SIMD lane access, see if they can be used.  */
3704 	  if (is_a <loop_vec_info> (vinfo)
3705 	      && (maybe_gather || maybe_scatter || maybe_simd_lane_access)
3706 	      && !nested_in_vect_loop_p (loop, stmt))
3707 	    {
3708 	      struct data_reference *newdr
3709 		= create_data_ref (NULL, loop_containing_stmt (stmt),
3710 				   DR_REF (dr), stmt, maybe_scatter ? false : true);
3711 	      gcc_assert (newdr != NULL && DR_REF (newdr));
3712 	      if (DR_BASE_ADDRESS (newdr)
3713 		  && DR_OFFSET (newdr)
3714 		  && DR_INIT (newdr)
3715 		  && DR_STEP (newdr)
3716 		  && integer_zerop (DR_STEP (newdr)))
3717 		{
3718 		  if (maybe_simd_lane_access)
3719 		    {
3720 		      tree off = DR_OFFSET (newdr);
3721 		      STRIP_NOPS (off);
3722 		      if (TREE_CODE (DR_INIT (newdr)) == INTEGER_CST
3723 			  && TREE_CODE (off) == MULT_EXPR
3724 			  && tree_fits_uhwi_p (TREE_OPERAND (off, 1)))
3725 			{
3726 			  tree step = TREE_OPERAND (off, 1);
3727 			  off = TREE_OPERAND (off, 0);
3728 			  STRIP_NOPS (off);
3729 			  if (CONVERT_EXPR_P (off)
3730 			      && TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off,
3731 									  0)))
3732 				 < TYPE_PRECISION (TREE_TYPE (off)))
3733 			    off = TREE_OPERAND (off, 0);
3734 			  if (TREE_CODE (off) == SSA_NAME)
3735 			    {
3736 			      gimple *def = SSA_NAME_DEF_STMT (off);
3737 			      tree reft = TREE_TYPE (DR_REF (newdr));
3738 			      if (is_gimple_call (def)
3739 				  && gimple_call_internal_p (def)
3740 				  && (gimple_call_internal_fn (def)
3741 				      == IFN_GOMP_SIMD_LANE))
3742 				{
3743 				  tree arg = gimple_call_arg (def, 0);
3744 				  gcc_assert (TREE_CODE (arg) == SSA_NAME);
3745 				  arg = SSA_NAME_VAR (arg);
3746 				  if (arg == loop->simduid
3747 				      /* For now.  */
3748 				      && tree_int_cst_equal
3749 					   (TYPE_SIZE_UNIT (reft),
3750 					    step))
3751 				    {
3752 				      DR_OFFSET (newdr) = ssize_int (0);
3753 				      DR_STEP (newdr) = step;
3754 				      DR_ALIGNED_TO (newdr)
3755 					= size_int (BIGGEST_ALIGNMENT);
3756 				      dr = newdr;
3757 				      simd_lane_access = true;
3758 				    }
3759 				}
3760 			    }
3761 			}
3762 		    }
3763 		  if (!simd_lane_access && (maybe_gather || maybe_scatter))
3764 		    {
3765 		      dr = newdr;
3766 		      if (maybe_gather)
3767 			gatherscatter = GATHER;
3768 		      else
3769 			gatherscatter = SCATTER;
3770 		    }
3771 		}
3772 	      if (gatherscatter == SG_NONE && !simd_lane_access)
3773 		free_data_ref (newdr);
3774 	    }
3775 
3776 	  if (gatherscatter == SG_NONE && !simd_lane_access)
3777 	    {
3778 	      if (dump_enabled_p ())
3779 		{
3780 		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3781                                    "not vectorized: data ref analysis "
3782                                    "failed ");
3783 		  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3784 		}
3785 
3786 	      if (is_a <bb_vec_info> (vinfo))
3787 		break;
3788 
3789 	      return false;
3790 	    }
3791         }
3792 
3793       if (TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST)
3794         {
3795           if (dump_enabled_p ())
3796             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3797                              "not vectorized: base addr of dr is a "
3798                              "constant\n");
3799 
3800           if (is_a <bb_vec_info> (vinfo))
3801 	    break;
3802 
3803 	  if (gatherscatter != SG_NONE || simd_lane_access)
3804 	    free_data_ref (dr);
3805 	  return false;
3806         }
3807 
3808       if (TREE_THIS_VOLATILE (DR_REF (dr)))
3809         {
3810           if (dump_enabled_p ())
3811             {
3812               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3813                                "not vectorized: volatile type ");
3814               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3815             }
3816 
3817           if (is_a <bb_vec_info> (vinfo))
3818 	    break;
3819 
3820           return false;
3821         }
3822 
3823       if (stmt_can_throw_internal (stmt))
3824         {
3825           if (dump_enabled_p ())
3826             {
3827               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3828                                "not vectorized: statement can throw an "
3829                                "exception ");
3830               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3831             }
3832 
3833           if (is_a <bb_vec_info> (vinfo))
3834 	    break;
3835 
3836 	  if (gatherscatter != SG_NONE || simd_lane_access)
3837 	    free_data_ref (dr);
3838           return false;
3839         }
3840 
3841       if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
3842 	  && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
3843 	{
3844           if (dump_enabled_p ())
3845             {
3846               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3847                                "not vectorized: statement is bitfield "
3848                                "access ");
3849               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3850             }
3851 
3852           if (is_a <bb_vec_info> (vinfo))
3853 	    break;
3854 
3855 	  if (gatherscatter != SG_NONE || simd_lane_access)
3856 	    free_data_ref (dr);
3857           return false;
3858 	}
3859 
3860       base = unshare_expr (DR_BASE_ADDRESS (dr));
3861       offset = unshare_expr (DR_OFFSET (dr));
3862       init = unshare_expr (DR_INIT (dr));
3863 
3864       if (is_gimple_call (stmt)
3865 	  && (!gimple_call_internal_p (stmt)
3866 	      || (gimple_call_internal_fn (stmt) != IFN_MASK_LOAD
3867 		  && gimple_call_internal_fn (stmt) != IFN_MASK_STORE)))
3868 	{
3869 	  if (dump_enabled_p ())
3870 	    {
3871 	      dump_printf_loc (MSG_MISSED_OPTIMIZATION,  vect_location,
3872 	                       "not vectorized: dr in a call ");
3873 	      dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3874 	    }
3875 
3876 	  if (is_a <bb_vec_info> (vinfo))
3877 	    break;
3878 
3879 	  if (gatherscatter != SG_NONE || simd_lane_access)
3880 	    free_data_ref (dr);
3881 	  return false;
3882 	}
3883 
3884       /* Update DR field in stmt_vec_info struct.  */
3885 
3886       /* If the dataref is in an inner-loop of the loop that is considered
3887 	 for vectorization, we also want to analyze the access relative to
3888 	 the outer-loop (DR contains information only relative to the
3889 	 inner-most enclosing loop).  We do that by building a reference to the
3890 	 first location accessed by the inner-loop, and analyze it relative to
3891 	 the outer-loop.  */
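      /* E.g. (illustrative, 'in' pointing to short): for a dataref in[i+j]
	 inside an inner j-loop, the first location accessed by the inner
	 loop is &in[i]; analyzed relative to the outer i-loop this yields
	 an outer base of &in and an outer step of 2 bytes.  */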
3892       if (loop && nested_in_vect_loop_p (loop, stmt))
3893 	{
3894 	  tree outer_step, outer_base, outer_init;
3895 	  HOST_WIDE_INT pbitsize, pbitpos;
3896 	  tree poffset;
3897 	  machine_mode pmode;
3898 	  int punsignedp, preversep, pvolatilep;
3899 	  affine_iv base_iv, offset_iv;
3900 	  tree dinit;
3901 
3902 	  /* Build a reference to the first location accessed by the
3903 	     inner-loop: *(BASE+INIT).  (The first location is actually
3904 	     BASE+INIT+OFFSET, but we add OFFSET separately later).  */
3905           tree inner_base = build_fold_indirect_ref
3906                                 (fold_build_pointer_plus (base, init));
3907 
3908 	  if (dump_enabled_p ())
3909 	    {
3910 	      dump_printf_loc (MSG_NOTE, vect_location,
3911                                "analyze in outer-loop: ");
3912 	      dump_generic_expr (MSG_NOTE, TDF_SLIM, inner_base);
3913 	      dump_printf (MSG_NOTE, "\n");
3914 	    }
3915 
3916 	  outer_base = get_inner_reference (inner_base, &pbitsize, &pbitpos,
3917 					    &poffset, &pmode, &punsignedp,
3918 					    &preversep, &pvolatilep);
3919 	  gcc_assert (outer_base != NULL_TREE);
3920 
3921 	  if (pbitpos % BITS_PER_UNIT != 0)
3922 	    {
3923 	      if (dump_enabled_p ())
3924 		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3925                                  "failed: bit offset alignment.\n");
3926 	      return false;
3927 	    }
3928 
3929 	  if (preversep)
3930 	    {
3931 	      if (dump_enabled_p ())
3932 		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3933 				 "failed: reverse storage order.\n");
3934 	      return false;
3935 	    }
3936 
3937 	  outer_base = build_fold_addr_expr (outer_base);
3938 	  if (!simple_iv (loop, loop_containing_stmt (stmt), outer_base,
3939                           &base_iv, false))
3940 	    {
3941 	      if (dump_enabled_p ())
3942 		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3943                                  "failed: evolution of base is not affine.\n");
3944 	      return false;
3945 	    }
3946 
3947 	  if (offset)
3948 	    {
3949 	      if (poffset)
3950 		poffset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset), offset,
3951                                        poffset);
3952 	      else
3953 		poffset = offset;
3954 	    }
3955 
3956 	  if (!poffset)
3957 	    {
3958 	      offset_iv.base = ssize_int (0);
3959 	      offset_iv.step = ssize_int (0);
3960 	    }
3961 	  else if (!simple_iv (loop, loop_containing_stmt (stmt), poffset,
3962                                &offset_iv, false))
3963 	    {
3964 	      if (dump_enabled_p ())
3965 	        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3966                                  "evolution of offset is not affine.\n");
3967 	      return false;
3968 	    }
3969 
3970 	  outer_init = ssize_int (pbitpos / BITS_PER_UNIT);
3971 	  split_constant_offset (base_iv.base, &base_iv.base, &dinit);
3972 	  outer_init =  size_binop (PLUS_EXPR, outer_init, dinit);
3973 	  split_constant_offset (offset_iv.base, &offset_iv.base, &dinit);
3974 	  outer_init =  size_binop (PLUS_EXPR, outer_init, dinit);
3975 
3976 	  outer_step = size_binop (PLUS_EXPR,
3977 				fold_convert (ssizetype, base_iv.step),
3978 				fold_convert (ssizetype, offset_iv.step));
3979 
3980 	  STMT_VINFO_DR_STEP (stmt_info) = outer_step;
3981 	  /* FIXME: Use canonicalize_base_object_address (base_iv.base); */
3982 	  STMT_VINFO_DR_BASE_ADDRESS (stmt_info) = base_iv.base;
3983 	  STMT_VINFO_DR_INIT (stmt_info) = outer_init;
3984 	  STMT_VINFO_DR_OFFSET (stmt_info) =
3985 				fold_convert (ssizetype, offset_iv.base);
3986 	  STMT_VINFO_DR_ALIGNED_TO (stmt_info) =
3987 				size_int (highest_pow2_factor (offset_iv.base));
3988 
3989           if (dump_enabled_p ())
3990 	    {
3991 	      dump_printf_loc (MSG_NOTE, vect_location,
3992                                "\touter base_address: ");
3993 	      dump_generic_expr (MSG_NOTE, TDF_SLIM,
3994                                  STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
3995 	      dump_printf (MSG_NOTE, "\n\touter offset from base address: ");
3996 	      dump_generic_expr (MSG_NOTE, TDF_SLIM,
3997                                  STMT_VINFO_DR_OFFSET (stmt_info));
3998 	      dump_printf (MSG_NOTE,
3999                            "\n\touter constant offset from base address: ");
4000 	      dump_generic_expr (MSG_NOTE, TDF_SLIM,
4001                                  STMT_VINFO_DR_INIT (stmt_info));
4002 	      dump_printf (MSG_NOTE, "\n\touter step: ");
4003 	      dump_generic_expr (MSG_NOTE, TDF_SLIM,
4004                                  STMT_VINFO_DR_STEP (stmt_info));
4005 	      dump_printf (MSG_NOTE, "\n\touter aligned to: ");
4006 	      dump_generic_expr (MSG_NOTE, TDF_SLIM,
4007                                  STMT_VINFO_DR_ALIGNED_TO (stmt_info));
4008 	      dump_printf (MSG_NOTE, "\n");
4009 	    }
4010 	}
4011 
4012       if (STMT_VINFO_DATA_REF (stmt_info))
4013         {
4014           if (dump_enabled_p ())
4015             {
4016               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4017                                "not vectorized: more than one data ref "
4018                                "in stmt: ");
4019               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4020             }
4021 
4022           if (is_a <bb_vec_info> (vinfo))
4023 	    break;
4024 
4025 	  if (gatherscatter != SG_NONE || simd_lane_access)
4026 	    free_data_ref (dr);
4027           return false;
4028         }
4029 
4030       STMT_VINFO_DATA_REF (stmt_info) = dr;
4031       if (simd_lane_access)
4032 	{
4033 	  STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) = true;
4034 	  free_data_ref (datarefs[i]);
4035 	  datarefs[i] = dr;
4036 	}
4037 
4038       if (TREE_CODE (DR_BASE_ADDRESS (dr)) == ADDR_EXPR
4039 	  && VAR_P (TREE_OPERAND (DR_BASE_ADDRESS (dr), 0))
4040 	  && DECL_NONALIASED (TREE_OPERAND (DR_BASE_ADDRESS (dr), 0)))
4041 	{
4042           if (dump_enabled_p ())
4043             {
4044               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4045                                "not vectorized: base object not addressable "
4046 			       "for stmt: ");
4047               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4048             }
4049           if (is_a <bb_vec_info> (vinfo))
4050 	    {
4051 	      /* In BB vectorization the ref can still participate
4052 	         in dependence analysis, we just can't vectorize it.  */
4053 	      STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4054 	      continue;
4055 	    }
4056 	  return false;
4057 	}
4058 
4059       /* Set vectype for STMT.  */
4060       scalar_type = TREE_TYPE (DR_REF (dr));
4061       STMT_VINFO_VECTYPE (stmt_info)
4062 	= get_vectype_for_scalar_type (scalar_type);
4063       if (!STMT_VINFO_VECTYPE (stmt_info))
4064         {
4065           if (dump_enabled_p ())
4066             {
4067               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4068                                "not vectorized: no vectype for stmt: ");
4069               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4070               dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: ");
4071               dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS,
4072                                  scalar_type);
4073               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
4074             }
4075 
4076           if (is_a <bb_vec_info> (vinfo))
4077 	    {
4078 	      /* No vector type is fine, the ref can still participate
4079 	         in dependence analysis, we just can't vectorize it.  */
4080 	      STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4081 	      continue;
4082 	    }
4083 
4084 	  if (gatherscatter != SG_NONE || simd_lane_access)
4085 	    {
4086 	      STMT_VINFO_DATA_REF (stmt_info) = NULL;
4087 	      if (gatherscatter != SG_NONE)
4088 		free_data_ref (dr);
4089 	    }
4090 	  return false;
4091         }
4092       else
4093 	{
4094 	  if (dump_enabled_p ())
4095 	    {
4096 	      dump_printf_loc (MSG_NOTE, vect_location,
4097 			       "got vectype for stmt: ");
4098 	      dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
4099 	      dump_generic_expr (MSG_NOTE, TDF_SLIM,
4100 				 STMT_VINFO_VECTYPE (stmt_info));
4101 	      dump_printf (MSG_NOTE, "\n");
4102 	    }
4103 	}
4104 
4105       /* Adjust the minimal vectorization factor according to the
4106 	 vector type.  */
4107       vf = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
4108       if (vf > *min_vf)
4109 	*min_vf = vf;
4110 
4111       if (gatherscatter != SG_NONE)
4112 	{
4113 	  gather_scatter_info gs_info;
4114 	  if (!vect_check_gather_scatter (stmt, as_a <loop_vec_info> (vinfo),
4115 					  &gs_info)
4116 	      || !get_vectype_for_scalar_type (TREE_TYPE (gs_info.offset)))
4117 	    {
4118 	      STMT_VINFO_DATA_REF (stmt_info) = NULL;
4119 	      free_data_ref (dr);
4120 	      if (dump_enabled_p ())
4121 		{
4122 		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4123 				   (gatherscatter == GATHER) ?
4124 				   "not vectorized: not suitable for gather "
4125 				   "load " :
4126 				   "not vectorized: not suitable for scatter "
4127 				   "store ");
4128 		  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4129 		}
4130 	      return false;
4131 	    }
4132 
4133 	  free_data_ref (datarefs[i]);
4134 	  datarefs[i] = dr;
4135 	  STMT_VINFO_GATHER_SCATTER_P (stmt_info) = gatherscatter;
4136 	}
4137 
4138       else if (is_a <loop_vec_info> (vinfo)
4139 	       && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
4140 	{
4141 	  if (nested_in_vect_loop_p (loop, stmt))
4142 	    {
4143 	      if (dump_enabled_p ())
4144 		{
4145 		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4146                                    "not vectorized: not suitable for strided "
4147                                    "load ");
4148 		  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4149 		}
4150 	      return false;
4151 	    }
4152 	  STMT_VINFO_STRIDED_P (stmt_info) = true;
4153 	}
4154     }
4155 
4156   /* If, when trying to vectorize a basic-block, we stopped the analysis at
4157      the first dataref we could not analyze, mark the rest of the datarefs
4158      as not vectorizable and truncate the vector of datarefs.  That
4159      avoids spending useless time analyzing their dependences.  */
4160   if (i != datarefs.length ())
4161     {
4162       gcc_assert (is_a <bb_vec_info> (vinfo));
4163       for (unsigned j = i; j < datarefs.length (); ++j)
4164 	{
4165 	  data_reference_p dr = datarefs[j];
4166           STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
4167 	  free_data_ref (dr);
4168 	}
4169       datarefs.truncate (i);
4170     }
4171 
4172   return true;
4173 }
4174 
4175 
4176 /* Function vect_get_new_vect_var.
4177 
4178    Returns a new variable.  The current naming scheme prepends the prefix
4179    "vect", "stmp", "mask" or "vectp" (depending on the value of VAR_KIND),
4180    followed by "_", to NAME when NAME is provided; otherwise the prefix
4181    alone is used.  */
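/* For instance (illustrative), a call like
   vect_get_new_vect_var (vectype, vect_pointer_var, "arr") creates a
   temporary register whose name is based on "vectp_arr".  */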
4182 
4183 tree
4184 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
4185 {
4186   const char *prefix;
4187   tree new_vect_var;
4188 
4189   switch (var_kind)
4190   {
4191   case vect_simple_var:
4192     prefix = "vect";
4193     break;
4194   case vect_scalar_var:
4195     prefix = "stmp";
4196     break;
4197   case vect_mask_var:
4198     prefix = "mask";
4199     break;
4200   case vect_pointer_var:
4201     prefix = "vectp";
4202     break;
4203   default:
4204     gcc_unreachable ();
4205   }
4206 
4207   if (name)
4208     {
4209       char* tmp = concat (prefix, "_", name, NULL);
4210       new_vect_var = create_tmp_reg (type, tmp);
4211       free (tmp);
4212     }
4213   else
4214     new_vect_var = create_tmp_reg (type, prefix);
4215 
4216   return new_vect_var;
4217 }
4218 
4219 /* Like vect_get_new_vect_var but return an SSA name.  */
4220 
4221 tree
4222 vect_get_new_ssa_name (tree type, enum vect_var_kind var_kind, const char *name)
4223 {
4224   const char *prefix;
4225   tree new_vect_var;
4226 
4227   switch (var_kind)
4228   {
4229   case vect_simple_var:
4230     prefix = "vect";
4231     break;
4232   case vect_scalar_var:
4233     prefix = "stmp";
4234     break;
4235   case vect_pointer_var:
4236     prefix = "vectp";
4237     break;
4238   default:
4239     gcc_unreachable ();
4240   }
4241 
4242   if (name)
4243     {
4244       char* tmp = concat (prefix, "_", name, NULL);
4245       new_vect_var = make_temp_ssa_name (type, NULL, tmp);
4246       free (tmp);
4247     }
4248   else
4249     new_vect_var = make_temp_ssa_name (type, NULL, prefix);
4250 
4251   return new_vect_var;
4252 }
4253 
4254 /* Duplicate ptr info and set alignment/misalignment on NAME from DR.  */
4255 
4256 static void
4257 vect_duplicate_ssa_name_ptr_info (tree name, data_reference *dr,
4258 				  stmt_vec_info stmt_info)
4259 {
4260   duplicate_ssa_name_ptr_info (name, DR_PTR_INFO (dr));
4261   unsigned int align = TYPE_ALIGN_UNIT (STMT_VINFO_VECTYPE (stmt_info));
4262   int misalign = DR_MISALIGNMENT (dr);
4263   if (misalign == -1)
4264     mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name));
4265   else
4266     set_ptr_info_alignment (SSA_NAME_PTR_INFO (name), align, misalign);
4267 }
4268 
4269 /* Function vect_create_addr_base_for_vector_ref.
4270 
4271    Create an expression that computes the address of the first memory location
4272    that will be accessed for a data reference.
4273 
4274    Input:
4275    STMT: The statement containing the data reference.
4276    NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
4277    OFFSET: Optional.  If supplied, it is added to the initial address.
4278    LOOP:    Specify relative to which loop-nest should the address be computed.
4279             For example, when the dataref is in an inner-loop nested in an
4280 	    outer-loop that is now being vectorized, LOOP can be either the
4281 	    outer-loop, or the inner-loop.  The first memory location accessed
4282 	    by the following dataref ('in' points to short):
4283 
4284 		for (i=0; i<N; i++)
4285 		   for (j=0; j<M; j++)
4286 		     s += in[i+j]
4287 
4288 	    is as follows:
4289 	    if LOOP=i_loop:	&in		(relative to i_loop)
4290 	    if LOOP=j_loop: 	&in+i*2B	(relative to j_loop)
4291    BYTE_OFFSET: Optional, defaulted to NULL.  If supplied, it is added to the
4292 	    initial address.  Unlike OFFSET, which is the number of elements to
4293 	    be added, BYTE_OFFSET is measured in bytes.
4294 
4295    Output:
4296    1. Return an SSA_NAME whose value is the address of the memory location of
4297       the first vector of the data reference.
4298    2. If new_stmt_list is not NULL_TREE after return then the caller must insert
4299       these statement(s) which define the returned SSA_NAME.
4300 
4301    FORNOW: We are only handling array accesses with step 1.  */
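/* For the loop in the comment above (illustrative): an OFFSET of 4 adds
   4 * sizeof (short) bytes to the initial address, while a BYTE_OFFSET of 4
   adds exactly 4 bytes.  */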
4302 
4303 tree
4304 vect_create_addr_base_for_vector_ref (gimple *stmt,
4305 				      gimple_seq *new_stmt_list,
4306 				      tree offset,
4307 				      struct loop *loop,
4308 				      tree byte_offset)
4309 {
4310   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4311   struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4312   tree data_ref_base;
4313   const char *base_name;
4314   tree addr_base;
4315   tree dest;
4316   gimple_seq seq = NULL;
4317   tree base_offset;
4318   tree init;
4319   tree vect_ptr_type;
4320   tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
4321   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4322 
4323   if (loop_vinfo && loop && loop != (gimple_bb (stmt))->loop_father)
4324     {
4325       struct loop *outer_loop = LOOP_VINFO_LOOP (loop_vinfo);
4326 
4327       gcc_assert (nested_in_vect_loop_p (outer_loop, stmt));
4328 
4329       data_ref_base = unshare_expr (STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
4330       base_offset = unshare_expr (STMT_VINFO_DR_OFFSET (stmt_info));
4331       init = unshare_expr (STMT_VINFO_DR_INIT (stmt_info));
4332     }
4333   else
4334     {
4335       data_ref_base = unshare_expr (DR_BASE_ADDRESS (dr));
4336       base_offset = unshare_expr (DR_OFFSET (dr));
4337       init = unshare_expr (DR_INIT (dr));
4338     }
4339 
4340   if (loop_vinfo)
4341     base_name = get_name (data_ref_base);
4342   else
4343     {
4344       base_offset = ssize_int (0);
4345       init = ssize_int (0);
4346       base_name = get_name (DR_REF (dr));
4347     }
4348 
4349   /* Create base_offset */
4350   base_offset = size_binop (PLUS_EXPR,
4351 			    fold_convert (sizetype, base_offset),
4352 			    fold_convert (sizetype, init));
4353 
4354   if (offset)
4355     {
4356       offset = fold_build2 (MULT_EXPR, sizetype,
4357 			    fold_convert (sizetype, offset), step);
4358       base_offset = fold_build2 (PLUS_EXPR, sizetype,
4359 				 base_offset, offset);
4360     }
4361   if (byte_offset)
4362     {
4363       byte_offset = fold_convert (sizetype, byte_offset);
4364       base_offset = fold_build2 (PLUS_EXPR, sizetype,
4365 				 base_offset, byte_offset);
4366     }
4367 
4368   /* base + base_offset */
4369   if (loop_vinfo)
4370     addr_base = fold_build_pointer_plus (data_ref_base, base_offset);
4371   else
4372     {
4373       addr_base = build1 (ADDR_EXPR,
4374 			  build_pointer_type (TREE_TYPE (DR_REF (dr))),
4375 			  unshare_expr (DR_REF (dr)));
4376     }
4377 
4378   vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));
4379   dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name);
4380   addr_base = force_gimple_operand (addr_base, &seq, true, dest);
4381   gimple_seq_add_seq (new_stmt_list, seq);
4382 
4383   if (DR_PTR_INFO (dr)
4384       && TREE_CODE (addr_base) == SSA_NAME
4385       && !SSA_NAME_PTR_INFO (addr_base))
4386     {
4387       vect_duplicate_ssa_name_ptr_info (addr_base, dr, stmt_info);
4388       if (offset || byte_offset)
4389 	mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr_base));
4390     }
4391 
4392   if (dump_enabled_p ())
4393     {
4394       dump_printf_loc (MSG_NOTE, vect_location, "created ");
4395       dump_generic_expr (MSG_NOTE, TDF_SLIM, addr_base);
4396       dump_printf (MSG_NOTE, "\n");
4397     }
4398 
4399   return addr_base;
4400 }
4401 
4402 
4403 /* Function vect_create_data_ref_ptr.
4404 
4405    Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
4406    location accessed in the loop by STMT, along with the def-use update
4407    chain to appropriately advance the pointer through the loop iterations.
4408    Also set aliasing information for the pointer.  This pointer is used by
4409    the callers to this function to create a memory reference expression for
4410    vector load/store access.
4411 
4412    Input:
4413    1. STMT: a stmt that references memory. Expected to be of the form
4414          GIMPLE_ASSIGN <name, data-ref> or
4415 	 GIMPLE_ASSIGN <data-ref, name>.
4416    2. AGGR_TYPE: the type of the reference, which should be either a vector
4417         or an array.
4418    3. AT_LOOP: the loop where the vector memref is to be created.
4419    4. OFFSET (optional): an offset to be added to the initial address accessed
4420         by the data-ref in STMT.
4421    5. BSI: location where the new stmts are to be placed if there is no loop
4422    6. ONLY_INIT: indicate whether ap is to be updated in the loop, or remain
4423         pointing to the initial address.
4424    7. BYTE_OFFSET (optional, defaults to NULL): a byte offset to be added
4425 	to the initial address accessed by the data-ref in STMT.  This is
4426 	similar to OFFSET, but OFFSET is counted in elements, while BYTE_OFFSET
4427 	in bytes.
4428 
4429    Output:
4430    1. Declare a new ptr to vector_type, and have it point to the base of the
4431       data reference (the initial address accessed by the data reference).
4432       For example, for vector of type V8HI, the following code is generated:
4433 
4434       v8hi *ap;
4435       ap = (v8hi *)initial_address;
4436 
4437       if OFFSET is not supplied:
4438          initial_address = &a[init];
4439       if OFFSET is supplied:
4440          initial_address = &a[init + OFFSET];
4441       if BYTE_OFFSET is supplied:
4442 	 initial_address = &a[init] + BYTE_OFFSET;
4443 
4444       Return the initial_address in INITIAL_ADDRESS.
4445 
4446    2. If ONLY_INIT is true, just return the initial pointer.  Otherwise, also
4447       update the pointer in each iteration of the loop.
4448 
4449       Return the increment stmt that updates the pointer in PTR_INCR.
4450 
4451    3. Set INV_P to true if the access pattern of the data reference in the
4452       vectorized loop is invariant.  Set it to false otherwise.
4453 
4454    4. Return the pointer.  */
4455 
4456 tree
4457 vect_create_data_ref_ptr (gimple *stmt, tree aggr_type, struct loop *at_loop,
4458 			  tree offset, tree *initial_address,
4459 			  gimple_stmt_iterator *gsi, gimple **ptr_incr,
4460 			  bool only_init, bool *inv_p, tree byte_offset)
4461 {
4462   const char *base_name;
4463   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4464   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4465   struct loop *loop = NULL;
4466   bool nested_in_vect_loop = false;
4467   struct loop *containing_loop = NULL;
4468   tree aggr_ptr_type;
4469   tree aggr_ptr;
4470   tree new_temp;
4471   gimple_seq new_stmt_list = NULL;
4472   edge pe = NULL;
4473   basic_block new_bb;
4474   tree aggr_ptr_init;
4475   struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4476   tree aptr;
4477   gimple_stmt_iterator incr_gsi;
4478   bool insert_after;
4479   tree indx_before_incr, indx_after_incr;
4480   gimple *incr;
4481   tree step;
4482   bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
4483 
4484   gcc_assert (TREE_CODE (aggr_type) == ARRAY_TYPE
4485 	      || TREE_CODE (aggr_type) == VECTOR_TYPE);
4486 
4487   if (loop_vinfo)
4488     {
4489       loop = LOOP_VINFO_LOOP (loop_vinfo);
4490       nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
4491       containing_loop = (gimple_bb (stmt))->loop_father;
4492       pe = loop_preheader_edge (loop);
4493     }
4494   else
4495     {
4496       gcc_assert (bb_vinfo);
4497       only_init = true;
4498       *ptr_incr = NULL;
4499     }
4500 
4501   /* Check the step (evolution) of the load in LOOP, and record
4502      whether it's invariant.  */
4503   if (nested_in_vect_loop)
4504     step = STMT_VINFO_DR_STEP (stmt_info);
4505   else
4506     step = DR_STEP (STMT_VINFO_DATA_REF (stmt_info));
4507 
4508   if (integer_zerop (step))
4509     *inv_p = true;
4510   else
4511     *inv_p = false;
4512 
4513   /* Create an expression for the first address accessed by this load
4514      in LOOP.  */
4515   base_name = get_name (DR_BASE_ADDRESS (dr));
4516 
4517   if (dump_enabled_p ())
4518     {
4519       tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
4520       dump_printf_loc (MSG_NOTE, vect_location,
4521                        "create %s-pointer variable to type: ",
4522 		       get_tree_code_name (TREE_CODE (aggr_type)));
4523       dump_generic_expr (MSG_NOTE, TDF_SLIM, aggr_type);
4524       if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
4525         dump_printf (MSG_NOTE, "  vectorizing an array ref: ");
4526       else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)
4527         dump_printf (MSG_NOTE, "  vectorizing a vector ref: ");
4528       else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
4529         dump_printf (MSG_NOTE, "  vectorizing a record based array ref: ");
4530       else
4531         dump_printf (MSG_NOTE, "  vectorizing a pointer ref: ");
4532       dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_BASE_OBJECT (dr));
4533       dump_printf (MSG_NOTE, "\n");
4534     }
4535 
4536   /* (1) Create the new aggregate-pointer variable.
4537      Vector and array types inherit the alias set of their component
4538      type by default, so we need to use a ref-all pointer if the data
4539      reference does not conflict with the created aggregate data
4540      reference because it is not addressable.  */
4541   bool need_ref_all = false;
4542   if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4543 			      get_alias_set (DR_REF (dr))))
4544     need_ref_all = true;
4545   /* Likewise for any of the data references in the stmt group.  */
4546   else if (STMT_VINFO_GROUP_SIZE (stmt_info) > 1)
4547     {
4548       gimple *orig_stmt = STMT_VINFO_GROUP_FIRST_ELEMENT (stmt_info);
4549       do
4550 	{
4551 	  stmt_vec_info sinfo = vinfo_for_stmt (orig_stmt);
4552 	  struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo);
4553 	  if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4554 				      get_alias_set (DR_REF (sdr))))
4555 	    {
4556 	      need_ref_all = true;
4557 	      break;
4558 	    }
4559 	  orig_stmt = STMT_VINFO_GROUP_NEXT_ELEMENT (sinfo);
4560 	}
4561       while (orig_stmt);
4562     }
4563   aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode,
4564 					       need_ref_all);
4565   aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name);
4566 
4567 
4568   /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
4569      vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
4570      def-use update cycles for the pointer: one relative to the outer-loop
4571      (LOOP), which is what steps (3) and (4) below do.  The other is relative
4572      to the inner-loop (which is the inner-most loop containing the dataref),
4573      and this is done be step (5) below.
4574 
4575      When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
4576      inner-most loop, and so steps (3),(4) work the same, and step (5) is
4577      redundant.  Steps (3),(4) create the following:
4578 
4579 	vp0 = &base_addr;
4580 	LOOP:	vp1 = phi(vp0,vp2)
4581 		...
4582 		...
4583 		vp2 = vp1 + step
4584 		goto LOOP
4585 
4586      If there is an inner-loop nested in loop, then step (5) will also be
4587      applied, and an additional update in the inner-loop will be created:
4588 
4589 	vp0 = &base_addr;
4590 	LOOP:   vp1 = phi(vp0,vp2)
4591 		...
4592         inner:     vp3 = phi(vp1,vp4)
4593 	           vp4 = vp3 + inner_step
4594 	           if () goto inner
4595 		...
4596 		vp2 = vp1 + step
4597 		if () goto LOOP   */
4598 
4599   /* (2) Calculate the initial address of the aggregate-pointer, and set
4600      the aggregate-pointer to point to it before the loop.  */
4601 
4602   /* Create: (&(base[init_val+offset]+byte_offset) in the loop preheader.  */
4603 
4604   new_temp = vect_create_addr_base_for_vector_ref (stmt, &new_stmt_list,
4605 						   offset, loop, byte_offset);
4606   if (new_stmt_list)
4607     {
4608       if (pe)
4609         {
4610           new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
4611           gcc_assert (!new_bb);
4612         }
4613       else
4614         gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);
4615     }
4616 
4617   *initial_address = new_temp;
4618   aggr_ptr_init = new_temp;
4619 
4620   /* (3) Handle the updating of the aggregate-pointer inside the loop.
4621      This is needed when ONLY_INIT is false, and also when AT_LOOP is the
4622      inner-loop nested in LOOP (during outer-loop vectorization).  */
4623 
4624   /* No update in loop is required.  */
4625   if (only_init && (!loop_vinfo || at_loop == loop))
4626     aptr = aggr_ptr_init;
4627   else
4628     {
4629       /* The step of the aggregate pointer is the type size.  */
4630       tree iv_step = TYPE_SIZE_UNIT (aggr_type);
4631       /* One exception to the above is when the scalar step of the load in
4632 	 LOOP is zero. In this case the step here is also zero.  */
4633       if (*inv_p)
4634 	iv_step = size_zero_node;
4635       else if (tree_int_cst_sgn (step) == -1)
4636 	iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
4637 
4638       standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4639 
4640       create_iv (aggr_ptr_init,
4641 		 fold_convert (aggr_ptr_type, iv_step),
4642 		 aggr_ptr, loop, &incr_gsi, insert_after,
4643 		 &indx_before_incr, &indx_after_incr);
4644       incr = gsi_stmt (incr_gsi);
4645       set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo));
4646 
4647       /* Copy the points-to information if it exists. */
4648       if (DR_PTR_INFO (dr))
4649 	{
4650 	  vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr, stmt_info);
4651 	  vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr, stmt_info);
4652 	}
4653       if (ptr_incr)
4654 	*ptr_incr = incr;
4655 
4656       aptr = indx_before_incr;
4657     }
4658 
4659   if (!nested_in_vect_loop || only_init)
4660     return aptr;
4661 
4662 
4663   /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
4664      nested in LOOP, if it exists.  */
4665 
4666   gcc_assert (nested_in_vect_loop);
4667   if (!only_init)
4668     {
4669       standard_iv_increment_position (containing_loop, &incr_gsi,
4670 				      &insert_after);
4671       create_iv (aptr, fold_convert (aggr_ptr_type, DR_STEP (dr)), aggr_ptr,
4672 		 containing_loop, &incr_gsi, insert_after, &indx_before_incr,
4673 		 &indx_after_incr);
4674       incr = gsi_stmt (incr_gsi);
4675       set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo));
4676 
4677       /* Copy the points-to information if it exists. */
4678       if (DR_PTR_INFO (dr))
4679 	{
4680 	  vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr, stmt_info);
4681 	  vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr, stmt_info);
4682 	}
4683       if (ptr_incr)
4684 	*ptr_incr = incr;
4685 
4686       return indx_before_incr;
4687     }
4688   else
4689     gcc_unreachable ();
4690 }
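
/* Illustration only (a sketch, not generated verbatim by the function above):
   for a scalar load from a[i] vectorized with a V4SI vector type,
   vect_create_data_ref_ptr conceptually produces

       vector(4) int *vectp_a;
     preheader:
       vectp_a_0 = (vector(4) int *) &a[0];   <-- initial address, step (2)
     loop:
       vectp_a_1 = PHI <vectp_a_0, vectp_a_2>
       ...
       vectp_a_2 = vectp_a_1 + 16;            <-- bumped by TYPE_SIZE_UNIT (V4SI)

   The SSA names vectp_a_* are hypothetical; the actual names are produced by
   vect_get_new_vect_var and create_iv.  */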
4691 
4692 
4693 /* Function bump_vector_ptr
4694 
4695    Increment a pointer (to a vector type) by vector-size. If requested,
4696    i.e. if PTR-INCR is given, then also connect the new increment stmt
4697    to the existing def-use update-chain of the pointer, by modifying
4698    the PTR_INCR as illustrated below:
4699 
4700    The pointer def-use update-chain before this function:
4701                         DATAREF_PTR = phi (p_0, p_2)
4702                         ....
4703         PTR_INCR:       p_2 = DATAREF_PTR + step
4704 
4705    The pointer def-use update-chain after this function:
4706                         DATAREF_PTR = phi (p_0, p_2)
4707                         ....
4708                         NEW_DATAREF_PTR = DATAREF_PTR + BUMP
4709                         ....
4710         PTR_INCR:       p_2 = NEW_DATAREF_PTR + step
4711 
4712    Input:
4713    DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
4714                  in the loop.
4715    PTR_INCR - optional. The stmt that updates the pointer in each iteration of
4716 	      the loop.  The increment amount across iterations is expected
4717 	      to be vector_size.
4718    BSI - location where the new update stmt is to be placed.
4719    STMT - the original scalar memory-access stmt that is being vectorized.
4720    BUMP - optional. The offset by which to bump the pointer. If not given,
4721 	  the offset is assumed to be vector_size.
4722 
4723    Output: Return NEW_DATAREF_PTR as illustrated above.
4724 
4725 */
4726 
4727 tree
4728 bump_vector_ptr (tree dataref_ptr, gimple *ptr_incr, gimple_stmt_iterator *gsi,
4729 		 gimple *stmt, tree bump)
4730 {
4731   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4732   struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4733   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4734   tree update = TYPE_SIZE_UNIT (vectype);
4735   gassign *incr_stmt;
4736   ssa_op_iter iter;
4737   use_operand_p use_p;
4738   tree new_dataref_ptr;
4739 
4740   if (bump)
4741     update = bump;
4742 
4743   if (TREE_CODE (dataref_ptr) == SSA_NAME)
4744     new_dataref_ptr = copy_ssa_name (dataref_ptr);
4745   else
4746     new_dataref_ptr = make_ssa_name (TREE_TYPE (dataref_ptr));
4747   incr_stmt = gimple_build_assign (new_dataref_ptr, POINTER_PLUS_EXPR,
4748 				   dataref_ptr, update);
4749   vect_finish_stmt_generation (stmt, incr_stmt, gsi);
4750 
4751   /* Copy the points-to information if it exists. */
4752   if (DR_PTR_INFO (dr))
4753     {
4754       duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
4755       mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
4756     }
4757 
4758   if (!ptr_incr)
4759     return new_dataref_ptr;
4760 
4761   /* Update the vector-pointer's cross-iteration increment.  */
4762   FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
4763     {
4764       tree use = USE_FROM_PTR (use_p);
4765 
4766       if (use == dataref_ptr)
4767         SET_USE (use_p, new_dataref_ptr);
4768       else
4769         gcc_assert (tree_int_cst_compare (use, update) == 0);
4770     }
4771 
4772   return new_dataref_ptr;
4773 }
4774 
4775 
4776 /* Function vect_create_destination_var.
4777 
4778    Create a new temporary of type VECTYPE.  */
4779 
4780 tree
4781 vect_create_destination_var (tree scalar_dest, tree vectype)
4782 {
4783   tree vec_dest;
4784   const char *name;
4785   char *new_name;
4786   tree type;
4787   enum vect_var_kind kind;
4788 
4789   kind = vectype
4790     ? VECTOR_BOOLEAN_TYPE_P (vectype)
4791     ? vect_mask_var
4792     : vect_simple_var
4793     : vect_scalar_var;
4794   type = vectype ? vectype : TREE_TYPE (scalar_dest);
4795 
4796   gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
4797 
4798   name = get_name (scalar_dest);
4799   if (name)
4800     new_name = xasprintf ("%s_%u", name, SSA_NAME_VERSION (scalar_dest));
4801   else
4802     new_name = xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest));
4803   vec_dest = vect_get_new_vect_var (type, kind, new_name);
4804   free (new_name);
4805 
4806   return vec_dest;
4807 }
4808 
4809 /* Function vect_grouped_store_supported.
4810 
4811    Returns TRUE if interleave high and interleave low permutations
4812    are supported, and FALSE otherwise.  */
4813 
4814 bool
4815 vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
4816 {
4817   machine_mode mode = TYPE_MODE (vectype);
4818 
4819   /* vect_permute_store_chain requires the group size to be equal to 3 or
4820      be a power of two.  */
4821   if (count != 3 && exact_log2 (count) == -1)
4822     {
4823       if (dump_enabled_p ())
4824 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4825 			 "the size of the group of accesses"
4826 			 " is not a power of 2 or not equal to 3\n");
4827       return false;
4828     }
4829 
4830   /* Check that the permutation is supported.  */
4831   if (VECTOR_MODE_P (mode))
4832     {
4833       unsigned int i, nelt = GET_MODE_NUNITS (mode);
4834       unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
4835 
4836       if (count == 3)
4837 	{
4838 	  unsigned int j0 = 0, j1 = 0, j2 = 0;
4839 	  unsigned int i, j;
4840 
4841 	  for (j = 0; j < 3; j++)
4842 	    {
4843 	      int nelt0 = ((3 - j) * nelt) % 3;
4844 	      int nelt1 = ((3 - j) * nelt + 1) % 3;
4845 	      int nelt2 = ((3 - j) * nelt + 2) % 3;
4846 	      for (i = 0; i < nelt; i++)
4847 		{
4848 		  if (3 * i + nelt0 < nelt)
4849 		    sel[3 * i + nelt0] = j0++;
4850 		  if (3 * i + nelt1 < nelt)
4851 		    sel[3 * i + nelt1] = nelt + j1++;
4852 		  if (3 * i + nelt2 < nelt)
4853 		    sel[3 * i + nelt2] = 0;
4854 		}
4855 	      if (!can_vec_perm_p (mode, false, sel))
4856 		{
4857 		  if (dump_enabled_p ())
4858 		    dump_printf (MSG_MISSED_OPTIMIZATION,
4859 				 "permutation op not supported by target.\n");
4860 		  return false;
4861 		}
4862 
4863 	      for (i = 0; i < nelt; i++)
4864 		{
4865 		  if (3 * i + nelt0 < nelt)
4866 		    sel[3 * i + nelt0] = 3 * i + nelt0;
4867 		  if (3 * i + nelt1 < nelt)
4868 		    sel[3 * i + nelt1] = 3 * i + nelt1;
4869 		  if (3 * i + nelt2 < nelt)
4870 		    sel[3 * i + nelt2] = nelt + j2++;
4871 		}
4872 	      if (!can_vec_perm_p (mode, false, sel))
4873 		{
4874 		  if (dump_enabled_p ())
4875 		    dump_printf (MSG_MISSED_OPTIMIZATION,
4876 				 "permutation op not supported by target.\n");
4877 		  return false;
4878 		}
4879 	    }
4880 	  return true;
4881 	}
4882       else
4883 	{
4884 	  /* If length is not equal to 3 then only power of 2 is supported.  */
4885 	  gcc_assert (pow2p_hwi (count));
4886 
4887 	  for (i = 0; i < nelt / 2; i++)
4888 	    {
4889 	      sel[i * 2] = i;
4890 	      sel[i * 2 + 1] = i + nelt;
4891 	    }
4892 	    if (can_vec_perm_p (mode, false, sel))
4893 	      {
4894 		for (i = 0; i < nelt; i++)
4895 		  sel[i] += nelt / 2;
4896 		if (can_vec_perm_p (mode, false, sel))
4897 		  return true;
4898 	      }
4899 	}
4900     }
4901 
4902   if (dump_enabled_p ())
4903     dump_printf (MSG_MISSED_OPTIMIZATION,
4904 		 "permutation op not supported by target.\n");
4905   return false;
4906 }
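
/* For illustration only (not part of the checks above): with a 4-element
   vector mode (nelt == 4) and a power-of-two group, the two interleaving
   permutations tested by vect_grouped_store_supported are

     sel = {0, 4, 1, 5}   <-- interleave "high": sel[i*2] = i, sel[i*2+1] = i + nelt
     sel = {2, 6, 3, 7}   <-- interleave "low": the same mask shifted by nelt/2

   Both must be accepted by can_vec_perm_p for the grouped store to be
   handled by vect_permute_store_chain below.  */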
4907 
4908 
4909 /* Return TRUE if vec_store_lanes is available for COUNT vectors of
4910    type VECTYPE.  */
4911 
4912 bool
4913 vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
4914 {
4915   return vect_lanes_optab_supported_p ("vec_store_lanes",
4916 				       vec_store_lanes_optab,
4917 				       vectype, count);
4918 }
4919 
4920 
4921 /* Function vect_permute_store_chain.
4922 
4923    Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
4924    a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
4925    the data correctly for the stores.  Return the final references for stores
4926    in RESULT_CHAIN.
4927 
4928    E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
4929    The input is 4 vectors each containing 8 elements.  We assign a number to
4930    each element, the input sequence is:
4931 
4932    1st vec:   0  1  2  3  4  5  6  7
4933    2nd vec:   8  9 10 11 12 13 14 15
4934    3rd vec:  16 17 18 19 20 21 22 23
4935    4th vec:  24 25 26 27 28 29 30 31
4936 
4937    The output sequence should be:
4938 
4939    1st vec:  0  8 16 24  1  9 17 25
4940    2nd vec:  2 10 18 26  3 11 19 27
4941    3rd vec:  4 12 20 28  5 13 21 29
4942    4th vec:  6 14 22 30  7 15 23 31
4943 
4944    i.e., we interleave the contents of the four vectors in their order.
4945 
4946    We use interleave_high/low instructions to create such output.  The input of
4947    each interleave_high/low operation is two vectors:
4948    1st vec    2nd vec
4949    0 1 2 3    4 5 6 7
4950    the even elements of the result vector are obtained left-to-right from the
4951    high/low elements of the first vector.  The odd elements of the result are
4952    obtained left-to-right from the high/low elements of the second vector.
4953    The output of interleave_high will be:   0 4 1 5
4954    and of interleave_low:                   2 6 3 7
4955 
4956 
4957    The permutation is done in log LENGTH stages.  In each stage interleave_high
4958    and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
4959    where the first argument is taken from the first half of DR_CHAIN and the
4960    second argument from its second half.
4961    In our example,
4962 
4963    I1: interleave_high (1st vec, 3rd vec)
4964    I2: interleave_low (1st vec, 3rd vec)
4965    I3: interleave_high (2nd vec, 4th vec)
4966    I4: interleave_low (2nd vec, 4th vec)
4967 
4968    The output for the first stage is:
4969 
4970    I1:  0 16  1 17  2 18  3 19
4971    I2:  4 20  5 21  6 22  7 23
4972    I3:  8 24  9 25 10 26 11 27
4973    I4: 12 28 13 29 14 30 15 31
4974 
4975    The output of the second stage, i.e. the final result is:
4976 
4977    I1:  0  8 16 24  1  9 17 25
4978    I2:  2 10 18 26  3 11 19 27
4979    I3:  4 12 20 28  5 13 21 29
4980    I4:  6 14 22 30  7 15 23 31.  */
4981 
4982 void
4983 vect_permute_store_chain (vec<tree> dr_chain,
4984 			  unsigned int length,
4985 			  gimple *stmt,
4986 			  gimple_stmt_iterator *gsi,
4987 			  vec<tree> *result_chain)
4988 {
4989   tree vect1, vect2, high, low;
4990   gimple *perm_stmt;
4991   tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
4992   tree perm_mask_low, perm_mask_high;
4993   tree data_ref;
4994   tree perm3_mask_low, perm3_mask_high;
4995   unsigned int i, n, log_length = exact_log2 (length);
4996   unsigned int j, nelt = TYPE_VECTOR_SUBPARTS (vectype);
4997   unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
4998 
4999   result_chain->quick_grow (length);
5000   memcpy (result_chain->address (), dr_chain.address (),
5001 	  length * sizeof (tree));
5002 
5003   if (length == 3)
5004     {
5005       unsigned int j0 = 0, j1 = 0, j2 = 0;
5006 
5007       for (j = 0; j < 3; j++)
5008         {
5009 	  int nelt0 = ((3 - j) * nelt) % 3;
5010 	  int nelt1 = ((3 - j) * nelt + 1) % 3;
5011 	  int nelt2 = ((3 - j) * nelt + 2) % 3;
5012 
5013 	  for (i = 0; i < nelt; i++)
5014 	    {
5015 	      if (3 * i + nelt0 < nelt)
5016 		sel[3 * i + nelt0] = j0++;
5017 	      if (3 * i + nelt1 < nelt)
5018 		sel[3 * i + nelt1] = nelt + j1++;
5019 	      if (3 * i + nelt2 < nelt)
5020 		sel[3 * i + nelt2] = 0;
5021 	    }
5022 	  perm3_mask_low = vect_gen_perm_mask_checked (vectype, sel);
5023 
5024 	  for (i = 0; i < nelt; i++)
5025 	    {
5026 	      if (3 * i + nelt0 < nelt)
5027 		sel[3 * i + nelt0] = 3 * i + nelt0;
5028 	      if (3 * i + nelt1 < nelt)
5029 		sel[3 * i + nelt1] = 3 * i + nelt1;
5030 	      if (3 * i + nelt2 < nelt)
5031 		sel[3 * i + nelt2] = nelt + j2++;
5032 	    }
5033 	  perm3_mask_high = vect_gen_perm_mask_checked (vectype, sel);
5034 
5035 	  vect1 = dr_chain[0];
5036 	  vect2 = dr_chain[1];
5037 
5038 	  /* Create interleaving stmt:
5039 	     low = VEC_PERM_EXPR <vect1, vect2,
5040 				  {j, nelt, *, j + 1, nelt + j + 1, *,
5041 				   j + 2, nelt + j + 2, *, ...}>  */
5042 	  data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5043 	  perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
5044 					   vect2, perm3_mask_low);
5045 	  vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5046 
5047 	  vect1 = data_ref;
5048 	  vect2 = dr_chain[2];
5049 	  /* Create interleaving stmt:
5050 	     low = VEC_PERM_EXPR <vect1, vect2,
5051 				  {0, 1, nelt + j, 3, 4, nelt + j + 1,
5052 				   6, 7, nelt + j + 2, ...}>  */
5053 	  data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5054 	  perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
5055 					   vect2, perm3_mask_high);
5056 	  vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5057 	  (*result_chain)[j] = data_ref;
5058 	}
5059     }
5060   else
5061     {
5062       /* If length is not equal to 3 then only power of 2 is supported.  */
5063       gcc_assert (pow2p_hwi (length));
5064 
5065       for (i = 0, n = nelt / 2; i < n; i++)
5066 	{
5067 	  sel[i * 2] = i;
5068 	  sel[i * 2 + 1] = i + nelt;
5069 	}
5070 	perm_mask_high = vect_gen_perm_mask_checked (vectype, sel);
5071 
5072 	for (i = 0; i < nelt; i++)
5073 	  sel[i] += nelt / 2;
5074 	perm_mask_low = vect_gen_perm_mask_checked (vectype, sel);
5075 
5076 	for (i = 0, n = log_length; i < n; i++)
5077 	  {
5078 	    for (j = 0; j < length/2; j++)
5079 	      {
5080 		vect1 = dr_chain[j];
5081 		vect2 = dr_chain[j+length/2];
5082 
5083 		/* Create interleaving stmt:
5084 		   high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
5085 							...}>  */
5086 		high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
5087 		perm_stmt = gimple_build_assign (high, VEC_PERM_EXPR, vect1,
5088 						 vect2, perm_mask_high);
5089 		vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5090 		(*result_chain)[2*j] = high;
5091 
5092 		/* Create interleaving stmt:
5093 		   low = VEC_PERM_EXPR <vect1, vect2,
5094 					{nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
5095 					 ...}>  */
5096 		low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
5097 		perm_stmt = gimple_build_assign (low, VEC_PERM_EXPR, vect1,
5098 						 vect2, perm_mask_low);
5099 		vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5100 		(*result_chain)[2*j+1] = low;
5101 	      }
5102 	    memcpy (dr_chain.address (), result_chain->address (),
5103 		    length * sizeof (tree));
5104 	  }
5105     }
5106 }
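
/* Illustration only: for the simplest power-of-two case, LENGTH == 2 with
   V8HI vectors (nelt == 8), the loop above emits a single stage of two
   permutes, conceptually

     vect_inter_high = VEC_PERM_EXPR <v0, v1, {0, 8, 1, 9, 2, 10, 3, 11}>;
     vect_inter_low  = VEC_PERM_EXPR <v0, v1, {4, 12, 5, 13, 6, 14, 7, 15}>;

   so RESULT_CHAIN becomes { high, low }.  The names v0 and v1 stand for the
   two input vectors in DR_CHAIN and are hypothetical.  */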
5107 
5108 /* Function vect_setup_realignment
5109 
5110    This function is called when vectorizing an unaligned load using
5111    the dr_explicit_realign[_optimized] scheme.
5112    This function generates the following code at the loop prolog:
5113 
5114       p = initial_addr;
5115    x  msq_init = *(floor(p));   # prolog load
5116       realignment_token = call target_builtin;
5117     loop:
5118    x  msq = phi (msq_init, ---)
5119 
5120    The stmts marked with x are generated only for the case of
5121    dr_explicit_realign_optimized.
5122 
5123    The code above sets up a new (vector) pointer, pointing to the first
5124    location accessed by STMT, and a "floor-aligned" load using that pointer.
5125    It also generates code to compute the "realignment-token" (if the relevant
5126    target hook was defined), and creates a phi-node at the loop-header bb
5127    whose arguments are the result of the prolog-load (created by this
5128    function) and the result of a load that takes place in the loop (to be
5129    created by the caller to this function).
5130 
5131    For the case of dr_explicit_realign_optimized:
5132    The caller to this function uses the phi-result (msq) to create the
5133    realignment code inside the loop, and sets up the missing phi argument,
5134    as follows:
5135     loop:
5136       msq = phi (msq_init, lsq)
5137       lsq = *(floor(p'));        # load in loop
5138       result = realign_load (msq, lsq, realignment_token);
5139 
5140    For the case of dr_explicit_realign:
5141     loop:
5142       msq = *(floor(p)); 	# load in loop
5143       p' = p + (VS-1);
5144       lsq = *(floor(p'));	# load in loop
5145       result = realign_load (msq, lsq, realignment_token);
5146 
5147    Input:
5148    STMT - (scalar) load stmt to be vectorized. This load accesses
5149           a memory location that may be unaligned.
5150    BSI - place where new code is to be inserted.
5151    ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
5152 			      is used.
5153 
5154    Output:
5155    REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
5156                        target hook, if defined.
5157    Return value - the result of the loop-header phi node.  */
5158 
5159 tree
5160 vect_setup_realignment (gimple *stmt, gimple_stmt_iterator *gsi,
5161                         tree *realignment_token,
5162 			enum dr_alignment_support alignment_support_scheme,
5163 			tree init_addr,
5164 			struct loop **at_loop)
5165 {
5166   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5167   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5168   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5169   struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
5170   struct loop *loop = NULL;
5171   edge pe = NULL;
5172   tree scalar_dest = gimple_assign_lhs (stmt);
5173   tree vec_dest;
5174   gimple *inc;
5175   tree ptr;
5176   tree data_ref;
5177   basic_block new_bb;
5178   tree msq_init = NULL_TREE;
5179   tree new_temp;
5180   gphi *phi_stmt;
5181   tree msq = NULL_TREE;
5182   gimple_seq stmts = NULL;
5183   bool inv_p;
5184   bool compute_in_loop = false;
5185   bool nested_in_vect_loop = false;
5186   struct loop *containing_loop = (gimple_bb (stmt))->loop_father;
5187   struct loop *loop_for_initial_load = NULL;
5188 
5189   if (loop_vinfo)
5190     {
5191       loop = LOOP_VINFO_LOOP (loop_vinfo);
5192       nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
5193     }
5194 
5195   gcc_assert (alignment_support_scheme == dr_explicit_realign
5196 	      || alignment_support_scheme == dr_explicit_realign_optimized);
5197 
5198   /* We need to generate three things:
5199      1. the misalignment computation
5200      2. the extra vector load (for the optimized realignment scheme).
5201      3. the phi node for the two vectors from which the realignment is
5202       done (for the optimized realignment scheme).  */
5203 
5204   /* 1. Determine where to generate the misalignment computation.
5205 
5206      If INIT_ADDR is NULL_TREE, this indicates that the misalignment
5207      calculation will be generated by this function, outside the loop (in the
5208      preheader).  Otherwise, INIT_ADDR had already been computed for us by the
5209      caller, inside the loop.
5210 
5211      Background: If the misalignment remains fixed throughout the iterations of
5212      the loop, then both realignment schemes are applicable, and also the
5213      misalignment computation can be done outside LOOP.  This is because we are
5214      vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
5215      are a multiple of VS (the Vector Size), and therefore the misalignment in
5216      different vectorized LOOP iterations is always the same.
5217      The problem arises only if the memory access is in an inner-loop nested
5218      inside LOOP, which is now being vectorized using outer-loop vectorization.
5219      This is the only case when the misalignment of the memory access may not
5220      remain fixed throughout the iterations of the inner-loop (as explained in
5221      detail in vect_supportable_dr_alignment).  In this case, not only is the
5222      optimized realignment scheme not applicable, but also the misalignment
5223      computation (and generation of the realignment token that is passed to
5224      REALIGN_LOAD) have to be done inside the loop.
5225 
5226      In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
5227      or not, which in turn determines if the misalignment is computed inside
5228      the inner-loop, or outside LOOP.  */
5229 
5230   if (init_addr != NULL_TREE || !loop_vinfo)
5231     {
5232       compute_in_loop = true;
5233       gcc_assert (alignment_support_scheme == dr_explicit_realign);
5234     }
5235 
5236 
5237   /* 2. Determine where to generate the extra vector load.
5238 
5239      For the optimized realignment scheme, instead of generating two vector
5240      loads in each iteration, we generate a single extra vector load in the
5241      preheader of the loop, and in each iteration reuse the result of the
5242      vector load from the previous iteration.  In case the memory access is in
5243      an inner-loop nested inside LOOP, which is now being vectorized using
5244      outer-loop vectorization, we need to determine whether this initial vector
5245      load should be generated at the preheader of the inner-loop, or can be
5246      generated at the preheader of LOOP.  If the memory access has no evolution
5247      in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
5248      to be generated inside LOOP (in the preheader of the inner-loop).  */
5249 
5250   if (nested_in_vect_loop)
5251     {
5252       tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
5253       bool invariant_in_outerloop =
5254             (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
5255       loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
5256     }
5257   else
5258     loop_for_initial_load = loop;
5259   if (at_loop)
5260     *at_loop = loop_for_initial_load;
5261 
5262   if (loop_for_initial_load)
5263     pe = loop_preheader_edge (loop_for_initial_load);
5264 
5265   /* 3. For the case of the optimized realignment, create the first vector
5266       load at the loop preheader.  */
5267 
5268   if (alignment_support_scheme == dr_explicit_realign_optimized)
5269     {
5270       /* Create msq_init = *(floor(p1)) in the loop preheader  */
5271       gassign *new_stmt;
5272 
5273       gcc_assert (!compute_in_loop);
5274       vec_dest = vect_create_destination_var (scalar_dest, vectype);
5275       ptr = vect_create_data_ref_ptr (stmt, vectype, loop_for_initial_load,
5276 				      NULL_TREE, &init_addr, NULL, &inc,
5277 				      true, &inv_p);
5278       if (TREE_CODE (ptr) == SSA_NAME)
5279 	new_temp = copy_ssa_name (ptr);
5280       else
5281 	new_temp = make_ssa_name (TREE_TYPE (ptr));
5282       new_stmt = gimple_build_assign
5283 		   (new_temp, BIT_AND_EXPR, ptr,
5284 		    build_int_cst (TREE_TYPE (ptr),
5285 				   -(HOST_WIDE_INT)TYPE_ALIGN_UNIT (vectype)));
5286       new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5287       gcc_assert (!new_bb);
5288       data_ref
5289 	= build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
5290 		  build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
5291       new_stmt = gimple_build_assign (vec_dest, data_ref);
5292       new_temp = make_ssa_name (vec_dest, new_stmt);
5293       gimple_assign_set_lhs (new_stmt, new_temp);
5294       if (pe)
5295         {
5296           new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5297           gcc_assert (!new_bb);
5298         }
5299       else
5300          gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5301 
5302       msq_init = gimple_assign_lhs (new_stmt);
5303     }
5304 
5305   /* 4. Create realignment token using a target builtin, if available.
5306       It is done either inside the containing loop, or before LOOP (as
5307       determined above).  */
5308 
5309   if (targetm.vectorize.builtin_mask_for_load)
5310     {
5311       gcall *new_stmt;
5312       tree builtin_decl;
5313 
5314       /* Compute INIT_ADDR - the initial address accessed by this memref.  */
5315       if (!init_addr)
5316 	{
5317 	  /* Generate the INIT_ADDR computation outside LOOP.  */
5318 	  init_addr = vect_create_addr_base_for_vector_ref (stmt, &stmts,
5319 							NULL_TREE, loop);
5320           if (loop)
5321             {
5322    	      pe = loop_preheader_edge (loop);
5323 	      new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
5324 	      gcc_assert (!new_bb);
5325             }
5326           else
5327              gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
5328 	}
5329 
5330       builtin_decl = targetm.vectorize.builtin_mask_for_load ();
5331       new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
5332       vec_dest =
5333 	vect_create_destination_var (scalar_dest,
5334 				     gimple_call_return_type (new_stmt));
5335       new_temp = make_ssa_name (vec_dest, new_stmt);
5336       gimple_call_set_lhs (new_stmt, new_temp);
5337 
5338       if (compute_in_loop)
5339 	gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5340       else
5341 	{
5342 	  /* Generate the misalignment computation outside LOOP.  */
5343 	  pe = loop_preheader_edge (loop);
5344 	  new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5345 	  gcc_assert (!new_bb);
5346 	}
5347 
5348       *realignment_token = gimple_call_lhs (new_stmt);
5349 
5350       /* The result of the CALL_EXPR to this builtin is determined from
5351          the value of the parameter and no global variables are touched
5352          which makes the builtin a "const" function.  Requiring the
5353          builtin to have the "const" attribute makes it unnecessary
5354          to call mark_call_clobbered.  */
5355       gcc_assert (TREE_READONLY (builtin_decl));
5356     }
5357 
5358   if (alignment_support_scheme == dr_explicit_realign)
5359     return msq;
5360 
5361   gcc_assert (!compute_in_loop);
5362   gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
5363 
5364 
5365   /* 5. Create msq = phi <msq_init, lsq> in loop  */
5366 
5367   pe = loop_preheader_edge (containing_loop);
5368   vec_dest = vect_create_destination_var (scalar_dest, vectype);
5369   msq = make_ssa_name (vec_dest);
5370   phi_stmt = create_phi_node (msq, containing_loop->header);
5371   add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);
5372 
5373   return msq;
5374 }
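
/* Illustration only: the "floor (p)" used in the scheme above is computed by
   masking off the low bits of the pointer with the negated vector alignment.
   Assuming a 16-byte-aligned vector type, the prolog of the optimized scheme
   conceptually becomes

     new_temp = ptr & -16;    <-- BIT_AND_EXPR with -TYPE_ALIGN_UNIT (vectype)
     msq_init = *new_temp;    <-- the extra vector load in the preheader

   Other vector alignments only change the mask constant.  */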
5375 
5376 
5377 /* Function vect_grouped_load_supported.
5378 
5379    COUNT is the size of the load group (the number of statements plus the
5380    number of gaps).  SINGLE_ELEMENT_P is true if there is actually
5381    only one statement, with a gap of COUNT - 1.
5382 
5383    Returns true if a suitable permute exists.  */
5384 
5385 bool
5386 vect_grouped_load_supported (tree vectype, bool single_element_p,
5387 			     unsigned HOST_WIDE_INT count)
5388 {
5389   machine_mode mode = TYPE_MODE (vectype);
5390 
5391   /* If this is single-element interleaving with an element distance
5392      that leaves unused vector loads around, punt - we at least create
5393      very sub-optimal code in that case (and blow up memory,
5394      see PR65518).  */
5395   if (single_element_p && count > TYPE_VECTOR_SUBPARTS (vectype))
5396     {
5397       if (dump_enabled_p ())
5398 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5399 			 "single-element interleaving not supported "
5400 			 "for non-adjacent vector loads\n");
5401       return false;
5402     }
5403 
5404   /* vect_permute_load_chain requires the group size to be equal to 3 or
5405      be a power of two.  */
5406   if (count != 3 && exact_log2 (count) == -1)
5407     {
5408       if (dump_enabled_p ())
5409 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5410 			 "the size of the group of accesses"
5411 			 " is not a power of 2 or not equal to 3\n");
5412       return false;
5413     }
5414 
5415   /* Check that the permutation is supported.  */
5416   if (VECTOR_MODE_P (mode))
5417     {
5418       unsigned int i, j, nelt = GET_MODE_NUNITS (mode);
5419       unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
5420 
5421       if (count == 3)
5422 	{
5423 	  unsigned int k;
5424 	  for (k = 0; k < 3; k++)
5425 	    {
5426 	      for (i = 0; i < nelt; i++)
5427 		if (3 * i + k < 2 * nelt)
5428 		  sel[i] = 3 * i + k;
5429 		else
5430 		  sel[i] = 0;
5431 	      if (!can_vec_perm_p (mode, false, sel))
5432 		{
5433 		  if (dump_enabled_p ())
5434 		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5435 				     "shuffle of 3 loads is not supported by"
5436 				     " target\n");
5437 		  return false;
5438 		}
5439 	      for (i = 0, j = 0; i < nelt; i++)
5440 		if (3 * i + k < 2 * nelt)
5441 		  sel[i] = i;
5442 		else
5443 		  sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5444 	      if (!can_vec_perm_p (mode, false, sel))
5445 		{
5446 		  if (dump_enabled_p ())
5447 		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5448 				     "shuffle of 3 loads is not supported by"
5449 				     " target\n");
5450 		  return false;
5451 		}
5452 	    }
5453 	  return true;
5454 	}
5455       else
5456 	{
5457 	  /* If length is not equal to 3 then only power of 2 is supported.  */
5458 	  gcc_assert (pow2p_hwi (count));
5459 	  for (i = 0; i < nelt; i++)
5460 	    sel[i] = i * 2;
5461 	  if (can_vec_perm_p (mode, false, sel))
5462 	    {
5463 	      for (i = 0; i < nelt; i++)
5464 		sel[i] = i * 2 + 1;
5465 	      if (can_vec_perm_p (mode, false, sel))
5466 		return true;
5467 	    }
5468         }
5469     }
5470 
5471   if (dump_enabled_p ())
5472     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5473 		     "extract even/odd not supported by target\n");
5474   return false;
5475 }
5476 
5477 /* Return TRUE if vec_load_lanes is available for COUNT vectors of
5478    type VECTYPE.  */
5479 
5480 bool
5481 vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
5482 {
5483   return vect_lanes_optab_supported_p ("vec_load_lanes",
5484 				       vec_load_lanes_optab,
5485 				       vectype, count);
5486 }
5487 
5488 /* Function vect_permute_load_chain.
5489 
5490    Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
5491    a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
5492    the input data correctly.  Return the final references for loads in
5493    RESULT_CHAIN.
5494 
5495    E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5496    The input is 4 vectors each containing 8 elements. We assign a number to each
5497    element, the input sequence is:
5498 
5499    1st vec:   0  1  2  3  4  5  6  7
5500    2nd vec:   8  9 10 11 12 13 14 15
5501    3rd vec:  16 17 18 19 20 21 22 23
5502    4th vec:  24 25 26 27 28 29 30 31
5503 
5504    The output sequence should be:
5505 
5506    1st vec:  0 4  8 12 16 20 24 28
5507    2nd vec:  1 5  9 13 17 21 25 29
5508    3rd vec:  2 6 10 14 18 22 26 30
5509    4th vec:  3 7 11 15 19 23 27 31
5510 
5511    i.e., the first output vector should contain the first elements of each
5512    interleaving group, etc.
5513 
5514    We use extract_even/odd instructions to create such output.  The input of
5515    each extract_even/odd operation is two vectors
5516    1st vec    2nd vec
5517    0 1 2 3    4 5 6 7
5518 
5519    and the output is the vector of extracted even/odd elements.  The output of
5520    extract_even will be:   0 2 4 6
5521    and of extract_odd:     1 3 5 7
5522 
5523 
5524    The permutation is done in log LENGTH stages.  In each stage extract_even
5525    and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
5526    their order.  In our example,
5527 
5528    E1: extract_even (1st vec, 2nd vec)
5529    E2: extract_odd (1st vec, 2nd vec)
5530    E3: extract_even (3rd vec, 4th vec)
5531    E4: extract_odd (3rd vec, 4th vec)
5532 
5533    The output for the first stage will be:
5534 
5535    E1:  0  2  4  6  8 10 12 14
5536    E2:  1  3  5  7  9 11 13 15
5537    E3: 16 18 20 22 24 26 28 30
5538    E4: 17 19 21 23 25 27 29 31
5539 
5540    In order to proceed and create the correct sequence for the next stage (or
5541    for the correct output, if the second stage is the last one, as in our
5542    example), we first put the output of extract_even operation and then the
5543    output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
5544    The input for the second stage is:
5545 
5546    1st vec (E1):  0  2  4  6  8 10 12 14
5547    2nd vec (E3): 16 18 20 22 24 26 28 30
5548    3rd vec (E2):  1  3  5  7  9 11 13 15
5549    4th vec (E4): 17 19 21 23 25 27 29 31
5550 
5551    The output of the second stage:
5552 
5553    E1: 0 4  8 12 16 20 24 28
5554    E2: 2 6 10 14 18 22 26 30
5555    E3: 1 5  9 13 17 21 25 29
5556    E4: 3 7 11 15 19 23 27 31
5557 
5558    And RESULT_CHAIN after reordering:
5559 
5560    1st vec (E1):  0 4  8 12 16 20 24 28
5561    2nd vec (E3):  1 5  9 13 17 21 25 29
5562    3rd vec (E2):  2 6 10 14 18 22 26 30
5563    4th vec (E4):  3 7 11 15 19 23 27 31.  */
5564 
5565 static void
5566 vect_permute_load_chain (vec<tree> dr_chain,
5567 			 unsigned int length,
5568 			 gimple *stmt,
5569 			 gimple_stmt_iterator *gsi,
5570 			 vec<tree> *result_chain)
5571 {
5572   tree data_ref, first_vect, second_vect;
5573   tree perm_mask_even, perm_mask_odd;
5574   tree perm3_mask_low, perm3_mask_high;
5575   gimple *perm_stmt;
5576   tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5577   unsigned int i, j, log_length = exact_log2 (length);
5578   unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype);
5579   unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
5580 
5581   result_chain->quick_grow (length);
5582   memcpy (result_chain->address (), dr_chain.address (),
5583 	  length * sizeof (tree));
5584 
5585   if (length == 3)
5586     {
5587       unsigned int k;
5588 
5589       for (k = 0; k < 3; k++)
5590 	{
5591 	  for (i = 0; i < nelt; i++)
5592 	    if (3 * i + k < 2 * nelt)
5593 	      sel[i] = 3 * i + k;
5594 	    else
5595 	      sel[i] = 0;
5596 	  perm3_mask_low = vect_gen_perm_mask_checked (vectype, sel);
5597 
5598 	  for (i = 0, j = 0; i < nelt; i++)
5599 	    if (3 * i + k < 2 * nelt)
5600 	      sel[i] = i;
5601 	    else
5602 	      sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5603 
5604 	  perm3_mask_high = vect_gen_perm_mask_checked (vectype, sel);
5605 
5606 	  first_vect = dr_chain[0];
5607 	  second_vect = dr_chain[1];
5608 
5609 	  /* Create interleaving stmt (low part of):
5610 	     low = VEC_PERM_EXPR <first_vect, second_vect, {k, 3 + k, 6 + k,
5611 							     ...}>  */
5612 	  data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5613 	  perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5614 					   second_vect, perm3_mask_low);
5615 	  vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5616 
5617 	  /* Create interleaving stmt (high part of):
5618 	     high = VEC_PERM_EXPR <first_vect, second_vect, {k, 3 + k, 6 + k,
5619 							      ...}>  */
5620 	  first_vect = data_ref;
5621 	  second_vect = dr_chain[2];
5622 	  data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5623 	  perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5624 					   second_vect, perm3_mask_high);
5625 	  vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5626 	  (*result_chain)[k] = data_ref;
5627 	}
5628     }
5629   else
5630     {
5631       /* If length is not equal to 3 then only power of 2 is supported.  */
5632       gcc_assert (pow2p_hwi (length));
5633 
5634       for (i = 0; i < nelt; ++i)
5635 	sel[i] = i * 2;
5636       perm_mask_even = vect_gen_perm_mask_checked (vectype, sel);
5637 
5638       for (i = 0; i < nelt; ++i)
5639 	sel[i] = i * 2 + 1;
5640       perm_mask_odd = vect_gen_perm_mask_checked (vectype, sel);
5641 
5642       for (i = 0; i < log_length; i++)
5643 	{
5644 	  for (j = 0; j < length; j += 2)
5645 	    {
5646 	      first_vect = dr_chain[j];
5647 	      second_vect = dr_chain[j+1];
5648 
5649 	      /* data_ref = permute_even (first_data_ref, second_data_ref);  */
5650 	      data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
5651 	      perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5652 					       first_vect, second_vect,
5653 					       perm_mask_even);
5654 	      vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5655 	      (*result_chain)[j/2] = data_ref;
5656 
5657 	      /* data_ref = permute_odd (first_data_ref, second_data_ref);  */
5658 	      data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
5659 	      perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5660 					       first_vect, second_vect,
5661 					       perm_mask_odd);
5662 	      vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5663 	      (*result_chain)[j/2+length/2] = data_ref;
5664 	    }
5665 	  memcpy (dr_chain.address (), result_chain->address (),
5666 		  length * sizeof (tree));
5667 	}
5668     }
5669 }
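
/* Illustration only: for V8HI vectors (nelt == 8) the power-of-two path above
   uses the masks

     perm_mask_even = {0, 2, 4, 6, 8, 10, 12, 14}
     perm_mask_odd  = {1, 3, 5, 7, 9, 11, 13, 15}

   i.e. an extract-even and an extract-odd of the 16-element concatenation of
   each vector pair, applied log2 (LENGTH) times.  */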
5670 
5671 /* Function vect_shift_permute_load_chain.
5672 
5673    Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate
5674    sequence of stmts to reorder the input data accordingly.
5675    Return the final references for loads in RESULT_CHAIN.
5676    Return true if successful, false otherwise.
5677 
5678    E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
5679    The input is 3 vectors each containing 8 elements.  We assign a
5680    number to each element, the input sequence is:
5681 
5682    1st vec:   0  1  2  3  4  5  6  7
5683    2nd vec:   8  9 10 11 12 13 14 15
5684    3rd vec:  16 17 18 19 20 21 22 23
5685 
5686    The output sequence should be:
5687 
5688    1st vec:  0 3 6  9 12 15 18 21
5689    2nd vec:  1 4 7 10 13 16 19 22
5690    3rd vec:  2 5 8 11 14 17 20 23
5691 
5692    We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.
5693 
5694    First we shuffle all 3 vectors to get correct elements order:
5695 
5696    1st vec:  ( 0  3  6) ( 1  4  7) ( 2  5)
5697    2nd vec:  ( 8 11 14) ( 9 12 15) (10 13)
5698    3rd vec:  (16 19 22) (17 20 23) (18 21)
5699 
5700    Next we unite and shift vector 3 times:
5701 
5702    1st step:
5703      shift right by 6 the concatenation of:
5704      "1st vec" and  "2nd vec"
5705        ( 0  3  6) ( 1  4  7) |( 2  5) _ ( 8 11 14) ( 9 12 15)| (10 13)
5706      "2nd vec" and  "3rd vec"
5707        ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
5708      "3rd vec" and  "1st vec"
5709        (16 19 22) (17 20 23) |(18 21) _ ( 0  3  6) ( 1  4  7)| ( 2  5)
5710 			     | New vectors                   |
5711 
5712      So that now new vectors are:
5713 
5714      1st vec:  ( 2  5) ( 8 11 14) ( 9 12 15)
5715      2nd vec:  (10 13) (16 19 22) (17 20 23)
5716      3rd vec:  (18 21) ( 0  3  6) ( 1  4  7)
5717 
5718    2nd step:
5719      shift right by 5 the concatenation of:
5720      "1st vec" and  "3rd vec"
5721        ( 2  5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0  3  6)| ( 1  4  7)
5722      "2nd vec" and  "1st vec"
5723        (10 13) (16 19 22) |(17 20 23) _ ( 2  5) ( 8 11 14)| ( 9 12 15)
5724      "3rd vec" and  "2nd vec"
5725        (18 21) ( 0  3  6) |( 1  4  7) _ (10 13) (16 19 22)| (17 20 23)
5726 			  | New vectors                   |
5727 
5728      So that now new vectors are:
5729 
5730      1st vec:  ( 9 12 15) (18 21) ( 0  3  6)
5731      2nd vec:  (17 20 23) ( 2  5) ( 8 11 14)
5732      3rd vec:  ( 1  4  7) (10 13) (16 19 22) READY
5733 
5734    3rd step:
5735      shift right by 5 the concatenation of:
5736      "1st vec" and  "1st vec"
5737        ( 9 12 15) (18 21) |( 0  3  6) _ ( 9 12 15) (18 21)| ( 0  3  6)
5738      shift right by 3 the concatenation of:
5739      "2nd vec" and  "2nd vec"
5740                (17 20 23) |( 2  5) ( 8 11 14) _ (17 20 23)| ( 2  5) ( 8 11 14)
5741 			  | New vectors                   |
5742 
5743      So that now all vectors are READY:
5744      1st vec:  ( 0  3  6) ( 9 12 15) (18 21)
5745      2nd vec:  ( 2  5) ( 8 11 14) (17 20 23)
5746      3rd vec:  ( 1  4  7) (10 13) (16 19 22)
5747 
5748    This algorithm is faster than the one in vect_permute_load_chain if:
5749      1.  "shift of a concatenation" is faster than general permutation.
5750 	 This is usually so.
5751      2.  The TARGET machine can't execute vector instructions in parallel.
5752 	 This is because each step of the algorithm depends on the previous
5753 	 one.  The algorithm in vect_permute_load_chain is much more parallel.
5754 
5755    The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.
5756 */
5757 
5758 static bool
5759 vect_shift_permute_load_chain (vec<tree> dr_chain,
5760 			       unsigned int length,
5761 			       gimple *stmt,
5762 			       gimple_stmt_iterator *gsi,
5763 			       vec<tree> *result_chain)
5764 {
5765   tree vect[3], vect_shift[3], data_ref, first_vect, second_vect;
5766   tree perm2_mask1, perm2_mask2, perm3_mask;
5767   tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask;
5768   gimple *perm_stmt;
5769 
5770   tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5771   unsigned int i;
5772   unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype);
5773   unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
5774   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5775   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5776 
5777   result_chain->quick_grow (length);
5778   memcpy (result_chain->address (), dr_chain.address (),
5779 	  length * sizeof (tree));
5780 
5781   if (pow2p_hwi (length) && LOOP_VINFO_VECT_FACTOR (loop_vinfo) > 4)
5782     {
5783       unsigned int j, log_length = exact_log2 (length);
5784       for (i = 0; i < nelt / 2; ++i)
5785 	sel[i] = i * 2;
5786       for (i = 0; i < nelt / 2; ++i)
5787 	sel[nelt / 2 + i] = i * 2 + 1;
5788       if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5789 	{
5790 	  if (dump_enabled_p ())
5791 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5792 			     "shuffle of 2 fields structure is not "
5793 			     "supported by target\n");
5794 	  return false;
5795 	}
5796       perm2_mask1 = vect_gen_perm_mask_checked (vectype, sel);
5797 
5798       for (i = 0; i < nelt / 2; ++i)
5799 	sel[i] = i * 2 + 1;
5800       for (i = 0; i < nelt / 2; ++i)
5801 	sel[nelt / 2 + i] = i * 2;
5802       if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5803 	{
5804 	  if (dump_enabled_p ())
5805 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5806 			     "shuffle of 2 fields structure is not "
5807 			     "supported by target\n");
5808 	  return false;
5809 	}
5810       perm2_mask2 = vect_gen_perm_mask_checked (vectype, sel);
5811 
5812       /* Generating permutation constant to shift all elements.
5813 	 For vector length 8 it is {4 5 6 7 8 9 10 11}.  */
5814       for (i = 0; i < nelt; i++)
5815 	sel[i] = nelt / 2 + i;
5816       if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5817 	{
5818 	  if (dump_enabled_p ())
5819 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5820 			     "shift permutation is not supported by target\n");
5821 	  return false;
5822 	}
5823       shift1_mask = vect_gen_perm_mask_checked (vectype, sel);
5824 
5825       /* Generating permutation constant to select vector from 2.
5826 	 For vector length 8 it is {0 1 2 3 12 13 14 15}.  */
5827       for (i = 0; i < nelt / 2; i++)
5828 	sel[i] = i;
5829       for (i = nelt / 2; i < nelt; i++)
5830 	sel[i] = nelt + i;
5831       if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5832 	{
5833 	  if (dump_enabled_p ())
5834 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5835 			     "select is not supported by target\n");
5836 	  return false;
5837 	}
5838       select_mask = vect_gen_perm_mask_checked (vectype, sel);
5839 
5840       for (i = 0; i < log_length; i++)
5841 	{
5842 	  for (j = 0; j < length; j += 2)
5843 	    {
5844 	      first_vect = dr_chain[j];
5845 	      second_vect = dr_chain[j + 1];
5846 
5847 	      data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
5848 	      perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5849 					       first_vect, first_vect,
5850 					       perm2_mask1);
5851 	      vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5852 	      vect[0] = data_ref;
5853 
5854 	      data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
5855 	      perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5856 					       second_vect, second_vect,
5857 					       perm2_mask2);
5858 	      vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5859 	      vect[1] = data_ref;
5860 
5861 	      data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift");
5862 	      perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5863 					       vect[0], vect[1], shift1_mask);
5864 	      vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5865 	      (*result_chain)[j/2 + length/2] = data_ref;
5866 
5867 	      data_ref = make_temp_ssa_name (vectype, NULL, "vect_select");
5868 	      perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5869 					       vect[0], vect[1], select_mask);
5870 	      vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5871 	      (*result_chain)[j/2] = data_ref;
5872 	    }
5873 	  memcpy (dr_chain.address (), result_chain->address (),
5874 		  length * sizeof (tree));
5875 	}
5876       return true;
5877     }
5878   if (length == 3 && LOOP_VINFO_VECT_FACTOR (loop_vinfo) > 2)
5879     {
5880       unsigned int k = 0, l = 0;
5881 
5882       /* Generating permutation constant to get all elements in right order.
5883 	 For vector length 8 it is {0 3 6 1 4 7 2 5}.  */
5884       for (i = 0; i < nelt; i++)
5885 	{
5886 	  if (3 * k + (l % 3) >= nelt)
5887 	    {
5888 	      k = 0;
5889 	      l += (3 - (nelt % 3));
5890 	    }
5891 	  sel[i] = 3 * k + (l % 3);
5892 	  k++;
5893 	}
5894       if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5895 	{
5896 	  if (dump_enabled_p ())
5897 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5898 			     "shuffle of 3 fields structure is not "
5899 			     "supported by target\n");
5900 	  return false;
5901 	}
5902       perm3_mask = vect_gen_perm_mask_checked (vectype, sel);
5903 
5904       /* Generating permutation constant to shift all elements.
5905 	 For vector length 8 it is {6 7 8 9 10 11 12 13}.  */
5906       for (i = 0; i < nelt; i++)
5907 	sel[i] = 2 * (nelt / 3) + (nelt % 3) + i;
5908       if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5909 	{
5910 	  if (dump_enabled_p ())
5911 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5912 			     "shift permutation is not supported by target\n");
5913 	  return false;
5914 	}
5915       shift1_mask = vect_gen_perm_mask_checked (vectype, sel);
5916 
5917       /* Generating permutation constant to shift all elements.
5918 	 For vector length 8 it is {5 6 7 8 9 10 11 12}.  */
5919       for (i = 0; i < nelt; i++)
5920 	sel[i] = 2 * (nelt / 3) + 1 + i;
5921       if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5922 	{
5923 	  if (dump_enabled_p ())
5924 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5925 			     "shift permutation is not supported by target\n");
5926 	  return false;
5927 	}
5928       shift2_mask = vect_gen_perm_mask_checked (vectype, sel);
5929 
5930       /* Generating permutation constant to shift all elements.
5931 	 For vector length 8 it is {3 4 5 6 7 8 9 10}.  */
5932       for (i = 0; i < nelt; i++)
5933 	sel[i] = (nelt / 3) + (nelt % 3) / 2 + i;
5934       if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5935 	{
5936 	  if (dump_enabled_p ())
5937 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5938 			     "shift permutation is not supported by target\n");
5939 	  return false;
5940 	}
5941       shift3_mask = vect_gen_perm_mask_checked (vectype, sel);
5942 
5943       /* Generating permutation constant to shift all elements.
5944 	 For vector length 8 it is {5 6 7 8 9 10 11 12}.  */
5945       for (i = 0; i < nelt; i++)
5946 	sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i;
5947       if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5948 	{
5949 	  if (dump_enabled_p ())
5950 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5951 			     "shift permutation is not supported by target\n");
5952 	  return false;
5953 	}
5954       shift4_mask = vect_gen_perm_mask_checked (vectype, sel);
5955 
5956       for (k = 0; k < 3; k++)
5957 	{
5958 	  data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3");
5959 	  perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5960 					   dr_chain[k], dr_chain[k],
5961 					   perm3_mask);
5962 	  vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5963 	  vect[k] = data_ref;
5964 	}
5965 
5966       for (k = 0; k < 3; k++)
5967 	{
5968 	  data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1");
5969 	  perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5970 					   vect[k % 3], vect[(k + 1) % 3],
5971 					   shift1_mask);
5972 	  vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5973 	  vect_shift[k] = data_ref;
5974 	}
5975 
5976       for (k = 0; k < 3; k++)
5977 	{
5978 	  data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2");
5979 	  perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5980 					   vect_shift[(4 - k) % 3],
5981 					   vect_shift[(3 - k) % 3],
5982 					   shift2_mask);
5983 	  vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5984 	  vect[k] = data_ref;
5985 	}
5986 
5987       (*result_chain)[3 - (nelt % 3)] = vect[2];
5988 
5989       data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3");
5990       perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[0],
5991 				       vect[0], shift3_mask);
5992       vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5993       (*result_chain)[nelt % 3] = data_ref;
5994 
5995       data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4");
5996       perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[1],
5997 				       vect[1], shift4_mask);
5998       vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5999       (*result_chain)[0] = data_ref;
6000       return true;
6001     }
6002   return false;
6003 }
6004 
6005 /* Function vect_transform_grouped_load.
6006 
6007    Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
6008    to perform their permutation, and record the resulting vector statements
6009    against the corresponding scalar statements.
6010 */
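/* For example (a hypothetical scenario, not taken from a particular
   testcase): with a load group of size 2 such as

     for (i = 0; i < n; i++)
       {
         x = a[2 * i];      // even elements
         y = a[2 * i + 1];  // odd elements
         ...
       }

   DR_CHAIN holds the vectors loaded from consecutive locations of A, and the
   permutation statements built here leave one vector of the even elements and
   one vector of the odd elements in RESULT_CHAIN for the vectorized uses of X
   and Y.  */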
6011 
6012 void
6013 vect_transform_grouped_load (gimple *stmt, vec<tree> dr_chain, int size,
6014 			     gimple_stmt_iterator *gsi)
6015 {
6016   machine_mode mode;
6017   vec<tree> result_chain = vNULL;
6018 
6019   /* DR_CHAIN contains input data-refs that are a part of the interleaving.
6020      RESULT_CHAIN is the output of vect_permute_load_chain, it contains permuted
6021      vectors, that are ready for vector computation.  */
6022   result_chain.create (size);
6023 
6024   /* If the reassociation width for the vector type is 2 or greater, the
6025      target machine can execute two or more vector instructions in parallel.
6026      Otherwise try to build the chain with vect_shift_permute_load_chain.  */
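  /* Concretely (as the condition below is written): a group whose size is a
     power of two always goes through vect_permute_load_chain, while e.g. a
     group of size 3 on a target whose reassociation width for VEC_PERM_EXPR
     is 1 is first tried with vect_shift_permute_load_chain and falls back to
     vect_permute_load_chain if the shift permutations are not supported.  */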
6027   mode = TYPE_MODE (STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt)));
6028   if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1
6029       || pow2p_hwi (size)
6030       || !vect_shift_permute_load_chain (dr_chain, size, stmt,
6031 					 gsi, &result_chain))
6032     vect_permute_load_chain (dr_chain, size, stmt, gsi, &result_chain);
6033   vect_record_grouped_load_vectors (stmt, result_chain);
6034   result_chain.release ();
6035 }
6036 
6037 /* RESULT_CHAIN contains the output of a group of grouped loads that were
6038    generated as part of the vectorization of STMT.  Assign the statement
6039    for each vector to the associated scalar statement.  */
6040 
6041 void
6042 vect_record_grouped_load_vectors (gimple *stmt, vec<tree> result_chain)
6043 {
6044   gimple *first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt));
6045   gimple *next_stmt, *new_stmt;
6046   unsigned int i, gap_count;
6047   tree tmp_data_ref;
6048 
6049   /* Put a permuted data-ref in the VECTORIZED_STMT field.
6050      Since we scan the chain starting from its first node, their order
6051      corresponds to the order of data-refs in RESULT_CHAIN.  */
6052   next_stmt = first_stmt;
6053   gap_count = 1;
6054   FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
6055     {
6056       if (!next_stmt)
6057 	break;
6058 
6059       /* Skip the gaps.  Loads created for the gaps will be removed by the
6060 	 dead code elimination pass later.  No need to check for the first
6061 	 stmt in the group, since it always exists.
6062 	 GROUP_GAP is the number of steps in elements from the previous
6063 	 access (if there is no gap, GROUP_GAP is 1).  We skip loads that
6064 	 correspond to the gaps.  */
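      /* For example (a hypothetical group, for illustration only): if the
         scalar code reads a[3*i] and a[3*i + 2] but not a[3*i + 1], the
         second statement has GROUP_GAP 2, so one entry of RESULT_CHAIN is
         skipped here and the load created for the gap is left for DCE.  */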
6065       if (next_stmt != first_stmt
6066           && gap_count < GROUP_GAP (vinfo_for_stmt (next_stmt)))
6067 	{
6068 	  gap_count++;
6069 	  continue;
6070 	}
6071 
6072       while (next_stmt)
6073         {
6074 	  new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
6075 	  /* We assume that if VEC_STMT is not NULL, this is a case of multiple
6076 	     copies, and we put the new vector statement in the first available
6077 	     RELATED_STMT.  */
6078 	  if (!STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)))
6079 	    STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)) = new_stmt;
6080 	  else
6081             {
6082               if (!GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
6083                 {
6084 		  gimple *prev_stmt =
6085 		    STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt));
6086 		  gimple *rel_stmt =
6087 		    STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt));
6088 		  while (rel_stmt)
6089 		    {
6090 		      prev_stmt = rel_stmt;
6091 		      rel_stmt
6092 			= STMT_VINFO_RELATED_STMT (vinfo_for_stmt (rel_stmt));
6093 		    }
6094 
6095 		  STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt))
6096 		    = new_stmt;
6097                 }
6098             }
6099 
6100 	  next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
6101 	  gap_count = 1;
6102 	  /* If NEXT_STMT accesses the same DR as the previous statement,
6103 	     put the same TMP_DATA_REF as its vectorized statement; otherwise
6104 	     get the next data-ref from RESULT_CHAIN.  */
6105 	  if (!next_stmt || !GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
6106 	    break;
6107         }
6108     }
6109 }
6110 
6111 /* Function vect_can_force_dr_alignment_p.
6112 
6113    Return whether the alignment of DECL can be forced to an ALIGNMENT-bit
6114    boundary.  */
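/* For example (an illustrative note, not an exhaustive list): the alignment
   of a file-scope

     static float a[256];

   can typically be raised as long as the requested alignment does not exceed
   MAX_OFILE_ALIGNMENT, an automatic variable is limited by
   MAX_STACK_ALIGNMENT, and a decl whose symtab node cannot increase its
   alignment (e.g. because it is defined elsewhere) is rejected.  */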
6115 
6116 bool
6117 vect_can_force_dr_alignment_p (const_tree decl, unsigned int alignment)
6118 {
6119   if (!VAR_P (decl))
6120     return false;
6121 
6122   if (decl_in_symtab_p (decl)
6123       && !symtab_node::get (decl)->can_increase_alignment_p ())
6124     return false;
6125 
6126   if (TREE_STATIC (decl))
6127     return (alignment <= MAX_OFILE_ALIGNMENT);
6128   else
6129     return (alignment <= MAX_STACK_ALIGNMENT);
6130 }
6131 
6132 
6133 /* Return whether the data reference DR is supported with respect to its
6134    alignment.
6135    If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even
6136    if it is aligned, i.e., check if it is possible to vectorize it with a
6137    different alignment.  */
6138 
6139 enum dr_alignment_support
6140 vect_supportable_dr_alignment (struct data_reference *dr,
6141                                bool check_aligned_accesses)
6142 {
6143   gimple *stmt = DR_STMT (dr);
6144   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
6145   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6146   machine_mode mode = TYPE_MODE (vectype);
6147   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6148   struct loop *vect_loop = NULL;
6149   bool nested_in_vect_loop = false;
6150 
6151   if (aligned_access_p (dr) && !check_aligned_accesses)
6152     return dr_aligned;
6153 
6154   /* For now assume all conditional loads/stores support unaligned
6155      access without any special code.  */
6156   if (is_gimple_call (stmt)
6157       && gimple_call_internal_p (stmt)
6158       && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
6159 	  || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
6160     return dr_unaligned_supported;
6161 
6162   if (loop_vinfo)
6163     {
6164       vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
6165       nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt);
6166     }
6167 
6168   /* Possibly unaligned access.  */
6169 
6170   /* We can choose between using the implicit realignment scheme (generating
6171      a misaligned_move stmt) and the explicit realignment scheme (generating
6172      aligned loads with a REALIGN_LOAD).  There are two variants to the
6173      explicit realignment scheme: optimized, and unoptimized.
6174      We can optimize the realignment only if the step between consecutive
6175      vector loads is equal to the vector size.  Since the vector memory
6176      accesses advance in steps of VS (Vector Size) in the vectorized loop, it
6177      is guaranteed that the misalignment amount remains the same throughout the
6178      execution of the vectorized loop.  Therefore, we can create the
6179      "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
6180      at the loop preheader.
6181 
6182      However, in the case of outer-loop vectorization, when vectorizing a
6183      memory access in the inner-loop nested within the LOOP that is now being
6184      vectorized, while it is guaranteed that the misalignment of the
6185      vectorized memory access will remain the same in different outer-loop
6186      iterations, it is *not* guaranteed that is will remain the same throughout
6187      the execution of the inner-loop.  This is because the inner-loop advances
6188      with the original scalar step (and not in steps of VS).  If the inner-loop
6189      step happens to be a multiple of VS, then the misalignment remains fixed
6190      and we can use the optimized realignment scheme.  For example:
6191 
6192       for (i=0; i<N; i++)
6193         for (j=0; j<M; j++)
6194           s += a[i+j];
6195 
6196      When vectorizing the i-loop in the above example, the step between
6197      consecutive vector loads is 1, and so the misalignment does not remain
6198      fixed across the execution of the inner-loop, and the realignment cannot
6199      be optimized (as illustrated in the following pseudo vectorized loop):
6200 
6201       for (i=0; i<N; i+=4)
6202         for (j=0; j<M; j++){
6203           vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
6204                          // when j is {0,1,2,3,4,5,6,7,...} respectively.
6205                          // (assuming that we start from an aligned address).
6206           }
6207 
6208      We therefore have to use the unoptimized realignment scheme:
6209 
6210       for (i=0; i<N; i+=4)
6211         for (j=k; j<M; j+=4)
6212           vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
6213                          // that the misalignment of the initial address is
6214                          // 0).
6215 
6216      The loop can then be vectorized as follows:
6217 
6218       for (k=0; k<4; k++){
6219         rt = get_realignment_token (&vp[k]);
6220         for (i=0; i<N; i+=4){
6221           v1 = vp[i+k];
6222           for (j=k; j<M; j+=4){
6223             v2 = vp[i+j+VS-1];
6224             va = REALIGN_LOAD <v1,v2,rt>;
6225             vs += va;
6226             v1 = v2;
6227           }
6228         }
6229       } */
6230 
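  /* Informal summary of the classification below (the code is authoritative):
     a read on a target that provides vec_realign_load normally gets
     dr_explicit_realign_optimized; it falls back to dr_explicit_realign for
     basic-block vectorization and for inner-loop accesses whose DR_STEP is
     not the vector size, and it skips the realignment path for SLP groups
     whose accesses need not share the same misalignment.  Everything else,
     including all stores, is classified as dr_unaligned_supported or
     dr_unaligned_unsupported according to
     targetm.vectorize.support_vector_misalignment.  */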
6231   if (DR_IS_READ (dr))
6232     {
6233       bool is_packed = false;
6234       tree type = (TREE_TYPE (DR_REF (dr)));
6235 
6236       if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
6237 	  && (!targetm.vectorize.builtin_mask_for_load
6238 	      || targetm.vectorize.builtin_mask_for_load ()))
6239 	{
6240 	  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6241 
6242 	  /* If we are doing SLP then the accesses need not have the
6243 	     same alignment, instead it depends on the SLP group size.  */
6244 	  if (loop_vinfo
6245 	      && STMT_SLP_TYPE (stmt_info)
6246 	      && (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
6247 		  * GROUP_SIZE (vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info)))
6248 		  % TYPE_VECTOR_SUBPARTS (vectype) != 0))
6249 	    ;
6250 	  else if (!loop_vinfo
6251 		   || (nested_in_vect_loop
6252 		       && (TREE_INT_CST_LOW (DR_STEP (dr))
6253 			   != GET_MODE_SIZE (TYPE_MODE (vectype)))))
6254 	    return dr_explicit_realign;
6255 	  else
6256 	    return dr_explicit_realign_optimized;
6257 	}
6258       if (!known_alignment_for_access_p (dr))
6259 	is_packed = not_size_aligned (DR_REF (dr));
6260 
6261       if (targetm.vectorize.support_vector_misalignment
6262 	    (mode, type, DR_MISALIGNMENT (dr), is_packed))
6263 	/* Can't software pipeline the loads, but can at least do them.  */
6264 	return dr_unaligned_supported;
6265     }
6266   else
6267     {
6268       bool is_packed = false;
6269       tree type = (TREE_TYPE (DR_REF (dr)));
6270 
6271       if (!known_alignment_for_access_p (dr))
6272 	is_packed = not_size_aligned (DR_REF (dr));
6273 
6274       if (targetm.vectorize.support_vector_misalignment
6275 	    (mode, type, DR_MISALIGNMENT (dr), is_packed))
6276 	return dr_unaligned_supported;
6277     }
6278 
6279   /* Unsupported.  */
6280   return dr_unaligned_unsupported;
6281 }
6282