1 /* Data References Analysis and Manipulation Utilities for Vectorization.
2    Copyright (C) 2003-2017 Free Software Foundation, Inc.
3    Contributed by Dorit Naishlos <dorit@il.ibm.com>
4    and Ira Rosen <irar@il.ibm.com>
5 
6 This file is part of GCC.
7 
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
12 
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
16 for more details.
17 
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3.  If not see
20 <http://www.gnu.org/licenses/>.  */
21 
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "predict.h"
31 #include "memmodel.h"
32 #include "tm_p.h"
33 #include "ssa.h"
34 #include "optabs-tree.h"
35 #include "cgraph.h"
36 #include "dumpfile.h"
37 #include "alias.h"
38 #include "fold-const.h"
39 #include "stor-layout.h"
40 #include "tree-eh.h"
41 #include "gimplify.h"
42 #include "gimple-iterator.h"
43 #include "gimplify-me.h"
44 #include "tree-ssa-loop-ivopts.h"
45 #include "tree-ssa-loop-manip.h"
46 #include "tree-ssa-loop.h"
47 #include "cfgloop.h"
48 #include "tree-scalar-evolution.h"
49 #include "tree-vectorizer.h"
50 #include "expr.h"
51 #include "builtins.h"
52 #include "params.h"
53 
54 /* Return true if load- or store-lanes optab OPTAB is implemented for
55    COUNT vectors of type VECTYPE.  NAME is the name of OPTAB.  */
56 
57 static bool
58 vect_lanes_optab_supported_p (const char *name, convert_optab optab,
59 			      tree vectype, unsigned HOST_WIDE_INT count)
60 {
61   machine_mode mode, array_mode;
62   bool limit_p;
63 
64   mode = TYPE_MODE (vectype);
65   limit_p = !targetm.array_mode_supported_p (mode, count);
66   array_mode = mode_for_size (count * GET_MODE_BITSIZE (mode),
67 			      MODE_INT, limit_p);
68 
69   if (array_mode == BLKmode)
70     {
71       if (dump_enabled_p ())
72 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
73                          "no array mode for %s[" HOST_WIDE_INT_PRINT_DEC "]\n",
74                          GET_MODE_NAME (mode), count);
75       return false;
76     }
77 
78   if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
79     {
80       if (dump_enabled_p ())
81 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
82                          "cannot use %s<%s><%s>\n", name,
83                          GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
84       return false;
85     }
86 
87   if (dump_enabled_p ())
88     dump_printf_loc (MSG_NOTE, vect_location,
89                      "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode),
90                      GET_MODE_NAME (mode));
91 
92   return true;
93 }
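
/* Illustrative example (not from the original sources; the concrete modes are
   target dependent): for a 128-bit vector type with TYPE_MODE V4SImode and
   COUNT == 2, the code above asks for a 256-bit integer mode to serve as the
   array mode and then checks whether the target provides the corresponding
   lanes pattern, conceptually

     array_mode = mode_for_size (2 * 128, MODE_INT, limit_p);
     ok = convert_optab_handler (vec_load_lanes_optab,
				 array_mode, V4SImode) != CODE_FOR_nothing;

   If either step fails, load-/store-lanes cannot be used for the group.  */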
94 
95 
96 /* Return the smallest scalar type used in STMT.
97    This is used to determine the vectype of the stmt.  We generally set the
98    vectype according to the type of the result (lhs).  For stmts whose
99    result-type is different than the type of the arguments (e.g., demotion,
100    promotion), vectype will be reset appropriately (later).  Note that we have
101    to visit the smallest datatype in this function, because that determines the
102    VF.  If the smallest datatype in the loop is present only as the rhs of a
103    promotion operation, we would miss it.
104    Such a case, where a variable of this datatype does not appear in the lhs
105    anywhere in the loop, can only occur if it's an invariant: e.g.:
106    'int_x = (int) short_inv', which we'd expect to have been optimized away by
107    invariant motion.  However, we cannot rely on invariant motion to always
108    take invariants out of the loop, and so in the case of promotion we also
109    have to check the rhs.
110    LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
111    types.  */
112 
113 tree
114 vect_get_smallest_scalar_type (gimple *stmt, HOST_WIDE_INT *lhs_size_unit,
115                                HOST_WIDE_INT *rhs_size_unit)
116 {
117   tree scalar_type = gimple_expr_type (stmt);
118   HOST_WIDE_INT lhs, rhs;
119 
120   lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
121 
122   if (is_gimple_assign (stmt)
123       && (gimple_assign_cast_p (stmt)
124           || gimple_assign_rhs_code (stmt) == WIDEN_MULT_EXPR
125           || gimple_assign_rhs_code (stmt) == WIDEN_LSHIFT_EXPR
126           || gimple_assign_rhs_code (stmt) == FLOAT_EXPR))
127     {
128       tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
129 
130       rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
131       if (rhs < lhs)
132         scalar_type = rhs_type;
133     }
134 
135   *lhs_size_unit = lhs;
136   *rhs_size_unit = rhs;
137   return scalar_type;
138 }
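
/* Illustrative example (not from the original sources): in a loop such as

     short *a;  int *b;
     for (i = 0; i < n; i++)
       b[i] = (int) a[i];

   the statement's result type is 'int' but the smallest scalar type involved
   is 'short', and it is the 'short' accesses that determine the vectorization
   factor.  This is why the function above also inspects the rhs of casts and
   widening operations.  */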
139 
140 
141 /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
142    tested at run-time.  Return TRUE if DDR was successfully inserted.
143    Return false if versioning is not supported.  */
144 
145 static bool
146 vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
147 {
148   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
149 
150   if ((unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS) == 0)
151     return false;
152 
153   if (dump_enabled_p ())
154     {
155       dump_printf_loc (MSG_NOTE, vect_location,
156                        "mark for run-time aliasing test between ");
157       dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_A (ddr)));
158       dump_printf (MSG_NOTE,  " and ");
159       dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_B (ddr)));
160       dump_printf (MSG_NOTE, "\n");
161     }
162 
163   if (optimize_loop_nest_for_size_p (loop))
164     {
165       if (dump_enabled_p ())
166 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
167                          "versioning not supported when optimizing"
168                          " for size.\n");
169       return false;
170     }
171 
172   /* FORNOW: We don't support versioning with outer-loop vectorization.  */
173   if (loop->inner)
174     {
175       if (dump_enabled_p ())
176 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
177                          "versioning not yet supported for outer-loops.\n");
178       return false;
179     }
180 
181   /* FORNOW: We don't support creating runtime alias tests for non-constant
182      step.  */
183   if (TREE_CODE (DR_STEP (DDR_A (ddr))) != INTEGER_CST
184       || TREE_CODE (DR_STEP (DDR_B (ddr))) != INTEGER_CST)
185     {
186       if (dump_enabled_p ())
187 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
188                          "versioning not yet supported for non-constant "
189                          "step\n");
190       return false;
191     }
192 
193   LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
194   return true;
195 }
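
/* Sketch of what marking a DDR here eventually leads to (hypothetical
   example; the actual condition is built later by the alias-versioning
   code): for data-refs p[i] and q[i] with 4-byte elements and vectorization
   factor VF, the versioned loop is guarded by a non-overlap test of the form

     if (p + VF * 4 <= q || q + VF * 4 <= p)
       ... vectorized loop ...
     else
       ... scalar fallback loop ...

   The length of the compared segments depends on DR_STEP, which is why only
   constant steps are supported above (FORNOW).  */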
196 
197 
198 /* Return true if we know that the order of vectorized STMT_A and
199    vectorized STMT_B will be the same as the order of STMT_A and STMT_B.
200    At least one of the statements is a write.  */
201 
202 static bool
203 vect_preserves_scalar_order_p (gimple *stmt_a, gimple *stmt_b)
204 {
205   stmt_vec_info stmtinfo_a = vinfo_for_stmt (stmt_a);
206   stmt_vec_info stmtinfo_b = vinfo_for_stmt (stmt_b);
207 
208   /* Single statements are always kept in their original order.  */
209   if (!STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
210       && !STMT_VINFO_GROUPED_ACCESS (stmtinfo_b))
211     return true;
212 
213   /* STMT_A and STMT_B belong to overlapping groups.  All loads in a
214      group are emitted at the position of the last scalar load and all
215      stores in a group are emitted at the position of the last scalar store.
216      Compute that position and check whether the resulting order matches
217      the current one.  */
218   gimple *last_a = GROUP_FIRST_ELEMENT (stmtinfo_a);
219   if (last_a)
220     for (gimple *s = GROUP_NEXT_ELEMENT (vinfo_for_stmt (last_a)); s;
221 	 s = GROUP_NEXT_ELEMENT (vinfo_for_stmt (s)))
222       last_a = get_later_stmt (last_a, s);
223   else
224     last_a = stmt_a;
225   gimple *last_b = GROUP_FIRST_ELEMENT (stmtinfo_b);
226   if (last_b)
227     for (gimple *s = GROUP_NEXT_ELEMENT (vinfo_for_stmt (last_b)); s;
228 	 s = GROUP_NEXT_ELEMENT (vinfo_for_stmt (s)))
229       last_b = get_later_stmt (last_b, s);
230   else
231     last_b = stmt_b;
232   return ((get_later_stmt (last_a, last_b) == last_a)
233 	  == (get_later_stmt (stmt_a, stmt_b) == stmt_a));
234 }
235 
236 
237 /* Function vect_analyze_data_ref_dependence.
238 
239    Return TRUE if there (might) exist a dependence between a memory-reference
240    DRA and a memory-reference DRB.  Return FALSE when versioning for alias
241    can instead check the dependence at run-time.  Adjust *MAX_VF according to
242    the data dependence.  */
243 
244 static bool
245 vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
246                                   loop_vec_info loop_vinfo, int *max_vf)
247 {
248   unsigned int i;
249   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
250   struct data_reference *dra = DDR_A (ddr);
251   struct data_reference *drb = DDR_B (ddr);
252   stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
253   stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
254   lambda_vector dist_v;
255   unsigned int loop_depth;
256 
257   /* In loop analysis all data references should be vectorizable.  */
258   if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
259       || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
260     gcc_unreachable ();
261 
262   /* Independent data accesses.  */
263   if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
264     return false;
265 
266   if (dra == drb
267       || (DR_IS_READ (dra) && DR_IS_READ (drb)))
268     return false;
269 
270   /* We do not have to consider dependences between accesses that belong
271      to the same group.  */
272   if (GROUP_FIRST_ELEMENT (stmtinfo_a)
273       && GROUP_FIRST_ELEMENT (stmtinfo_a) == GROUP_FIRST_ELEMENT (stmtinfo_b))
274     return false;
275 
276   /* Even if we have an anti-dependence then, as the vectorized loop covers at
277      least two scalar iterations, there is always also a true dependence.
278      As the vectorizer does not re-order loads and stores we can ignore
279      the anti-dependence if TBAA can disambiguate both DRs similar to the
280      case with known negative distance anti-dependences (positive
281      distance anti-dependences would violate TBAA constraints).  */
282   if (((DR_IS_READ (dra) && DR_IS_WRITE (drb))
283        || (DR_IS_WRITE (dra) && DR_IS_READ (drb)))
284       && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)),
285 				 get_alias_set (DR_REF (drb))))
286     return false;
287 
288   /* Unknown data dependence.  */
289   if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
290     {
291       /* If user asserted safelen consecutive iterations can be
292 	 executed concurrently, assume independence.  */
293       if (loop->safelen >= 2)
294 	{
295 	  if (loop->safelen < *max_vf)
296 	    *max_vf = loop->safelen;
297 	  LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
298 	  return false;
299 	}
300 
301       if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
302 	  || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
303 	{
304 	  if (dump_enabled_p ())
305 	    {
306 	      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
307 			       "versioning for alias not supported for: "
308 			       "can't determine dependence between ");
309 	      dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
310 				 DR_REF (dra));
311 	      dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
312 	      dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
313 				 DR_REF (drb));
314 	      dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
315 	    }
316 	  return true;
317 	}
318 
319       if (dump_enabled_p ())
320 	{
321 	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
322 			   "versioning for alias required: "
323 			   "can't determine dependence between ");
324 	  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
325 			     DR_REF (dra));
326 	  dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
327 	  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
328 			     DR_REF (drb));
329 	  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
330 	}
331 
332       /* Add to list of ddrs that need to be tested at run-time.  */
333       return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
334     }
335 
336   /* Known data dependence.  */
337   if (DDR_NUM_DIST_VECTS (ddr) == 0)
338     {
339       /* If user asserted safelen consecutive iterations can be
340 	 executed concurrently, assume independence.  */
341       if (loop->safelen >= 2)
342 	{
343 	  if (loop->safelen < *max_vf)
344 	    *max_vf = loop->safelen;
345 	  LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
346 	  return false;
347 	}
348 
349       if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
350 	  || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
351 	{
352 	  if (dump_enabled_p ())
353 	    {
354 	      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
355 			       "versioning for alias not supported for: "
356 			       "bad dist vector for ");
357 	      dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
358 				 DR_REF (dra));
359 	      dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
360 	      dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
361 				 DR_REF (drb));
362 	      dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
363 	    }
364 	  return true;
365 	}
366 
367       if (dump_enabled_p ())
368         {
369           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
370                            "versioning for alias required: "
371                            "bad dist vector for ");
372           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
373           dump_printf (MSG_MISSED_OPTIMIZATION,  " and ");
374           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
375           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
376         }
377       /* Add to list of ddrs that need to be tested at run-time.  */
378       return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
379     }
380 
381   loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
382   FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
383     {
384       int dist = dist_v[loop_depth];
385 
386       if (dump_enabled_p ())
387 	dump_printf_loc (MSG_NOTE, vect_location,
388                          "dependence distance  = %d.\n", dist);
389 
390       if (dist == 0)
391 	{
392 	  if (dump_enabled_p ())
393 	    {
394 	      dump_printf_loc (MSG_NOTE, vect_location,
395 	                       "dependence distance == 0 between ");
396 	      dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
397 	      dump_printf (MSG_NOTE, " and ");
398 	      dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
399 	      dump_printf (MSG_NOTE, "\n");
400 	    }
401 
402 	  /* When we perform grouped accesses and perform implicit CSE
403 	     by detecting equal accesses and doing disambiguation with
404 	     runtime alias tests like for
405 	        .. = a[i];
406 		.. = a[i+1];
407 		a[i] = ..;
408 		a[i+1] = ..;
409 		*p = ..;
410 		.. = a[i];
411 		.. = a[i+1];
412 	     where we will end up loading { a[i], a[i+1] } once, make
413 	     sure that inserting group loads before the first load and
414 	     stores after the last store will do the right thing.
415 	     Similar for groups like
416 	        a[i] = ...;
417 		... = a[i];
418 		a[i+1] = ...;
419 	     where loads from the group interleave with the store.  */
420 	  if (!vect_preserves_scalar_order_p (DR_STMT (dra), DR_STMT (drb)))
421 	    {
422 	      if (dump_enabled_p ())
423 		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
424 				 "READ_WRITE dependence in interleaving."
425 				 "\n");
426 	      return true;
427 	    }
428 
429 	  unsigned int step_prec = TYPE_PRECISION (TREE_TYPE (DR_STEP (dra)));
430 	  if (loop->safelen < 2
431 	      && !expr_not_equal_to (DR_STEP (dra), wi::zero (step_prec)))
432 	    {
433 	      if (dump_enabled_p ())
434 		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
435 				 "step could be zero.\n");
436 	      return true;
437 	    }
438 
439 	  continue;
440 	}
441 
442       if (dist > 0 && DDR_REVERSED_P (ddr))
443 	{
444 	  /* If DDR_REVERSED_P the order of the data-refs in DDR was
445 	     reversed (to make distance vector positive), and the actual
446 	     distance is negative.  */
447 	  if (dump_enabled_p ())
448 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
449 	                     "dependence distance negative.\n");
450 	  /* Record a negative dependence distance to later limit the
451 	     amount of stmt copying / unrolling we can perform.
452 	     Only need to handle read-after-write dependence.  */
453 	  if (DR_IS_READ (drb)
454 	      && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0
455 		  || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist))
456 	    STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist;
457 	  continue;
458 	}
459 
460       if (abs (dist) >= 2
461 	  && abs (dist) < *max_vf)
462 	{
463 	  /* The dependence distance requires reduction of the maximal
464 	     vectorization factor.  */
465 	  *max_vf = abs (dist);
466 	  if (dump_enabled_p ())
467 	    dump_printf_loc (MSG_NOTE, vect_location,
468 	                     "adjusting maximal vectorization factor to %i\n",
469 	                     *max_vf);
470 	}
471 
472       if (abs (dist) >= *max_vf)
473 	{
474 	  /* Dependence distance does not create dependence, as far as
475 	     vectorization is concerned, in this case.  */
476 	  if (dump_enabled_p ())
477 	    dump_printf_loc (MSG_NOTE, vect_location,
478 	                     "dependence distance >= VF.\n");
479 	  continue;
480 	}
481 
482       if (dump_enabled_p ())
483 	{
484 	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
485 	               "not vectorized, possible dependence "
486 	               "between data-refs ");
487 	  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
488 	  dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
489 	  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
490 	  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
491 	}
492 
493       return true;
494     }
495 
496   return false;
497 }
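
/* Worked example (illustration only): in a loop like

     for (i = 0; i < n; i++)
       a[i + 3] = a[i] + 1;

   the read a[i] and the write a[i+3] have dependence distance 3, so the code
   above lowers *max_vf to 3; vectorizing with VF <= 3 is still safe because
   the elements written by one vector iteration are only read by later vector
   iterations.  A distance >= *max_vf does not restrict vectorization at all,
   and a reversed (negative) distance is only recorded in
   STMT_VINFO_MIN_NEG_DIST.  */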
498 
499 /* Function vect_analyze_data_ref_dependences.
500 
501    Examine all the data references in the loop, and make sure there do not
502    exist any data dependences between them.  Set *MAX_VF according to
503    the maximum vectorization factor the data dependences allow.  */
504 
505 bool
506 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo, int *max_vf)
507 {
508   unsigned int i;
509   struct data_dependence_relation *ddr;
510 
511   if (dump_enabled_p ())
512     dump_printf_loc (MSG_NOTE, vect_location,
513                      "=== vect_analyze_data_ref_dependences ===\n");
514 
515   LOOP_VINFO_DDRS (loop_vinfo)
516     .create (LOOP_VINFO_DATAREFS (loop_vinfo).length ()
517 	     * LOOP_VINFO_DATAREFS (loop_vinfo).length ());
518   LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true;
519   /* We need read-read dependences to compute STMT_VINFO_SAME_ALIGN_REFS.  */
520   if (!compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
521 				&LOOP_VINFO_DDRS (loop_vinfo),
522 				LOOP_VINFO_LOOP_NEST (loop_vinfo), true))
523     return false;
524 
525   /* For epilogues we either have no aliases or alias versioning
526      was applied to the original loop.  Therefore we may just get max_vf
527      using the VF of the original loop.  */
528   if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
529     *max_vf = LOOP_VINFO_ORIG_VECT_FACTOR (loop_vinfo);
530   else
531     FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
532       if (vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf))
533 	return false;
534 
535   return true;
536 }
537 
538 
539 /* Function vect_slp_analyze_data_ref_dependence.
540 
541    Return TRUE if there (might) exist a dependence between a memory-reference
542    DRA and a memory-reference DRB in SLP (basic-block) vectorization.
543    Unlike the loop variant above, no run-time alias versioning is
544    attempted and no vectorization factor is adjusted.  */
545 
546 static bool
547 vect_slp_analyze_data_ref_dependence (struct data_dependence_relation *ddr)
548 {
549   struct data_reference *dra = DDR_A (ddr);
550   struct data_reference *drb = DDR_B (ddr);
551 
552   /* We need to check dependences of statements marked as unvectorizable
553      as well; they still can prohibit vectorization.  */
554 
555   /* Independent data accesses.  */
556   if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
557     return false;
558 
559   if (dra == drb)
560     return false;
561 
562   /* Read-read is OK.  */
563   if (DR_IS_READ (dra) && DR_IS_READ (drb))
564     return false;
565 
566   /* If dra and drb are part of the same interleaving chain consider
567      them independent.  */
568   if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (DR_STMT (dra)))
569       && (GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dra)))
570 	  == GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (drb)))))
571     return false;
572 
573   /* Unknown data dependence.  */
574   if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
575     {
576       if  (dump_enabled_p ())
577 	{
578 	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
579 			   "can't determine dependence between ");
580 	  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
581 	  dump_printf (MSG_MISSED_OPTIMIZATION,  " and ");
582 	  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
583 	  dump_printf (MSG_MISSED_OPTIMIZATION,  "\n");
584 	}
585     }
586   else if (dump_enabled_p ())
587     {
588       dump_printf_loc (MSG_NOTE, vect_location,
589 		       "determined dependence between ");
590       dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
591       dump_printf (MSG_NOTE, " and ");
592       dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
593       dump_printf (MSG_NOTE,  "\n");
594     }
595 
596   return true;
597 }
598 
599 
600 /* Analyze dependences involved in the transform of SLP NODE.  STORES
601    contain the vector of scalar stores of this instance if we are
602    disambiguating the loads.  */
603 
604 static bool
605 vect_slp_analyze_node_dependences (slp_instance instance, slp_tree node,
606 				   vec<gimple *> stores, gimple *last_store)
607 {
608   /* This walks over all stmts involved in the SLP load/store done
609      in NODE verifying we can sink them up to the last stmt in the
610      group.  */
611   gimple *last_access = vect_find_last_scalar_stmt_in_slp (node);
612   for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
613     {
614       gimple *access = SLP_TREE_SCALAR_STMTS (node)[k];
615       if (access == last_access)
616 	continue;
617       data_reference *dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (access));
618       for (gimple_stmt_iterator gsi = gsi_for_stmt (access);
619 	   gsi_stmt (gsi) != last_access; gsi_next (&gsi))
620 	{
621 	  gimple *stmt = gsi_stmt (gsi);
622 	  if (! gimple_vuse (stmt)
623 	      || (DR_IS_READ (dr_a) && ! gimple_vdef (stmt)))
624 	    continue;
625 
626 	  /* If we couldn't record a (single) data reference for this
627 	     stmt we have to give up.  */
628 	  /* ???  Here and below if dependence analysis fails we can resort
629 	     to the alias oracle which can handle more kinds of stmts.  */
630 	  data_reference *dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt));
631 	  if (!dr_b)
632 	    return false;
633 
634 	  bool dependent = false;
635 	  /* If we run into a store of this same instance (we've just
636 	     marked those) then delay dependence checking until we run
637 	     into the last store because this is where it will have
638 	     been sunk to (and we verify if we can do that as well).  */
639 	  if (gimple_visited_p (stmt))
640 	    {
641 	      if (stmt != last_store)
642 		continue;
643 	      unsigned i;
644 	      gimple *store;
645 	      FOR_EACH_VEC_ELT (stores, i, store)
646 		{
647 		  data_reference *store_dr
648 		    = STMT_VINFO_DATA_REF (vinfo_for_stmt (store));
649 		  ddr_p ddr = initialize_data_dependence_relation
650 				(dr_a, store_dr, vNULL);
651 		  dependent = vect_slp_analyze_data_ref_dependence (ddr);
652 		  free_dependence_relation (ddr);
653 		  if (dependent)
654 		    break;
655 		}
656 	    }
657 	  else
658 	    {
659 	      ddr_p ddr = initialize_data_dependence_relation (dr_a,
660 							       dr_b, vNULL);
661 	      dependent = vect_slp_analyze_data_ref_dependence (ddr);
662 	      free_dependence_relation (ddr);
663 	    }
664 	  if (dependent)
665 	    return false;
666 	}
667     }
668   return true;
669 }
670 
671 
672 /* Function vect_slp_analyze_instance_dependence.
673 
674    Examine all the data references in the SLP instance, and make sure there
675    do not exist any data dependences between them.  Return TRUE if the
676    instance can be vectorized with respect to its dependences.  */
677 
678 bool
679 vect_slp_analyze_instance_dependence (slp_instance instance)
680 {
681   if (dump_enabled_p ())
682     dump_printf_loc (MSG_NOTE, vect_location,
683                      "=== vect_slp_analyze_instance_dependence ===\n");
684 
685   /* The stores of this instance are at the root of the SLP tree.  */
686   slp_tree store = SLP_INSTANCE_TREE (instance);
687   if (! STMT_VINFO_DATA_REF (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (store)[0])))
688     store = NULL;
689 
690   /* Verify we can sink stores to the vectorized stmt insert location.  */
691   gimple *last_store = NULL;
692   if (store)
693     {
694       if (! vect_slp_analyze_node_dependences (instance, store, vNULL, NULL))
695 	return false;
696 
697       /* Mark stores in this instance and remember the last one.  */
698       last_store = vect_find_last_scalar_stmt_in_slp (store);
699       for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
700 	gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k], true);
701     }
702 
703   bool res = true;
704 
705   /* Verify we can sink loads to the vectorized stmt insert location,
706      special-casing stores of this instance.  */
707   slp_tree load;
708   unsigned int i;
709   FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load)
710     if (! vect_slp_analyze_node_dependences (instance, load,
711 					     store
712 					     ? SLP_TREE_SCALAR_STMTS (store)
713 					     : vNULL, last_store))
714       {
715 	res = false;
716 	break;
717       }
718 
719   /* Unset the visited flag.  */
720   if (store)
721     for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
722       gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k], false);
723 
724   return res;
725 }
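
/* Illustrative example (not from the original sources): for an SLP instance
   built from the store group

     a[0] = x0;  *p = t;  a[1] = x1;

   the group is vectorized at the position of a[1] = x1, so the code above
   must prove that a[0] = x0 can be sunk past the intervening *p = t, i.e.
   that 'a' and '*p' do not alias.  Loads of the instance are checked against
   the last store of the instance in the same way.  */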
726 
727 /* Function vect_compute_data_ref_alignment
728 
729    Compute the misalignment of the data reference DR.
730 
731    Output:
732    1. If during the misalignment computation it is found that the data reference
733       cannot be vectorized then false is returned.
734    2. DR_MISALIGNMENT (DR) is defined.
735 
736    FOR NOW: No analysis is actually performed. Misalignment is calculated
737    only for trivial cases. TODO.  */
738 
739 bool
740 vect_compute_data_ref_alignment (struct data_reference *dr)
741 {
742   gimple *stmt = DR_STMT (dr);
743   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
744   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
745   struct loop *loop = NULL;
746   tree ref = DR_REF (dr);
747   tree vectype;
748   tree base, base_addr;
749   tree misalign = NULL_TREE;
750   tree aligned_to;
751   tree step;
752   unsigned HOST_WIDE_INT alignment;
753 
754   if (dump_enabled_p ())
755     dump_printf_loc (MSG_NOTE, vect_location,
756                      "vect_compute_data_ref_alignment:\n");
757 
758   if (loop_vinfo)
759     loop = LOOP_VINFO_LOOP (loop_vinfo);
760 
761   /* Initialize misalignment to unknown.  */
762   SET_DR_MISALIGNMENT (dr, -1);
763 
764   if (tree_fits_shwi_p (DR_STEP (dr)))
765     misalign = DR_INIT (dr);
766   aligned_to = DR_ALIGNED_TO (dr);
767   base_addr = DR_BASE_ADDRESS (dr);
768   vectype = STMT_VINFO_VECTYPE (stmt_info);
769 
770   /* In case the dataref is in an inner-loop of the loop that is being
771      vectorized (LOOP), we use the base and misalignment information
772      relative to the outer-loop (LOOP).  This is ok only if the misalignment
773      stays the same throughout the execution of the inner-loop, which is why
774      we have to check that the stride of the dataref in the inner-loop is
775      evenly divisible by the vector size.  */
776   if (loop && nested_in_vect_loop_p (loop, stmt))
777     {
778       tree step = DR_STEP (dr);
779 
780       if (tree_fits_shwi_p (step)
781 	  && tree_to_shwi (step) % GET_MODE_SIZE (TYPE_MODE (vectype)) == 0)
782         {
783           if (dump_enabled_p ())
784             dump_printf_loc (MSG_NOTE, vect_location,
785                              "inner step divides the vector-size.\n");
786 	  misalign = STMT_VINFO_DR_INIT (stmt_info);
787 	  aligned_to = STMT_VINFO_DR_ALIGNED_TO (stmt_info);
788 	  base_addr = STMT_VINFO_DR_BASE_ADDRESS (stmt_info);
789         }
790       else
791 	{
792 	  if (dump_enabled_p ())
793 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
794 	                     "inner step doesn't divide the vector-size.\n");
795 	  misalign = NULL_TREE;
796 	}
797     }
798 
799   /* Similarly we can only use base and misalignment information relative to
800      an innermost loop if the misalignment stays the same throughout the
801      execution of the loop.  As above, this is the case if the stride of
802      the dataref is evenly divisible by the vector size.  */
803   else
804     {
805       tree step = DR_STEP (dr);
806       unsigned vf = loop ? LOOP_VINFO_VECT_FACTOR (loop_vinfo) : 1;
807 
808       if (tree_fits_shwi_p (step)
809 	  && ((tree_to_shwi (step) * vf)
810 	      % GET_MODE_SIZE (TYPE_MODE (vectype)) != 0))
811 	{
812 	  if (dump_enabled_p ())
813 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
814 	                     "step doesn't divide the vector-size.\n");
815 	  misalign = NULL_TREE;
816 	}
817     }
818 
819   /* To look at alignment of the base we have to preserve an inner MEM_REF
820      as that carries alignment information of the actual access.  */
821   base = ref;
822   while (handled_component_p (base))
823     base = TREE_OPERAND (base, 0);
824   unsigned int base_alignment = 0;
825   unsigned HOST_WIDE_INT base_bitpos;
826   get_object_alignment_1 (base, &base_alignment, &base_bitpos);
827   /* As data-ref analysis strips the MEM_REF down to its base operand
828      to form DR_BASE_ADDRESS and adds the offset to DR_INIT we have to
829      adjust things to make base_alignment valid as the alignment of
830      DR_BASE_ADDRESS.  */
831   if (TREE_CODE (base) == MEM_REF)
832     {
833       /* Note all this only works if DR_BASE_ADDRESS is the same as
834 	 MEM_REF operand zero, otherwise DR/SCEV analysis might have factored
835 	 in other offsets.  We need to rework DR to compute the alignment
836 	 of DR_BASE_ADDRESS as long as all information is still available.  */
837       if (operand_equal_p (TREE_OPERAND (base, 0), base_addr, 0))
838 	{
839 	  base_bitpos -= mem_ref_offset (base).to_short_addr () * BITS_PER_UNIT;
840 	  base_bitpos &= (base_alignment - 1);
841 	}
842       else
843 	base_bitpos = BITS_PER_UNIT;
844     }
845   if (base_bitpos != 0)
846     base_alignment = base_bitpos & -base_bitpos;
847   /* Also look at the alignment of the base address DR analysis
848      computed.  */
849   unsigned int base_addr_alignment = get_pointer_alignment (base_addr);
850   if (base_addr_alignment > base_alignment)
851     base_alignment = base_addr_alignment;
852 
853   if (base_alignment >= TYPE_ALIGN (TREE_TYPE (vectype)))
854     DR_VECT_AUX (dr)->base_element_aligned = true;
855 
856   alignment = TYPE_ALIGN_UNIT (vectype);
857 
858   if ((compare_tree_int (aligned_to, alignment) < 0)
859       || !misalign)
860     {
861       if (dump_enabled_p ())
862 	{
863 	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
864 	                   "Unknown alignment for access: ");
865 	  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
866 	  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
867 	}
868       return true;
869     }
870 
871   if (base_alignment < TYPE_ALIGN (vectype))
872     {
873       base = base_addr;
874       if (TREE_CODE (base) == ADDR_EXPR)
875 	base = TREE_OPERAND (base, 0);
876       if (!vect_can_force_dr_alignment_p (base, TYPE_ALIGN (vectype)))
877 	{
878 	  if (dump_enabled_p ())
879 	    {
880 	      dump_printf_loc (MSG_NOTE, vect_location,
881 	                       "can't force alignment of ref: ");
882 	      dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
883 	      dump_printf (MSG_NOTE, "\n");
884 	    }
885 	  return true;
886 	}
887 
888       if (DECL_USER_ALIGN (base))
889 	{
890 	  if (dump_enabled_p ())
891 	    {
892 	      dump_printf_loc (MSG_NOTE, vect_location,
893 			       "not forcing alignment of user-aligned "
894 			       "variable: ");
895 	      dump_generic_expr (MSG_NOTE, TDF_SLIM, base);
896 	      dump_printf (MSG_NOTE, "\n");
897 	    }
898 	  return true;
899 	}
900 
901       /* Force the alignment of the decl.
902 	 NOTE: This is the only change to the code we make during
903 	 the analysis phase, before deciding to vectorize the loop.  */
904       if (dump_enabled_p ())
905         {
906           dump_printf_loc (MSG_NOTE, vect_location, "force alignment of ");
907           dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
908           dump_printf (MSG_NOTE, "\n");
909         }
910 
911       DR_VECT_AUX (dr)->base_decl = base;
912       DR_VECT_AUX (dr)->base_misaligned = true;
913       DR_VECT_AUX (dr)->base_element_aligned = true;
914     }
915 
916   if (loop && nested_in_vect_loop_p (loop, stmt))
917     step = STMT_VINFO_DR_STEP (stmt_info);
918   else
919     step = DR_STEP (dr);
920   /* If this is a backward running DR then the first access in the larger
921      vectype actually is N-1 elements before the address in the DR.
922      Adjust misalign accordingly.  */
923   if (tree_int_cst_sgn (step) < 0)
924     {
925       tree offset = ssize_int (TYPE_VECTOR_SUBPARTS (vectype) - 1);
926       /* DR_STEP(dr) is the same as -TYPE_SIZE of the scalar type,
927 	 otherwise we wouldn't be here.  */
928       offset = fold_build2 (MULT_EXPR, ssizetype, offset, step);
929       /* PLUS because STEP was negative.  */
930       misalign = size_binop (PLUS_EXPR, misalign, offset);
931     }
932 
933   SET_DR_MISALIGNMENT (dr,
934 		       wi::mod_floor (misalign, alignment, SIGNED).to_uhwi ());
935 
936   if (dump_enabled_p ())
937     {
938       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
939                        "misalign = %d bytes of ref ", DR_MISALIGNMENT (dr));
940       dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
941       dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
942     }
943 
944   return true;
945 }
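
/* Worked example (illustration only, with assumed numbers): for an access
   a[i + 1] where 'a' is an int array whose base is known to be 16-byte
   aligned and the vectype is a 16-byte vector of ints, DR_INIT is 4 and the
   vector alignment unit is 16, so the code above records

     DR_MISALIGNMENT (dr) = 4 % 16 = 4

   For a backward running access (negative DR_STEP) the first element read by
   a vector load lies N-1 elements before DR_INIT, which the PLUS_EXPR
   adjustment above accounts for.  */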
946 
947 
948 /* Function vect_update_misalignment_for_peel
949 
950    DR - the data reference whose misalignment is to be adjusted.
951    DR_PEEL - the data reference whose misalignment is being made
952              zero in the vector loop by the peel.
953    NPEEL - the number of iterations in the peel loop if the misalignment
954            of DR_PEEL is known at compile time.  */
955 
956 static void
957 vect_update_misalignment_for_peel (struct data_reference *dr,
958                                    struct data_reference *dr_peel, int npeel)
959 {
960   unsigned int i;
961   vec<dr_p> same_align_drs;
962   struct data_reference *current_dr;
963   int dr_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr))));
964   int dr_peel_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr_peel))));
965   stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
966   stmt_vec_info peel_stmt_info = vinfo_for_stmt (DR_STMT (dr_peel));
967 
968   /* For interleaved data accesses the step in the loop must be multiplied by
969      the size of the interleaving group.  */
970   if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
971     dr_size *= GROUP_SIZE (vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info)));
972   if (STMT_VINFO_GROUPED_ACCESS (peel_stmt_info))
973     dr_peel_size *= GROUP_SIZE (peel_stmt_info);
974 
975   /* It can be assumed that the data refs with the same alignment as dr_peel
976      are aligned in the vector loop.  */
977   same_align_drs
978     = STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (DR_STMT (dr_peel)));
979   FOR_EACH_VEC_ELT (same_align_drs, i, current_dr)
980     {
981       if (current_dr != dr)
982         continue;
983       gcc_assert (DR_MISALIGNMENT (dr) / dr_size ==
984                   DR_MISALIGNMENT (dr_peel) / dr_peel_size);
985       SET_DR_MISALIGNMENT (dr, 0);
986       return;
987     }
988 
989   if (known_alignment_for_access_p (dr)
990       && known_alignment_for_access_p (dr_peel))
991     {
992       bool negative = tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0;
993       int misal = DR_MISALIGNMENT (dr);
994       tree vectype = STMT_VINFO_VECTYPE (stmt_info);
995       misal += negative ? -npeel * dr_size : npeel * dr_size;
996       misal &= (TYPE_ALIGN (vectype) / BITS_PER_UNIT) - 1;
997       SET_DR_MISALIGNMENT (dr, misal);
998       return;
999     }
1000 
1001   if (dump_enabled_p ())
1002     dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment to -1.\n");
1003   SET_DR_MISALIGNMENT (dr, -1);
1004 }
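
/* Worked example (illustration only): let DR_PEEL be an int access with
   misalignment 8, element size 4 and a 16-byte vectype, so NPEEL == 2.
   A different int data-ref DR with known misalignment 4 and positive step is
   updated by the code above to

     (4 + 2 * 4) & (16 - 1) == 12

   whereas a data-ref listed in DR_PEEL's SAME_ALIGN_REFS simply becomes
   aligned (misalignment 0).  */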
1005 
1006 
1007 /* Function verify_data_ref_alignment
1008 
1009    Return TRUE if DR can be handled with respect to alignment.  */
1010 
1011 static bool
1012 verify_data_ref_alignment (data_reference_p dr)
1013 {
1014   enum dr_alignment_support supportable_dr_alignment
1015     = vect_supportable_dr_alignment (dr, false);
1016   if (!supportable_dr_alignment)
1017     {
1018       if (dump_enabled_p ())
1019 	{
1020 	  if (DR_IS_READ (dr))
1021 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1022 			     "not vectorized: unsupported unaligned load.");
1023 	  else
1024 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1025 			     "not vectorized: unsupported unaligned "
1026 			     "store.");
1027 
1028 	  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
1029 			     DR_REF (dr));
1030 	  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1031 	}
1032       return false;
1033     }
1034 
1035   if (supportable_dr_alignment != dr_aligned && dump_enabled_p ())
1036     dump_printf_loc (MSG_NOTE, vect_location,
1037 		     "Vectorizing an unaligned access.\n");
1038 
1039   return true;
1040 }
1041 
1042 /* Function vect_verify_datarefs_alignment
1043 
1044    Return TRUE if all data references in the loop can be
1045    handled with respect to alignment.  */
1046 
1047 bool
1048 vect_verify_datarefs_alignment (loop_vec_info vinfo)
1049 {
1050   vec<data_reference_p> datarefs = vinfo->datarefs;
1051   struct data_reference *dr;
1052   unsigned int i;
1053 
1054   FOR_EACH_VEC_ELT (datarefs, i, dr)
1055     {
1056       gimple *stmt = DR_STMT (dr);
1057       stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1058 
1059       if (!STMT_VINFO_RELEVANT_P (stmt_info))
1060 	continue;
1061 
1062       /* For interleaving, only the alignment of the first access matters.   */
1063       if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1064 	  && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1065 	continue;
1066 
1067       /* Strided accesses perform only component accesses, alignment is
1068 	 irrelevant for them.  */
1069       if (STMT_VINFO_STRIDED_P (stmt_info)
1070 	  && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1071 	continue;
1072 
1073       if (! verify_data_ref_alignment (dr))
1074 	return false;
1075     }
1076 
1077   return true;
1078 }
1079 
1080 /* Given a memory reference EXP, return whether its alignment is less
1081    than its size.  */
1082 
1083 static bool
1084 not_size_aligned (tree exp)
1085 {
1086   if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp))))
1087     return true;
1088 
1089   return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp)))
1090 	  > get_object_alignment (exp));
1091 }
1092 
1093 /* Function vector_alignment_reachable_p
1094 
1095    Return true if vector alignment for DR is reachable by peeling
1096    a few loop iterations.  Return false otherwise.  */
1097 
1098 static bool
1099 vector_alignment_reachable_p (struct data_reference *dr)
1100 {
1101   gimple *stmt = DR_STMT (dr);
1102   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1103   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1104 
1105   if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1106     {
1107       /* For interleaved accesses we peel only if the number of iterations in
1108	 the prolog loop ({VF - misalignment}) is a multiple of the
1109 	 number of the interleaved accesses.  */
1110       int elem_size, mis_in_elements;
1111       int nelements = TYPE_VECTOR_SUBPARTS (vectype);
1112 
1113       /* FORNOW: handle only known alignment.  */
1114       if (!known_alignment_for_access_p (dr))
1115 	return false;
1116 
1117       elem_size = GET_MODE_SIZE (TYPE_MODE (vectype)) / nelements;
1118       mis_in_elements = DR_MISALIGNMENT (dr) / elem_size;
1119 
1120       if ((nelements - mis_in_elements) % GROUP_SIZE (stmt_info))
1121 	return false;
1122     }
1123 
1124   /* If misalignment is known at the compile time then allow peeling
1125      only if natural alignment is reachable through peeling.  */
1126   if (known_alignment_for_access_p (dr) && !aligned_access_p (dr))
1127     {
1128       HOST_WIDE_INT elmsize =
1129 		int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1130       if (dump_enabled_p ())
1131 	{
1132 	  dump_printf_loc (MSG_NOTE, vect_location,
1133 	                   "data size =" HOST_WIDE_INT_PRINT_DEC, elmsize);
1134 	  dump_printf (MSG_NOTE,
1135 	               ". misalignment = %d.\n", DR_MISALIGNMENT (dr));
1136 	}
1137       if (DR_MISALIGNMENT (dr) % elmsize)
1138 	{
1139 	  if (dump_enabled_p ())
1140 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1141 	                     "data size does not divide the misalignment.\n");
1142 	  return false;
1143 	}
1144     }
1145 
1146   if (!known_alignment_for_access_p (dr))
1147     {
1148       tree type = TREE_TYPE (DR_REF (dr));
1149       bool is_packed = not_size_aligned (DR_REF (dr));
1150       if (dump_enabled_p ())
1151 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1152 	                 "Unknown misalignment, %snaturally aligned\n",
1153 			 is_packed ? "not " : "");
1154       return targetm.vectorize.vector_alignment_reachable (type, is_packed);
1155     }
1156 
1157   return true;
1158 }
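
/* Examples (illustration only): with 4-byte elements and a known misalignment
   of 8 bytes, peeling two scalar iterations reaches alignment; with a known
   misalignment of 6 bytes the element size does not divide the misalignment
   and peeling is rejected above.  For unknown misalignment the decision is
   delegated to targetm.vectorize.vector_alignment_reachable.  */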
1159 
1160 
1161 /* Calculate the cost of the memory access represented by DR.  */
1162 
1163 static void
1164 vect_get_data_access_cost (struct data_reference *dr,
1165                            unsigned int *inside_cost,
1166                            unsigned int *outside_cost,
1167 			   stmt_vector_for_cost *body_cost_vec)
1168 {
1169   gimple *stmt = DR_STMT (dr);
1170   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1171   int nunits = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
1172   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1173   int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1174   int ncopies = vf / nunits;
1175 
1176   if (DR_IS_READ (dr))
1177     vect_get_load_cost (dr, ncopies, true, inside_cost, outside_cost,
1178 			NULL, body_cost_vec, false);
1179   else
1180     vect_get_store_cost (dr, ncopies, inside_cost, body_cost_vec);
1181 
1182   if (dump_enabled_p ())
1183     dump_printf_loc (MSG_NOTE, vect_location,
1184                      "vect_get_data_access_cost: inside_cost = %d, "
1185                      "outside_cost = %d.\n", *inside_cost, *outside_cost);
1186 }
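
/* Example (illustration only): with a vectorization factor of 8 and a
   vectype holding 4 elements, NCOPIES == 2, i.e. two vector loads or stores
   are costed per iteration of the vectorized loop.  */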
1187 
1188 
1189 typedef struct _vect_peel_info
1190 {
1191   struct data_reference *dr;
1192   int npeel;
1193   unsigned int count;
1194 } *vect_peel_info;
1195 
1196 typedef struct _vect_peel_extended_info
1197 {
1198   struct _vect_peel_info peel_info;
1199   unsigned int inside_cost;
1200   unsigned int outside_cost;
1201   stmt_vector_for_cost body_cost_vec;
1202 } *vect_peel_extended_info;
1203 
1204 
1205 /* Peeling hashtable helpers.  */
1206 
1207 struct peel_info_hasher : free_ptr_hash <_vect_peel_info>
1208 {
1209   static inline hashval_t hash (const _vect_peel_info *);
1210   static inline bool equal (const _vect_peel_info *, const _vect_peel_info *);
1211 };
1212 
1213 inline hashval_t
1214 peel_info_hasher::hash (const _vect_peel_info *peel_info)
1215 {
1216   return (hashval_t) peel_info->npeel;
1217 }
1218 
1219 inline bool
1220 peel_info_hasher::equal (const _vect_peel_info *a, const _vect_peel_info *b)
1221 {
1222   return (a->npeel == b->npeel);
1223 }
1224 
1225 
1226 /* Insert DR into peeling hash table with NPEEL as key.  */
1227 
1228 static void
1229 vect_peeling_hash_insert (hash_table<peel_info_hasher> *peeling_htab,
1230 			  loop_vec_info loop_vinfo, struct data_reference *dr,
1231                           int npeel)
1232 {
1233   struct _vect_peel_info elem, *slot;
1234   _vect_peel_info **new_slot;
1235   bool supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1236 
1237   elem.npeel = npeel;
1238   slot = peeling_htab->find (&elem);
1239   if (slot)
1240     slot->count++;
1241   else
1242     {
1243       slot = XNEW (struct _vect_peel_info);
1244       slot->npeel = npeel;
1245       slot->dr = dr;
1246       slot->count = 1;
1247       new_slot = peeling_htab->find_slot (slot, INSERT);
1248       *new_slot = slot;
1249     }
1250 
1251   if (!supportable_dr_alignment
1252       && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1253     slot->count += VECT_MAX_COST;
1254 }
1255 
1256 
1257 /* Traverse peeling hash table to find peeling option that aligns maximum
1258    number of data accesses.  */
1259 
1260 int
1261 vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
1262 				     _vect_peel_extended_info *max)
1263 {
1264   vect_peel_info elem = *slot;
1265 
1266   if (elem->count > max->peel_info.count
1267       || (elem->count == max->peel_info.count
1268           && max->peel_info.npeel > elem->npeel))
1269     {
1270       max->peel_info.npeel = elem->npeel;
1271       max->peel_info.count = elem->count;
1272       max->peel_info.dr = elem->dr;
1273     }
1274 
1275   return 1;
1276 }
1277 
1278 
1279 /* Traverse peeling hash table and calculate cost for each peeling option.
1280    Find the one with the lowest cost.  */
1281 
1282 int
1283 vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
1284 				   _vect_peel_extended_info *min)
1285 {
1286   vect_peel_info elem = *slot;
1287   int save_misalignment, dummy;
1288   unsigned int inside_cost = 0, outside_cost = 0, i;
1289   gimple *stmt = DR_STMT (elem->dr);
1290   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1291   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1292   vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1293   struct data_reference *dr;
1294   stmt_vector_for_cost prologue_cost_vec, body_cost_vec, epilogue_cost_vec;
1295 
1296   prologue_cost_vec.create (2);
1297   body_cost_vec.create (2);
1298   epilogue_cost_vec.create (2);
1299 
1300   FOR_EACH_VEC_ELT (datarefs, i, dr)
1301     {
1302       stmt = DR_STMT (dr);
1303       stmt_info = vinfo_for_stmt (stmt);
1304       /* For interleaving, only the alignment of the first access
1305          matters.  */
1306       if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1307           && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1308         continue;
1309 
1310       /* Strided accesses perform only component accesses, alignment is
1311          irrelevant for them.  */
1312       if (STMT_VINFO_STRIDED_P (stmt_info)
1313 	  && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1314 	continue;
1315 
1316       save_misalignment = DR_MISALIGNMENT (dr);
1317       vect_update_misalignment_for_peel (dr, elem->dr, elem->npeel);
1318       vect_get_data_access_cost (dr, &inside_cost, &outside_cost,
1319 				 &body_cost_vec);
1320       SET_DR_MISALIGNMENT (dr, save_misalignment);
1321     }
1322 
1323   outside_cost += vect_get_known_peeling_cost
1324     (loop_vinfo, elem->npeel, &dummy,
1325      &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1326      &prologue_cost_vec, &epilogue_cost_vec);
1327 
1328   /* Prologue and epilogue costs are added to the target model later.
1329      These costs depend only on the scalar iteration cost, the
1330      number of peeling iterations finally chosen, and the number of
1331      misaligned statements.  So discard the information found here.  */
1332   prologue_cost_vec.release ();
1333   epilogue_cost_vec.release ();
1334 
1335   if (inside_cost < min->inside_cost
1336       || (inside_cost == min->inside_cost && outside_cost < min->outside_cost))
1337     {
1338       min->inside_cost = inside_cost;
1339       min->outside_cost = outside_cost;
1340       min->body_cost_vec.release ();
1341       min->body_cost_vec = body_cost_vec;
1342       min->peel_info.dr = elem->dr;
1343       min->peel_info.npeel = elem->npeel;
1344     }
1345   else
1346     body_cost_vec.release ();
1347 
1348   return 1;
1349 }
1350 
1351 
1352 /* Choose best peeling option by traversing peeling hash table and either
1353    choosing an option with the lowest cost (if cost model is enabled) or the
1354    option that aligns as many accesses as possible.  */
1355 
1356 static struct data_reference *
1357 vect_peeling_hash_choose_best_peeling (hash_table<peel_info_hasher> *peeling_htab,
1358 				       loop_vec_info loop_vinfo,
1359                                        unsigned int *npeel,
1360 				       stmt_vector_for_cost *body_cost_vec)
1361 {
1362    struct _vect_peel_extended_info res;
1363 
1364    res.peel_info.dr = NULL;
1365    res.body_cost_vec = stmt_vector_for_cost ();
1366 
1367    if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1368      {
1369        res.inside_cost = INT_MAX;
1370        res.outside_cost = INT_MAX;
1371        peeling_htab->traverse <_vect_peel_extended_info *,
1372 	   		       vect_peeling_hash_get_lowest_cost> (&res);
1373      }
1374    else
1375      {
1376        res.peel_info.count = 0;
1377        peeling_htab->traverse <_vect_peel_extended_info *,
1378 	   		       vect_peeling_hash_get_most_frequent> (&res);
1379      }
1380 
1381    *npeel = res.peel_info.npeel;
1382    *body_cost_vec = res.body_cost_vec;
1383    return res.peel_info.dr;
1384 }
1385 
1386 
1387 /* Function vect_enhance_data_refs_alignment
1388 
1389    This pass will use loop versioning and loop peeling in order to enhance
1390    the alignment of data references in the loop.
1391 
1392    FOR NOW: we assume that whatever versioning/peeling takes place, only the
1393    original loop is to be vectorized.  Any other loops that are created by
1394    the transformations performed in this pass - are not supposed to be
1395    vectorized.  This restriction will be relaxed.
1396 
1397    This pass will require a cost model to guide it whether to apply peeling
1398    or versioning or a combination of the two.  For example, the scheme that
1399    intel uses when given a loop with several memory accesses, is as follows:
1400    choose one memory access ('p') which alignment you want to force by doing
1401    peeling.  Then, either (1) generate a loop in which 'p' is aligned and all
1402    other accesses are not necessarily aligned, or (2) use loop versioning to
1403    generate one loop in which all accesses are aligned, and another loop in
1404    which only 'p' is necessarily aligned.
1405 
1406    ("Automatic Intra-Register Vectorization for the Intel Architecture",
1407    Aart J.C. Bik, Milind Girkar, Paul M. Grey and Xinmin Tian, International
1408    Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
1409 
1410    Devising a cost model is the most critical aspect of this work.  It will
1411    guide us on which access to peel for, whether to use loop versioning, how
1412    many versions to create, etc.  The cost model will probably consist of
1413    generic considerations as well as target specific considerations (on
1414    powerpc for example, misaligned stores are more painful than misaligned
1415    loads).
1416 
1417    Here are the general steps involved in alignment enhancements:
1418 
1419      -- original loop, before alignment analysis:
1420 	for (i=0; i<N; i++){
1421 	  x = q[i];			# DR_MISALIGNMENT(q) = unknown
1422 	  p[i] = y;			# DR_MISALIGNMENT(p) = unknown
1423 	}
1424 
1425      -- After vect_compute_data_refs_alignment:
1426 	for (i=0; i<N; i++){
1427 	  x = q[i];			# DR_MISALIGNMENT(q) = 3
1428 	  p[i] = y;			# DR_MISALIGNMENT(p) = unknown
1429 	}
1430 
1431      -- Possibility 1: we do loop versioning:
1432      if (p is aligned) {
1433 	for (i=0; i<N; i++){	# loop 1A
1434 	  x = q[i];			# DR_MISALIGNMENT(q) = 3
1435 	  p[i] = y;			# DR_MISALIGNMENT(p) = 0
1436 	}
1437      }
1438      else {
1439 	for (i=0; i<N; i++){	# loop 1B
1440 	  x = q[i];			# DR_MISALIGNMENT(q) = 3
1441 	  p[i] = y;			# DR_MISALIGNMENT(p) = unaligned
1442 	}
1443      }
1444 
1445      -- Possibility 2: we do loop peeling:
1446      for (i = 0; i < 3; i++){	# (scalar loop, not to be vectorized).
1447 	x = q[i];
1448 	p[i] = y;
1449      }
1450      for (i = 3; i < N; i++){	# loop 2A
1451 	x = q[i];			# DR_MISALIGNMENT(q) = 0
1452 	p[i] = y;			# DR_MISALIGNMENT(p) = unknown
1453      }
1454 
1455      -- Possibility 3: combination of loop peeling and versioning:
1456      for (i = 0; i < 3; i++){	# (scalar loop, not to be vectorized).
1457 	x = q[i];
1458 	p[i] = y;
1459      }
1460      if (p is aligned) {
1461 	for (i = 3; i<N; i++){	# loop 3A
1462 	  x = q[i];			# DR_MISALIGNMENT(q) = 0
1463 	  p[i] = y;			# DR_MISALIGNMENT(p) = 0
1464 	}
1465      }
1466      else {
1467 	for (i = 3; i<N; i++){	# loop 3B
1468 	  x = q[i];			# DR_MISALIGNMENT(q) = 0
1469 	  p[i] = y;			# DR_MISALIGNMENT(p) = unaligned
1470 	}
1471      }
1472 
1473      These loops are later passed to loop_transform to be vectorized.  The
1474      vectorizer will use the alignment information to guide the transformation
1475      (whether to generate regular loads/stores, or with special handling for
1476      misalignment).  */
1477 
1478 bool
1479 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
1480 {
1481   vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1482   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1483   enum dr_alignment_support supportable_dr_alignment;
1484   struct data_reference *dr0 = NULL, *first_store = NULL;
1485   struct data_reference *dr;
1486   unsigned int i, j;
1487   bool do_peeling = false;
1488   bool do_versioning = false;
1489   bool stat;
1490   gimple *stmt;
1491   stmt_vec_info stmt_info;
1492   unsigned int npeel = 0;
1493   bool all_misalignments_unknown = true;
1494   unsigned int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1495   unsigned possible_npeel_number = 1;
1496   tree vectype;
1497   unsigned int nelements, mis, same_align_drs_max = 0;
1498   stmt_vector_for_cost body_cost_vec = stmt_vector_for_cost ();
1499   hash_table<peel_info_hasher> peeling_htab (1);
1500 
1501   if (dump_enabled_p ())
1502     dump_printf_loc (MSG_NOTE, vect_location,
1503                      "=== vect_enhance_data_refs_alignment ===\n");
1504 
1505   /* Reset data so we can safely be called multiple times.  */
1506   LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
1507   LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = 0;
1508 
1509   /* While cost model enhancements are expected in the future, the high level
1510      view of the code at this time is as follows:
1511 
1512      A) If there is a misaligned access then see if peeling to align
1513         this access can make all data references satisfy
1514         vect_supportable_dr_alignment.  If so, update data structures
1515         as needed and return true.
1516 
1517      B) If peeling wasn't possible and there is a data reference with an
1518         unknown misalignment that does not satisfy vect_supportable_dr_alignment
1519         then see if loop versioning checks can be used to make all data
1520         references satisfy vect_supportable_dr_alignment.  If so, update
1521         data structures as needed and return true.
1522 
1523      C) If neither peeling nor versioning were successful then return false if
1524         any data reference does not satisfy vect_supportable_dr_alignment.
1525 
1526      D) Return true (all data references satisfy vect_supportable_dr_alignment).
1527 
1528      Note, Possibility 3 above (which is peeling and versioning together) is not
1529      being done at this time.  */
1530 
1531   /* (1) Peeling to force alignment.  */
1532 
1533   /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
1534      Considerations:
1535      + How many accesses will become aligned due to the peeling
1536      - How many accesses will become unaligned due to the peeling,
1537        and the cost of misaligned accesses.
1538      - The cost of peeling (the extra runtime checks, the increase
1539        in code size).  */
1540 
1541   FOR_EACH_VEC_ELT (datarefs, i, dr)
1542     {
1543       stmt = DR_STMT (dr);
1544       stmt_info = vinfo_for_stmt (stmt);
1545 
1546       if (!STMT_VINFO_RELEVANT_P (stmt_info))
1547 	continue;
1548 
1549       /* For interleaving, only the alignment of the first access
1550          matters.  */
1551       if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1552           && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1553         continue;
1554 
1555       /* For invariant accesses there is nothing to enhance.  */
1556       if (integer_zerop (DR_STEP (dr)))
1557 	continue;
1558 
1559       /* Strided accesses perform only component accesses, alignment is
1560 	 irrelevant for them.  */
1561       if (STMT_VINFO_STRIDED_P (stmt_info)
1562 	  && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1563 	continue;
1564 
1565       supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1566       do_peeling = vector_alignment_reachable_p (dr);
1567       if (do_peeling)
1568         {
1569           if (known_alignment_for_access_p (dr))
1570             {
1571               unsigned int npeel_tmp;
1572 	      bool negative = tree_int_cst_compare (DR_STEP (dr),
1573 						    size_zero_node) < 0;
1574 
1575               /* Save info about DR in the hash table.  */
1576               vectype = STMT_VINFO_VECTYPE (stmt_info);
1577               nelements = TYPE_VECTOR_SUBPARTS (vectype);
1578               mis = DR_MISALIGNMENT (dr) / GET_MODE_SIZE (TYPE_MODE (
1579                                                 TREE_TYPE (DR_REF (dr))));
1580               npeel_tmp = (negative
1581 			   ? (mis - nelements) : (nelements - mis))
1582 		  & (nelements - 1);
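              /* A worked example (all values assumed, for illustration only):
                 with V4SI vectors NELEMENTS is 4; if the scalar element is
                 4 bytes wide and DR_MISALIGNMENT (dr) is 8 bytes, then
                 MIS is 8 / 4 = 2 elements and, for a positive step,
                 npeel_tmp = (4 - 2) & 3 = 2, i.e. peeling two scalar
                 iterations would bring DR to an aligned address.  */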
1583 
              /* For multiple types, it is possible that the bigger type access
                 will have more than one peeling option.  E.g., a loop with two
                 types: one of size (vector size / 4), and the other one of
                 size (vector size / 8).  The vectorization factor will be 8.
                 If both accesses are misaligned by 3, the first one needs one
                 scalar iteration to be aligned, and the second one needs 5.
                 But the first one will also be aligned by peeling 5 scalar
                 iterations, and in that case both accesses will be aligned.
                 Hence, in addition to the immediate peeling amount, we also
                 want to try adding a full vector size, as long as we don't
                 exceed the vectorization factor.
                 We do this automatically for the unlimited cost model, since
                 we calculate the cost for every peeling option.  */
1597               if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1598 		{
1599 		  if (STMT_SLP_TYPE (stmt_info))
1600 		    possible_npeel_number
1601 		      = (vf * GROUP_SIZE (stmt_info)) / nelements;
1602 		  else
1603 		    possible_npeel_number = vf / nelements;
1604 		}
1605 
1606               /* Handle the aligned case. We may decide to align some other
1607                  access, making DR unaligned.  */
1608               if (DR_MISALIGNMENT (dr) == 0)
1609                 {
1610                   npeel_tmp = 0;
1611                   if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1612                     possible_npeel_number++;
1613                 }
1614 
1615               for (j = 0; j < possible_npeel_number; j++)
1616                 {
1617                   vect_peeling_hash_insert (&peeling_htab, loop_vinfo,
1618 					    dr, npeel_tmp);
1619                   npeel_tmp += nelements;
1620                 }
1621 
1622               all_misalignments_unknown = false;
              /* The data-ref that was chosen for the case in which all the
                 misalignments are unknown is not relevant anymore, since we
                 now have a data-ref with known alignment.  */
1626               dr0 = NULL;
1627             }
1628           else
1629             {
              /* If we don't know any misalignment values, we prefer
                 peeling for the data-ref that has the maximum number of
                 data-refs with the same alignment, unless the target
                 prefers to align stores over loads.  */
1634               if (all_misalignments_unknown)
1635                 {
1636 		  unsigned same_align_drs
1637 		    = STMT_VINFO_SAME_ALIGN_REFS (stmt_info).length ();
1638                   if (!dr0
1639 		      || same_align_drs_max < same_align_drs)
1640                     {
1641                       same_align_drs_max = same_align_drs;
1642                       dr0 = dr;
1643                     }
1644 		  /* For data-refs with the same number of related
1645 		     accesses prefer the one where the misalign
1646 		     computation will be invariant in the outermost loop.  */
1647 		  else if (same_align_drs_max == same_align_drs)
1648 		    {
1649 		      struct loop *ivloop0, *ivloop;
1650 		      ivloop0 = outermost_invariant_loop_for_expr
1651 			  (loop, DR_BASE_ADDRESS (dr0));
1652 		      ivloop = outermost_invariant_loop_for_expr
1653 			  (loop, DR_BASE_ADDRESS (dr));
1654 		      if ((ivloop && !ivloop0)
1655 			  || (ivloop && ivloop0
1656 			      && flow_loop_nested_p (ivloop, ivloop0)))
1657 			dr0 = dr;
1658 		    }
1659 
1660                   if (!first_store && DR_IS_WRITE (dr))
1661                     first_store = dr;
1662                 }
1663 
              /* If there are both known and unknown misaligned accesses in the
                 loop, we choose the peeling amount according to the known
                 accesses.  */
1667               if (!supportable_dr_alignment)
1668                 {
1669                   dr0 = dr;
1670                   if (!first_store && DR_IS_WRITE (dr))
1671                     first_store = dr;
1672                 }
1673             }
1674         }
1675       else
1676         {
1677           if (!aligned_access_p (dr))
1678             {
1679               if (dump_enabled_p ())
1680                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1681                                  "vector alignment may not be reachable\n");
1682               break;
1683             }
1684         }
1685     }
1686 
1687   /* Check if we can possibly peel the loop.  */
1688   if (!vect_can_advance_ivs_p (loop_vinfo)
1689       || !slpeel_can_duplicate_loop_p (loop, single_exit (loop))
1690       || loop->inner)
1691     do_peeling = false;
1692 
1693   if (do_peeling
1694       && all_misalignments_unknown
1695       && vect_supportable_dr_alignment (dr0, false))
1696     {
      /* Check if the target requires us to prefer stores over loads, i.e., if
         misaligned stores are more expensive than misaligned loads (taking
         drs with the same alignment into account).  */
1700       if (first_store && DR_IS_READ (dr0))
1701         {
1702           unsigned int load_inside_cost = 0, load_outside_cost = 0;
1703           unsigned int store_inside_cost = 0, store_outside_cost = 0;
1704           unsigned int load_inside_penalty = 0, load_outside_penalty = 0;
1705           unsigned int store_inside_penalty = 0, store_outside_penalty = 0;
1706 	  stmt_vector_for_cost dummy;
1707 	  dummy.create (2);
1708 
1709           vect_get_data_access_cost (dr0, &load_inside_cost, &load_outside_cost,
1710 				     &dummy);
1711           vect_get_data_access_cost (first_store, &store_inside_cost,
1712 				     &store_outside_cost, &dummy);
1713 
1714 	  dummy.release ();
1715 
1716           /* Calculate the penalty for leaving FIRST_STORE unaligned (by
1717              aligning the load DR0).  */
1718           load_inside_penalty = store_inside_cost;
1719           load_outside_penalty = store_outside_cost;
1720           for (i = 0;
1721 	       STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (
1722 			  DR_STMT (first_store))).iterate (i, &dr);
1723                i++)
1724             if (DR_IS_READ (dr))
1725               {
1726                 load_inside_penalty += load_inside_cost;
1727                 load_outside_penalty += load_outside_cost;
1728               }
1729             else
1730               {
1731                 load_inside_penalty += store_inside_cost;
1732                 load_outside_penalty += store_outside_cost;
1733               }
1734 
1735           /* Calculate the penalty for leaving DR0 unaligned (by
1736              aligning the FIRST_STORE).  */
1737           store_inside_penalty = load_inside_cost;
1738           store_outside_penalty = load_outside_cost;
1739           for (i = 0;
1740 	       STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (
1741 		      DR_STMT (dr0))).iterate (i, &dr);
1742                i++)
1743             if (DR_IS_READ (dr))
1744               {
1745                 store_inside_penalty += load_inside_cost;
1746                 store_outside_penalty += load_outside_cost;
1747               }
1748             else
1749               {
1750                 store_inside_penalty += store_inside_cost;
1751                 store_outside_penalty += store_outside_cost;
1752               }
1753 
1754           if (load_inside_penalty > store_inside_penalty
1755               || (load_inside_penalty == store_inside_penalty
1756                   && load_outside_penalty > store_outside_penalty))
1757             dr0 = first_store;
1758         }
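      /* Illustrative example (costs are assumed, not taken from any real
         target): if leaving FIRST_STORE unaligned while aligning the load
         DR0 would cost 6 inside the loop (load_inside_penalty) but leaving
         DR0 unaligned while aligning FIRST_STORE would only cost 4
         (store_inside_penalty), then 6 > 4 and the peeling candidate dr0
         is switched to FIRST_STORE.  */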
1759 
      /* In case there are only loads with different unknown misalignments, use
         peeling only if it may help to align other accesses in the loop or
	 if it may help to improve load bandwidth when we'd end up using
	 unaligned loads.  */
1764       tree dr0_vt = STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr0)));
1765       if (!first_store
1766 	  && !STMT_VINFO_SAME_ALIGN_REFS (
1767 		  vinfo_for_stmt (DR_STMT (dr0))).length ()
1768 	  && (vect_supportable_dr_alignment (dr0, false)
1769 	      != dr_unaligned_supported
1770 	      || (builtin_vectorization_cost (vector_load, dr0_vt, 0)
1771 		  == builtin_vectorization_cost (unaligned_load, dr0_vt, -1))))
1772         do_peeling = false;
1773     }
1774 
1775   if (do_peeling && !dr0)
1776     {
      /* Peeling is possible, but no data access requires alignment in order
         to be supported.  So we try to choose the best possible peeling.  */
1779 
1780       /* We should get here only if there are drs with known misalignment.  */
1781       gcc_assert (!all_misalignments_unknown);
1782 
1783       /* Choose the best peeling from the hash table.  */
1784       dr0 = vect_peeling_hash_choose_best_peeling (&peeling_htab,
1785 						   loop_vinfo, &npeel,
1786 						   &body_cost_vec);
1787       if (!dr0 || !npeel)
1788         do_peeling = false;
1789     }
1790 
1791   if (do_peeling)
1792     {
1793       stmt = DR_STMT (dr0);
1794       stmt_info = vinfo_for_stmt (stmt);
1795       vectype = STMT_VINFO_VECTYPE (stmt_info);
1796       nelements = TYPE_VECTOR_SUBPARTS (vectype);
1797 
1798       if (known_alignment_for_access_p (dr0))
1799         {
1800 	  bool negative = tree_int_cst_compare (DR_STEP (dr0),
1801 						size_zero_node) < 0;
1802           if (!npeel)
1803             {
1804               /* Since it's known at compile time, compute the number of
1805                  iterations in the peeled loop (the peeling factor) for use in
1806                  updating DR_MISALIGNMENT values.  The peeling factor is the
1807                  vectorization factor minus the misalignment as an element
1808                  count.  */
1809               mis = DR_MISALIGNMENT (dr0);
1810               mis /= GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr0))));
1811               npeel = ((negative ? mis - nelements : nelements - mis)
1812 		       & (nelements - 1));
1813             }
1814 
1815 	  /* For interleaved data access every iteration accesses all the
1816 	     members of the group, therefore we divide the number of iterations
1817 	     by the group size.  */
1818 	  stmt_info = vinfo_for_stmt (DR_STMT (dr0));
1819 	  if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1820 	    npeel /= GROUP_SIZE (stmt_info);
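	  /* For illustration (assumed values): with NELEMENTS = 4, a 4-byte
	     element type and DR_MISALIGNMENT (dr0) = 8 bytes, MIS is 2 and
	     npeel = (4 - 2) & 3 = 2 for a positive step; if DR0 belongs to
	     an interleaving group of size 2, each scalar iteration touches
	     both group members, so npeel becomes 2 / 2 = 1.  */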
1821 
1822           if (dump_enabled_p ())
1823             dump_printf_loc (MSG_NOTE, vect_location,
1824                              "Try peeling by %d\n", npeel);
1825         }
1826 
1827       /* Ensure that all data refs can be vectorized after the peel.  */
1828       FOR_EACH_VEC_ELT (datarefs, i, dr)
1829         {
1830           int save_misalignment;
1831 
1832 	  if (dr == dr0)
1833 	    continue;
1834 
1835 	  stmt = DR_STMT (dr);
1836 	  stmt_info = vinfo_for_stmt (stmt);
1837 	  /* For interleaving, only the alignment of the first access
	     matters.  */
1839 	  if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1840 	      && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1841 	    continue;
1842 
1843 	  /* Strided accesses perform only component accesses, alignment is
1844 	     irrelevant for them.  */
1845 	  if (STMT_VINFO_STRIDED_P (stmt_info)
1846 	      && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1847 	    continue;
1848 
1849 	  save_misalignment = DR_MISALIGNMENT (dr);
1850 	  vect_update_misalignment_for_peel (dr, dr0, npeel);
1851 	  supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
1852 	  SET_DR_MISALIGNMENT (dr, save_misalignment);
1853 
1854 	  if (!supportable_dr_alignment)
1855 	    {
1856 	      do_peeling = false;
1857 	      break;
1858 	    }
1859 	}
1860 
1861       if (do_peeling && known_alignment_for_access_p (dr0) && npeel == 0)
1862         {
1863           stat = vect_verify_datarefs_alignment (loop_vinfo);
1864           if (!stat)
1865             do_peeling = false;
1866           else
1867 	    {
1868 	      body_cost_vec.release ();
1869 	      return stat;
1870 	    }
1871         }
1872 
1873       /* Cost model #1 - honor --param vect-max-peeling-for-alignment.  */
1874       if (do_peeling)
1875         {
1876           unsigned max_allowed_peel
1877             = PARAM_VALUE (PARAM_VECT_MAX_PEELING_FOR_ALIGNMENT);
1878           if (max_allowed_peel != (unsigned)-1)
1879             {
1880               unsigned max_peel = npeel;
1881               if (max_peel == 0)
1882                 {
1883 		  gimple *dr_stmt = DR_STMT (dr0);
1884                   stmt_vec_info vinfo = vinfo_for_stmt (dr_stmt);
1885                   tree vtype = STMT_VINFO_VECTYPE (vinfo);
1886                   max_peel = TYPE_VECTOR_SUBPARTS (vtype) - 1;
1887                 }
1888               if (max_peel > max_allowed_peel)
1889                 {
1890                   do_peeling = false;
1891                   if (dump_enabled_p ())
1892                     dump_printf_loc (MSG_NOTE, vect_location,
1893                         "Disable peeling, max peels reached: %d\n", max_peel);
1894                 }
1895             }
1896         }
1897 
1898       /* Cost model #2 - if peeling may result in a remaining loop not
1899 	 iterating enough to be vectorized then do not peel.  */
1900       if (do_peeling
1901 	  && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1902 	{
1903 	  unsigned max_peel
1904 	    = npeel == 0 ? LOOP_VINFO_VECT_FACTOR (loop_vinfo) - 1 : npeel;
1905 	  if (LOOP_VINFO_INT_NITERS (loop_vinfo)
1906 	      < LOOP_VINFO_VECT_FACTOR (loop_vinfo) + max_peel)
1907 	    do_peeling = false;
1908 	}
1909 
1910       if (do_peeling)
1911         {
1912           /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
1913              If the misalignment of DR_i is identical to that of dr0 then set
1914              DR_MISALIGNMENT (DR_i) to zero.  If the misalignment of DR_i and
1915              dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
1916              by the peeling factor times the element size of DR_i (MOD the
1917              vectorization factor times the size).  Otherwise, the
1918              misalignment of DR_i must be set to unknown.  */
1919 	  FOR_EACH_VEC_ELT (datarefs, i, dr)
1920 	    if (dr != dr0)
1921 	      {
1922 		/* Strided accesses perform only component accesses, alignment
1923 		   is irrelevant for them.  */
1924 		stmt_info = vinfo_for_stmt (DR_STMT (dr));
1925 		if (STMT_VINFO_STRIDED_P (stmt_info)
1926 		    && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1927 		  continue;
1928 
1929 		vect_update_misalignment_for_peel (dr, dr0, npeel);
1930 	      }
1931 
1932           LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0;
1933           if (npeel)
1934             LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
1935           else
1936             LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1937 	      = DR_MISALIGNMENT (dr0);
1938 	  SET_DR_MISALIGNMENT (dr0, 0);
1939 	  if (dump_enabled_p ())
1940             {
1941               dump_printf_loc (MSG_NOTE, vect_location,
1942                                "Alignment of access forced using peeling.\n");
1943               dump_printf_loc (MSG_NOTE, vect_location,
1944                                "Peeling for alignment will be applied.\n");
1945             }
1946 	  /* The inside-loop cost will be accounted for in vectorizable_load
1947 	     and vectorizable_store correctly with adjusted alignments.
	     Drop the body_cost_vec on the floor here.  */
1949 	  body_cost_vec.release ();
1950 
1951 	  stat = vect_verify_datarefs_alignment (loop_vinfo);
1952 	  gcc_assert (stat);
1953           return stat;
1954         }
1955     }
1956 
1957   body_cost_vec.release ();
1958 
1959   /* (2) Versioning to force alignment.  */
1960 
1961   /* Try versioning if:
     1) the loop is optimized for speed, and
1963      2) there is at least one unsupported misaligned data ref with an unknown
1964         misalignment, and
1965      3) all misaligned data refs with a known misalignment are supported, and
1966      4) the number of runtime alignment checks is within reason.  */
1967 
1968   do_versioning =
1969 	optimize_loop_nest_for_speed_p (loop)
1970 	&& (!loop->inner); /* FORNOW */
1971 
1972   if (do_versioning)
1973     {
1974       FOR_EACH_VEC_ELT (datarefs, i, dr)
1975         {
1976 	  stmt = DR_STMT (dr);
1977 	  stmt_info = vinfo_for_stmt (stmt);
1978 
1979 	  /* For interleaving, only the alignment of the first access
1980 	     matters.  */
1981 	  if (aligned_access_p (dr)
1982 	      || (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1983 		  && GROUP_FIRST_ELEMENT (stmt_info) != stmt))
1984 	    continue;
1985 
1986 	  if (STMT_VINFO_STRIDED_P (stmt_info))
1987 	    {
1988 	      /* Strided loads perform only component accesses, alignment is
1989 		 irrelevant for them.  */
1990 	      if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
1991 		continue;
1992 	      do_versioning = false;
1993 	      break;
1994 	    }
1995 
1996 	  supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
1997 
1998           if (!supportable_dr_alignment)
1999             {
2000 	      gimple *stmt;
2001               int mask;
2002               tree vectype;
2003 
2004               if (known_alignment_for_access_p (dr)
2005                   || LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
2006                      >= (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS))
2007                 {
2008                   do_versioning = false;
2009                   break;
2010                 }
2011 
2012               stmt = DR_STMT (dr);
2013               vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
2014               gcc_assert (vectype);
2015 
2016               /* The rightmost bits of an aligned address must be zeros.
2017                  Construct the mask needed for this test.  For example,
2018                  GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
2019                  mask must be 15 = 0xf. */
2020               mask = GET_MODE_SIZE (TYPE_MODE (vectype)) - 1;
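              /* Conceptually (for illustration only), the runtime guard of
                 the versioned loop then requires (address & mask) == 0 for
                 each such reference; with mask = 15, an address whose low
                 four bits are zero passes the check.  */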
2021 
2022               /* FORNOW: use the same mask to test all potentially unaligned
2023                  references in the loop.  The vectorizer currently supports
2024                  a single vector size, see the reference to
2025                  GET_MODE_NUNITS (TYPE_MODE (vectype)) where the
2026                  vectorization factor is computed.  */
2027               gcc_assert (!LOOP_VINFO_PTR_MASK (loop_vinfo)
2028                           || LOOP_VINFO_PTR_MASK (loop_vinfo) == mask);
2029               LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
2030               LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (
2031 		      DR_STMT (dr));
2032             }
2033         }
2034 
2035       /* Versioning requires at least one misaligned data reference.  */
2036       if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2037         do_versioning = false;
2038       else if (!do_versioning)
2039         LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
2040     }
2041 
2042   if (do_versioning)
2043     {
2044       vec<gimple *> may_misalign_stmts
2045         = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
2046       gimple *stmt;
2047 
2048       /* It can now be assumed that the data references in the statements
2049          in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
2050          of the loop being vectorized.  */
2051       FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt)
2052         {
2053           stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2054           dr = STMT_VINFO_DATA_REF (stmt_info);
2055 	  SET_DR_MISALIGNMENT (dr, 0);
2056 	  if (dump_enabled_p ())
2057             dump_printf_loc (MSG_NOTE, vect_location,
2058                              "Alignment of access forced using versioning.\n");
2059         }
2060 
2061       if (dump_enabled_p ())
2062         dump_printf_loc (MSG_NOTE, vect_location,
2063                          "Versioning for alignment will be applied.\n");
2064 
2065       /* Peeling and versioning can't be done together at this time.  */
2066       gcc_assert (! (do_peeling && do_versioning));
2067 
2068       stat = vect_verify_datarefs_alignment (loop_vinfo);
2069       gcc_assert (stat);
2070       return stat;
2071     }
2072 
2073   /* This point is reached if neither peeling nor versioning is being done.  */
2074   gcc_assert (! (do_peeling || do_versioning));
2075 
2076   stat = vect_verify_datarefs_alignment (loop_vinfo);
2077   return stat;
2078 }
2079 
2080 
2081 /* Function vect_find_same_alignment_drs.
2082 
2083    Update group and alignment relations according to the chosen
2084    vectorization factor.  */
2085 
2086 static void
2087 vect_find_same_alignment_drs (struct data_dependence_relation *ddr,
2088 			      loop_vec_info loop_vinfo)
2089 {
2090   unsigned int i;
2091   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2092   int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2093   struct data_reference *dra = DDR_A (ddr);
2094   struct data_reference *drb = DDR_B (ddr);
2095   stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
2096   stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
2097   int dra_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dra))));
2098   int drb_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (drb))));
2099   lambda_vector dist_v;
2100   unsigned int loop_depth;
2101 
2102   if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
2103     return;
2104 
2105   if (dra == drb)
2106     return;
2107 
2108   if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
2109     return;
2110 
2111   /* Loop-based vectorization and known data dependence.  */
2112   if (DDR_NUM_DIST_VECTS (ddr) == 0)
2113     return;
2114 
2115   /* Data-dependence analysis reports a distance vector of zero
2116      for data-references that overlap only in the first iteration
     but have steps of different sign (see PR45764).
2118      So as a sanity check require equal DR_STEP.  */
2119   if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2120     return;
2121 
2122   loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
2123   FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
2124     {
2125       int dist = dist_v[loop_depth];
2126 
2127       if (dump_enabled_p ())
2128 	dump_printf_loc (MSG_NOTE, vect_location,
2129 	                 "dependence distance  = %d.\n", dist);
2130 
2131       /* Same loop iteration.  */
2132       if (dist == 0
2133 	  || (dist % vectorization_factor == 0 && dra_size == drb_size))
2134 	{
	  /* Two references with distance zero, or whose distance is a
	     multiple of the vectorization factor (with equal element
	     sizes), have the same alignment.  */
2136 	  STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_a).safe_push (drb);
2137 	  STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_b).safe_push (dra);
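	  /* Illustrative example (assumed values): with a vectorization
	     factor of 4 and two int accesses A[i] and A[i+4], the dependence
	     distance is 4; 4 % 4 == 0 and the element sizes match, so each
	     DR is recorded in the other's SAME_ALIGN_REFS list.  */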
2138 	  if (dump_enabled_p ())
2139 	    {
2140 	      dump_printf_loc (MSG_NOTE, vect_location,
2141 	                       "accesses have the same alignment.\n");
2142 	      dump_printf (MSG_NOTE,
2143 	                   "dependence distance modulo vf == 0 between ");
2144 	      dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
2145 	      dump_printf (MSG_NOTE,  " and ");
2146 	      dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
2147 	      dump_printf (MSG_NOTE, "\n");
2148 	    }
2149 	}
2150     }
2151 }
2152 
2153 
2154 /* Function vect_analyze_data_refs_alignment
2155 
2156    Analyze the alignment of the data-references in the loop.
2157    Return FALSE if a data reference is found that cannot be vectorized.  */
2158 
2159 bool
2160 vect_analyze_data_refs_alignment (loop_vec_info vinfo)
2161 {
2162   if (dump_enabled_p ())
2163     dump_printf_loc (MSG_NOTE, vect_location,
2164                      "=== vect_analyze_data_refs_alignment ===\n");
2165 
  /* Mark groups of data references with the same alignment using
     data dependence information.  */
2168   vec<ddr_p> ddrs = vinfo->ddrs;
2169   struct data_dependence_relation *ddr;
2170   unsigned int i;
2171 
2172   FOR_EACH_VEC_ELT (ddrs, i, ddr)
2173     vect_find_same_alignment_drs (ddr, vinfo);
2174 
2175   vec<data_reference_p> datarefs = vinfo->datarefs;
2176   struct data_reference *dr;
2177 
2178   FOR_EACH_VEC_ELT (datarefs, i, dr)
2179     {
2180       stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
2181       if (STMT_VINFO_VECTORIZABLE (stmt_info)
2182 	  && !vect_compute_data_ref_alignment (dr))
2183 	{
2184 	  /* Strided accesses perform only component accesses, misalignment
2185 	     information is irrelevant for them.  */
2186 	  if (STMT_VINFO_STRIDED_P (stmt_info)
2187 	      && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
2188 	    continue;
2189 
2190 	  if (dump_enabled_p ())
2191 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2192 			     "not vectorized: can't calculate alignment "
2193 			     "for data ref.\n");
2194 
2195 	  return false;
2196 	}
2197     }
2198 
2199   return true;
2200 }
2201 
2202 
2203 /* Analyze alignment of DRs of stmts in NODE.  */
2204 
2205 static bool
2206 vect_slp_analyze_and_verify_node_alignment (slp_tree node)
2207 {
  /* We vectorize from the first scalar stmt in the node unless
     the node is permuted, in which case we start from the first
     element in the group.  */
2211   gimple *first_stmt = SLP_TREE_SCALAR_STMTS (node)[0];
2212   data_reference_p first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
2213   if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
2214     first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (first_stmt));
2215 
2216   data_reference_p dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
2217   if (! vect_compute_data_ref_alignment (dr)
2218       /* For creating the data-ref pointer we need alignment of the
2219 	 first element anyway.  */
2220       || (dr != first_dr
2221 	  && ! vect_compute_data_ref_alignment (first_dr))
2222       || ! verify_data_ref_alignment (dr))
2223     {
2224       if (dump_enabled_p ())
2225 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2226 			 "not vectorized: bad data alignment in basic "
2227 			 "block.\n");
2228       return false;
2229     }
2230 
2231   return true;
2232 }
2233 
/* Function vect_slp_analyze_and_verify_instance_alignment
2235 
2236    Analyze the alignment of the data-references in the SLP instance.
2237    Return FALSE if a data reference is found that cannot be vectorized.  */
2238 
2239 bool
2240 vect_slp_analyze_and_verify_instance_alignment (slp_instance instance)
2241 {
2242   if (dump_enabled_p ())
2243     dump_printf_loc (MSG_NOTE, vect_location,
2244                      "=== vect_slp_analyze_and_verify_instance_alignment ===\n");
2245 
2246   slp_tree node;
2247   unsigned i;
2248   FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, node)
2249     if (! vect_slp_analyze_and_verify_node_alignment (node))
2250       return false;
2251 
2252   node = SLP_INSTANCE_TREE (instance);
2253   if (STMT_VINFO_DATA_REF (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (node)[0]))
2254       && ! vect_slp_analyze_and_verify_node_alignment
2255 	     (SLP_INSTANCE_TREE (instance)))
2256     return false;
2257 
2258   return true;
2259 }
2260 
2261 
2262 /* Analyze groups of accesses: check that DR belongs to a group of
2263    accesses of legal size, step, etc.  Detect gaps, single element
2264    interleaving, and other special cases. Set grouped access info.
2265    Collect groups of strided stores for further use in SLP analysis.
2266    Worker for vect_analyze_group_access.  */
2267 
2268 static bool
2269 vect_analyze_group_access_1 (struct data_reference *dr)
2270 {
2271   tree step = DR_STEP (dr);
2272   tree scalar_type = TREE_TYPE (DR_REF (dr));
2273   HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
2274   gimple *stmt = DR_STMT (dr);
2275   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2276   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2277   bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
2278   HOST_WIDE_INT dr_step = -1;
2279   HOST_WIDE_INT groupsize, last_accessed_element = 1;
2280   bool slp_impossible = false;
2281 
2282   /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2283      size of the interleaving group (including gaps).  */
2284   if (tree_fits_shwi_p (step))
2285     {
2286       dr_step = tree_to_shwi (step);
2287       /* Check that STEP is a multiple of type size.  Otherwise there is
2288          a non-element-sized gap at the end of the group which we
2289 	 cannot represent in GROUP_GAP or GROUP_SIZE.
2290 	 ???  As we can handle non-constant step fine here we should
2291 	 simply remove uses of GROUP_GAP between the last and first
2292 	 element and instead rely on DR_STEP.  GROUP_SIZE then would
2293 	 simply not include that gap.  */
2294       if ((dr_step % type_size) != 0)
2295 	{
2296 	  if (dump_enabled_p ())
2297 	    {
2298 	      dump_printf_loc (MSG_NOTE, vect_location,
2299 	                       "Step ");
2300 	      dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2301 	      dump_printf (MSG_NOTE,
2302 			   " is not a multiple of the element size for ");
2303 	      dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2304 	      dump_printf (MSG_NOTE, "\n");
2305 	    }
2306 	  return false;
2307 	}
2308       groupsize = absu_hwi (dr_step) / type_size;
2309     }
2310   else
2311     groupsize = 0;
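  /* For illustration (assumed values): an access like A[4*i] to a 4-byte
     int has DR_STEP = 16, so dr_step = 16 and groupsize = 16 / 4 = 4,
     i.e. the interleaving group spans four elements per iteration,
     counting gaps.  */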
2312 
  /* A non-consecutive access is possible only if it is part of an
     interleaving group.  */
2314   if (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
2315     {
      /* Check if this DR is part of an interleaving group, and is a single
	 element of the group that is accessed in the loop.  */
2318 
2319       /* Gaps are supported only for loads. STEP must be a multiple of the type
2320 	 size.  The size of the group must be a power of 2.  */
2321       if (DR_IS_READ (dr)
2322 	  && (dr_step % type_size) == 0
2323 	  && groupsize > 0
2324 	  && pow2p_hwi (groupsize))
2325 	{
2326 	  GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = stmt;
2327 	  GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2328 	  GROUP_GAP (stmt_info) = groupsize - 1;
2329 	  if (dump_enabled_p ())
2330 	    {
2331 	      dump_printf_loc (MSG_NOTE, vect_location,
2332 	                       "Detected single element interleaving ");
2333 	      dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2334 	      dump_printf (MSG_NOTE, " step ");
2335 	      dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2336 	      dump_printf (MSG_NOTE, "\n");
2337 	    }
2338 
2339 	  return true;
2340 	}
2341 
2342       if (dump_enabled_p ())
2343         {
2344  	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2345 	                   "not consecutive access ");
2346 	  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
2347         }
2348 
2349       if (bb_vinfo)
2350         {
2351           /* Mark the statement as unvectorizable.  */
2352           STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2353           return true;
2354         }
2355 
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location, "using strided accesses\n");
2357       STMT_VINFO_STRIDED_P (stmt_info) = true;
2358       return true;
2359     }
2360 
2361   if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) == stmt)
2362     {
2363       /* First stmt in the interleaving chain. Check the chain.  */
2364       gimple *next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
2365       struct data_reference *data_ref = dr;
2366       unsigned int count = 1;
2367       tree prev_init = DR_INIT (data_ref);
2368       gimple *prev = stmt;
2369       HOST_WIDE_INT diff, gaps = 0;
2370 
2371       while (next)
2372         {
          /* Skip same data-refs.  In case two or more stmts share a
             data-ref (supported only for loads), we vectorize only the first
             stmt, and the rest get their vectorized loads from the first
             one.  */
2377           if (!tree_int_cst_compare (DR_INIT (data_ref),
2378                                      DR_INIT (STMT_VINFO_DATA_REF (
2379 						   vinfo_for_stmt (next)))))
2380             {
2381               if (DR_IS_WRITE (data_ref))
2382                 {
2383                   if (dump_enabled_p ())
2384                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2385                                      "Two store stmts share the same dr.\n");
2386                   return false;
2387                 }
2388 
2389 	      if (dump_enabled_p ())
2390 		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2391 				 "Two or more load stmts share the same dr.\n");
2392 
2393               /* For load use the same data-ref load.  */
2394               GROUP_SAME_DR_STMT (vinfo_for_stmt (next)) = prev;
2395 
2396               prev = next;
2397               next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2398               continue;
2399             }
2400 
2401           prev = next;
2402           data_ref = STMT_VINFO_DATA_REF (vinfo_for_stmt (next));
2403 
2404 	  /* All group members have the same STEP by construction.  */
2405 	  gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0));
2406 
2407           /* Check that the distance between two accesses is equal to the type
2408              size. Otherwise, we have gaps.  */
2409           diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
2410                   - TREE_INT_CST_LOW (prev_init)) / type_size;
2411 	  if (diff != 1)
2412 	    {
2413 	      /* FORNOW: SLP of accesses with gaps is not supported.  */
2414 	      slp_impossible = true;
2415 	      if (DR_IS_WRITE (data_ref))
2416 		{
2417                   if (dump_enabled_p ())
2418                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2419                                      "interleaved store with gaps\n");
2420 		  return false;
2421 		}
2422 
2423               gaps += diff - 1;
2424 	    }
2425 
2426 	  last_accessed_element += diff;
2427 
2428           /* Store the gap from the previous member of the group. If there is no
2429              gap in the access, GROUP_GAP is always 1.  */
2430           GROUP_GAP (vinfo_for_stmt (next)) = diff;
2431 
2432           prev_init = DR_INIT (data_ref);
2433           next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2434           /* Count the number of data-refs in the chain.  */
2435           count++;
2436         }
2437 
2438       if (groupsize == 0)
2439         groupsize = count + gaps;
2440 
2441       /* This could be UINT_MAX but as we are generating code in a very
2442          inefficient way we have to cap earlier.  See PR78699 for example.  */
2443       if (groupsize > 4096)
2444 	{
2445 	  if (dump_enabled_p ())
2446 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2447 			     "group is too large\n");
2448 	  return false;
2449 	}
2450 
2451       /* Check that the size of the interleaving is equal to count for stores,
2452          i.e., that there are no gaps.  */
2453       if (groupsize != count
2454 	  && !DR_IS_READ (dr))
2455         {
2456 	  if (dump_enabled_p ())
2457 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2458 			     "interleaved store with gaps\n");
2459 	  return false;
2460 	}
2461 
2462       /* If there is a gap after the last load in the group it is the
2463 	 difference between the groupsize and the last accessed
2464 	 element.
2465 	 When there is no gap, this difference should be 0.  */
2466       GROUP_GAP (vinfo_for_stmt (stmt)) = groupsize - last_accessed_element;
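      /* Illustrative example (assumed values): for a load group with
	 groupsize 4 whose members access elements 0, 1 and 2 of each
	 period, last_accessed_element is 3 and the trailing gap recorded
	 here is 4 - 3 = 1.  */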
2467 
2468       GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2469       if (dump_enabled_p ())
2470 	{
2471 	  dump_printf_loc (MSG_NOTE, vect_location,
2472 			   "Detected interleaving ");
2473 	  if (DR_IS_READ (dr))
2474 	    dump_printf (MSG_NOTE, "load ");
2475 	  else
2476 	    dump_printf (MSG_NOTE, "store ");
2477 	  dump_printf (MSG_NOTE, "of size %u starting with ",
2478 		       (unsigned)groupsize);
2479 	  dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
2480 	  if (GROUP_GAP (vinfo_for_stmt (stmt)) != 0)
2481 	    dump_printf_loc (MSG_NOTE, vect_location,
2482 			     "There is a gap of %u elements after the group\n",
2483 			     GROUP_GAP (vinfo_for_stmt (stmt)));
2484 	}
2485 
      /* SLP: create an SLP data structure for every interleaving group of
	 stores for further analysis in vect_analyze_slp.  */
2488       if (DR_IS_WRITE (dr) && !slp_impossible)
2489         {
2490           if (loop_vinfo)
2491             LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt);
2492           if (bb_vinfo)
2493             BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt);
2494         }
2495     }
2496 
2497   return true;
2498 }
2499 
2500 /* Analyze groups of accesses: check that DR belongs to a group of
2501    accesses of legal size, step, etc.  Detect gaps, single element
2502    interleaving, and other special cases. Set grouped access info.
2503    Collect groups of strided stores for further use in SLP analysis.  */
2504 
2505 static bool
2506 vect_analyze_group_access (struct data_reference *dr)
2507 {
2508   if (!vect_analyze_group_access_1 (dr))
2509     {
2510       /* Dissolve the group if present.  */
2511       gimple *next;
2512       gimple *stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dr)));
2513       while (stmt)
2514 	{
2515 	  stmt_vec_info vinfo = vinfo_for_stmt (stmt);
2516 	  next = GROUP_NEXT_ELEMENT (vinfo);
2517 	  GROUP_FIRST_ELEMENT (vinfo) = NULL;
2518 	  GROUP_NEXT_ELEMENT (vinfo) = NULL;
2519 	  stmt = next;
2520 	}
2521       return false;
2522     }
2523   return true;
2524 }
2525 
2526 /* Analyze the access pattern of the data-reference DR.
2527    In case of non-consecutive accesses call vect_analyze_group_access() to
2528    analyze groups of accesses.  */
2529 
2530 static bool
2531 vect_analyze_data_ref_access (struct data_reference *dr)
2532 {
2533   tree step = DR_STEP (dr);
2534   tree scalar_type = TREE_TYPE (DR_REF (dr));
2535   gimple *stmt = DR_STMT (dr);
2536   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2537   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2538   struct loop *loop = NULL;
2539 
2540   if (loop_vinfo)
2541     loop = LOOP_VINFO_LOOP (loop_vinfo);
2542 
2543   if (loop_vinfo && !step)
2544     {
2545       if (dump_enabled_p ())
2546 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2547 	                 "bad data-ref access in loop\n");
2548       return false;
2549     }
2550 
2551   /* Allow loads with zero step in inner-loop vectorization.  */
2552   if (loop_vinfo && integer_zerop (step))
2553     {
2554       GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2555       if (!nested_in_vect_loop_p (loop, stmt))
2556 	return DR_IS_READ (dr);
2557       /* Allow references with zero step for outer loops marked
2558 	 with pragma omp simd only - it guarantees absence of
2559 	 loop-carried dependencies between inner loop iterations.  */
2560       if (!loop->force_vectorize || loop->safelen < 2)
2561 	{
2562 	  if (dump_enabled_p ())
2563 	    dump_printf_loc (MSG_NOTE, vect_location,
2564 			     "zero step in inner loop of nest\n");
2565 	  return false;
2566 	}
2567     }
2568 
2569   if (loop && nested_in_vect_loop_p (loop, stmt))
2570     {
2571       /* Interleaved accesses are not yet supported within outer-loop
2572         vectorization for references in the inner-loop.  */
2573       GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2574 
2575       /* For the rest of the analysis we use the outer-loop step.  */
2576       step = STMT_VINFO_DR_STEP (stmt_info);
2577       if (integer_zerop (step))
2578 	{
2579 	  if (dump_enabled_p ())
2580 	    dump_printf_loc (MSG_NOTE, vect_location,
2581 	                     "zero step in outer loop.\n");
2582 	  return DR_IS_READ (dr);
2583 	}
2584     }
2585 
2586   /* Consecutive?  */
2587   if (TREE_CODE (step) == INTEGER_CST)
2588     {
2589       HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
2590       if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))
2591 	  || (dr_step < 0
2592 	      && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
2593 	{
2594 	  /* Mark that it is not interleaving.  */
2595 	  GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2596 	  return true;
2597 	}
2598     }
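  /* For illustration (assumed types): an int access A[i] has DR_STEP = 4,
     equal to TYPE_SIZE_UNIT (int), so it is consecutive and is not part of
     an interleaving group; an access like A[2*i] has step 8 and falls
     through to the group analysis below.  */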
2599 
2600   if (loop && nested_in_vect_loop_p (loop, stmt))
2601     {
2602       if (dump_enabled_p ())
2603 	dump_printf_loc (MSG_NOTE, vect_location,
2604 	                 "grouped access in outer loop.\n");
2605       return false;
2606     }
2607 
2608 
  /* Assume this is a DR handled by the non-constant strided load case.  */
2610   if (TREE_CODE (step) != INTEGER_CST)
2611     return (STMT_VINFO_STRIDED_P (stmt_info)
2612 	    && (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
2613 		|| vect_analyze_group_access (dr)));
2614 
  /* Non-consecutive access - check whether it is part of an interleaving
     group.  */
2616   return vect_analyze_group_access (dr);
2617 }
2618 
2619 
2620 
/*  A helper function used in the comparator function to sort data
    references.  T1 and T2 are the two tree operands to be compared.
    The function returns -1, 0, or 1.  */
2624 
2625 static int
2626 compare_tree (tree t1, tree t2)
2627 {
2628   int i, cmp;
2629   enum tree_code code;
2630   char tclass;
2631 
2632   if (t1 == t2)
2633     return 0;
2634   if (t1 == NULL)
2635     return -1;
2636   if (t2 == NULL)
2637     return 1;
2638 
2639   STRIP_NOPS (t1);
2640   STRIP_NOPS (t2);
2641 
2642   if (TREE_CODE (t1) != TREE_CODE (t2))
2643     return TREE_CODE (t1) < TREE_CODE (t2) ? -1 : 1;
2644 
2645   code = TREE_CODE (t1);
2646   switch (code)
2647     {
2648     /* For const values, we can just use hash values for comparisons.  */
2649     case INTEGER_CST:
2650     case REAL_CST:
2651     case FIXED_CST:
2652     case STRING_CST:
2653     case COMPLEX_CST:
2654     case VECTOR_CST:
2655       {
2656 	hashval_t h1 = iterative_hash_expr (t1, 0);
2657 	hashval_t h2 = iterative_hash_expr (t2, 0);
2658 	if (h1 != h2)
2659 	  return h1 < h2 ? -1 : 1;
2660 	break;
2661       }
2662 
2663     case SSA_NAME:
2664       cmp = compare_tree (SSA_NAME_VAR (t1), SSA_NAME_VAR (t2));
2665       if (cmp != 0)
2666 	return cmp;
2667 
2668       if (SSA_NAME_VERSION (t1) != SSA_NAME_VERSION (t2))
2669 	return SSA_NAME_VERSION (t1) < SSA_NAME_VERSION (t2) ? -1 : 1;
2670       break;
2671 
2672     default:
2673       tclass = TREE_CODE_CLASS (code);
2674 
2675       /* For var-decl, we could compare their UIDs.  */
2676       if (tclass == tcc_declaration)
2677 	{
2678 	  if (DECL_UID (t1) != DECL_UID (t2))
2679 	    return DECL_UID (t1) < DECL_UID (t2) ? -1 : 1;
2680 	  break;
2681 	}
2682 
2683       /* For expressions with operands, compare their operands recursively.  */
2684       for (i = TREE_OPERAND_LENGTH (t1) - 1; i >= 0; --i)
2685 	{
2686 	  cmp = compare_tree (TREE_OPERAND (t1, i), TREE_OPERAND (t2, i));
2687 	  if (cmp != 0)
2688 	    return cmp;
2689 	}
2690     }
2691 
2692   return 0;
2693 }
2694 
2695 
/* Compare two data-references DRA and DRB so that they can be sorted into
   chunks suitable for grouping.  */
2698 
2699 static int
2700 dr_group_sort_cmp (const void *dra_, const void *drb_)
2701 {
2702   data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_);
2703   data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_);
2704   int cmp;
2705 
2706   /* Stabilize sort.  */
2707   if (dra == drb)
2708     return 0;
2709 
2710   /* DRs in different loops never belong to the same group.  */
2711   loop_p loopa = gimple_bb (DR_STMT (dra))->loop_father;
2712   loop_p loopb = gimple_bb (DR_STMT (drb))->loop_father;
2713   if (loopa != loopb)
2714     return loopa->num < loopb->num ? -1 : 1;
2715 
2716   /* Ordering of DRs according to base.  */
2717   if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0))
2718     {
2719       cmp = compare_tree (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb));
2720       if (cmp != 0)
2721         return cmp;
2722     }
2723 
2724   /* And according to DR_OFFSET.  */
2725   if (!dr_equal_offsets_p (dra, drb))
2726     {
2727       cmp = compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
2728       if (cmp != 0)
2729         return cmp;
2730     }
2731 
2732   /* Put reads before writes.  */
2733   if (DR_IS_READ (dra) != DR_IS_READ (drb))
2734     return DR_IS_READ (dra) ? -1 : 1;
2735 
  /* Then sort by access size.  */
2737   if (!operand_equal_p (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2738 			TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))), 0))
2739     {
2740       cmp = compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2741                           TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
2742       if (cmp != 0)
2743         return cmp;
2744     }
2745 
  /* And by step.  */
2747   if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2748     {
2749       cmp = compare_tree (DR_STEP (dra), DR_STEP (drb));
2750       if (cmp != 0)
2751         return cmp;
2752     }
2753 
  /* Then sort by DR_INIT.  In case of identical DRs, sort by stmt UID.  */
2755   cmp = tree_int_cst_compare (DR_INIT (dra), DR_INIT (drb));
2756   if (cmp == 0)
2757     return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
2758   return cmp;
2759 }
2760 
2761 /* Function vect_analyze_data_ref_accesses.
2762 
2763    Analyze the access pattern of all the data references in the loop.
2764 
2765    FORNOW: the only access pattern that is considered vectorizable is a
2766 	   simple step 1 (consecutive) access.
2767 
2768    FORNOW: handle only arrays and pointer accesses.  */
2769 
2770 bool
2771 vect_analyze_data_ref_accesses (vec_info *vinfo)
2772 {
2773   unsigned int i;
2774   vec<data_reference_p> datarefs = vinfo->datarefs;
2775   struct data_reference *dr;
2776 
2777   if (dump_enabled_p ())
2778     dump_printf_loc (MSG_NOTE, vect_location,
2779                      "=== vect_analyze_data_ref_accesses ===\n");
2780 
2781   if (datarefs.is_empty ())
2782     return true;
2783 
  /* Sort the array of datarefs to make building the interleaving chains
     linear.  Don't modify the original vector's order; it is needed for
     determining what dependencies are reversed.  */
2787   vec<data_reference_p> datarefs_copy = datarefs.copy ();
2788   datarefs_copy.qsort (dr_group_sort_cmp);
2789 
2790   /* Build the interleaving chains.  */
2791   for (i = 0; i < datarefs_copy.length () - 1;)
2792     {
2793       data_reference_p dra = datarefs_copy[i];
2794       stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
2795       stmt_vec_info lastinfo = NULL;
2796       if (! STMT_VINFO_VECTORIZABLE (stmtinfo_a))
2797 	{
2798 	  ++i;
2799 	  continue;
2800 	}
2801       for (i = i + 1; i < datarefs_copy.length (); ++i)
2802 	{
2803 	  data_reference_p drb = datarefs_copy[i];
2804 	  stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
2805 	  if (! STMT_VINFO_VECTORIZABLE (stmtinfo_b))
2806 	    break;
2807 
	  /* ???  Imperfect sorting (non-compatible types, non-modulo
	     accesses, same accesses) can lead to a group being artificially
	     split here as we don't just skip over those.  If it really
	     matters we can push those to a worklist and re-iterate
	     over them.  Then we can just skip ahead to the next DR here.  */
2813 
2814 	  /* DRs in a different loop should not be put into the same
2815 	     interleaving group.  */
2816 	  if (gimple_bb (DR_STMT (dra))->loop_father
2817 	      != gimple_bb (DR_STMT (drb))->loop_father)
2818 	    break;
2819 
	  /* Check that the data-refs have the same first location (except init)
	     and that they are both either stores or loads (not one load and one
	     store, and not masked loads or stores).  */
2823 	  if (DR_IS_READ (dra) != DR_IS_READ (drb)
2824 	      || !operand_equal_p (DR_BASE_ADDRESS (dra),
2825 				   DR_BASE_ADDRESS (drb), 0)
2826 	      || !dr_equal_offsets_p (dra, drb)
2827 	      || !gimple_assign_single_p (DR_STMT (dra))
2828 	      || !gimple_assign_single_p (DR_STMT (drb)))
2829 	    break;
2830 
2831 	  /* Check that the data-refs have the same constant size.  */
2832 	  tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
2833 	  tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
2834 	  if (!tree_fits_uhwi_p (sza)
2835 	      || !tree_fits_uhwi_p (szb)
2836 	      || !tree_int_cst_equal (sza, szb))
2837 	    break;
2838 
2839 	  /* Check that the data-refs have the same step.  */
2840 	  if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2841 	    break;
2842 
2843 	  /* Do not place the same access in the interleaving chain twice.  */
2844 	  if (tree_int_cst_compare (DR_INIT (dra), DR_INIT (drb)) == 0)
2845 	    break;
2846 
2847 	  /* Check the types are compatible.
2848 	     ???  We don't distinguish this during sorting.  */
2849 	  if (!types_compatible_p (TREE_TYPE (DR_REF (dra)),
2850 				   TREE_TYPE (DR_REF (drb))))
2851 	    break;
2852 
2853 	  /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb).  */
2854 	  HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
2855 	  HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
2856 	  gcc_assert (init_a <= init_b);
2857 
2858 	  /* If init_b == init_a + the size of the type * k, we have an
2859 	     interleaving, and DRA is accessed before DRB.  */
2860 	  HOST_WIDE_INT type_size_a = tree_to_uhwi (sza);
2861 	  if (type_size_a == 0
2862 	      || (init_b - init_a) % type_size_a != 0)
2863 	    break;
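	  /* Illustrative example (assumed values): for two 4-byte int loads
	     with DR_INIT values 0 and 8, init_b - init_a = 8 is a multiple
	     of type_size_a = 4, so DRB may still join DRA's interleaving
	     chain (with a one-element gap between them).  */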
2864 
	  /* If we have a store, the accesses must be adjacent.  This splits
	     groups into chunks we support (we don't support vectorization
	     of stores with gaps).  */
2868 	  if (!DR_IS_READ (dra)
2869 	      && (init_b - (HOST_WIDE_INT) TREE_INT_CST_LOW
2870 					     (DR_INIT (datarefs_copy[i-1]))
2871 		  != type_size_a))
2872 	    break;
2873 
	  /* If the step (when constant and nonzero) is not greater than the
	     difference between the data-refs' inits, DRB cannot belong to
	     DRA's group; breaking here splits groups into suitable sizes.  */
2877 	  if (tree_fits_shwi_p (DR_STEP (dra)))
2878 	    {
2879 	      HOST_WIDE_INT step = tree_to_shwi (DR_STEP (dra));
2880 	      if (step != 0 && step <= (init_b - init_a))
2881 		break;
2882 	    }
2883 
2884 	  if (dump_enabled_p ())
2885 	    {
2886 	      dump_printf_loc (MSG_NOTE, vect_location,
2887 			       "Detected interleaving ");
2888 	      if (DR_IS_READ (dra))
2889 		dump_printf (MSG_NOTE, "load ");
2890 	      else
2891 		dump_printf (MSG_NOTE, "store ");
2892 	      dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
2893 	      dump_printf (MSG_NOTE,  " and ");
2894 	      dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
2895 	      dump_printf (MSG_NOTE, "\n");
2896 	    }
2897 
2898 	  /* Link the found element into the group list.  */
2899 	  if (!GROUP_FIRST_ELEMENT (stmtinfo_a))
2900 	    {
2901 	      GROUP_FIRST_ELEMENT (stmtinfo_a) = DR_STMT (dra);
2902 	      lastinfo = stmtinfo_a;
2903 	    }
2904 	  GROUP_FIRST_ELEMENT (stmtinfo_b) = DR_STMT (dra);
2905 	  GROUP_NEXT_ELEMENT (lastinfo) = DR_STMT (drb);
2906 	  lastinfo = stmtinfo_b;
2907 	}
2908     }
2909 
2910   FOR_EACH_VEC_ELT (datarefs_copy, i, dr)
2911     if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr)))
2912         && !vect_analyze_data_ref_access (dr))
2913       {
2914 	if (dump_enabled_p ())
2915 	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2916 	                   "not vectorized: complicated access pattern.\n");
2917 
2918         if (is_a <bb_vec_info> (vinfo))
2919           {
2920             /* Mark the statement as not vectorizable.  */
2921             STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2922             continue;
2923           }
2924         else
2925 	  {
2926 	    datarefs_copy.release ();
2927 	    return false;
2928 	  }
2929       }
2930 
2931   datarefs_copy.release ();
2932   return true;
2933 }
2934 
2935 
2936 /* Operator == between two dr_with_seg_len objects.
2937 
   This equality operator is used to make sure two data refs
   are the same, so that we will consider combining the
   aliasing checks of those two pairs of data-dependent data
   refs.  */
2942 
2943 static bool
2944 operator == (const dr_with_seg_len& d1,
2945 	     const dr_with_seg_len& d2)
2946 {
2947   return operand_equal_p (DR_BASE_ADDRESS (d1.dr),
2948 			  DR_BASE_ADDRESS (d2.dr), 0)
2949 	   && compare_tree (DR_OFFSET (d1.dr), DR_OFFSET (d2.dr)) == 0
2950 	   && compare_tree (DR_INIT (d1.dr), DR_INIT (d2.dr)) == 0
2951 	   && compare_tree (d1.seg_len, d2.seg_len) == 0;
2952 }
2953 
2954 /* Function comp_dr_with_seg_len_pair.
2955 
2956    Comparison function for sorting objects of dr_with_seg_len_pair_t
2957    so that we can combine aliasing checks in one scan.  */
2958 
2959 static int
2960 comp_dr_with_seg_len_pair (const void *pa_, const void *pb_)
2961 {
2962   const dr_with_seg_len_pair_t* pa = (const dr_with_seg_len_pair_t *) pa_;
2963   const dr_with_seg_len_pair_t* pb = (const dr_with_seg_len_pair_t *) pb_;
2964   const dr_with_seg_len &a1 = pa->first, &a2 = pa->second;
2965   const dr_with_seg_len &b1 = pb->first, &b2 = pb->second;
2966 
  /* For DR pairs (a, b) and (c, d), we only consider merging the alias checks
     if a and c have the same base address and step, and b and d have the same
     base address and step.  Therefore, if either a&c or b&d don't have the
     same address and step, we don't care about the order of those two pairs
     after sorting.  */
2971   int comp_res;
2972 
2973   if ((comp_res = compare_tree (DR_BASE_ADDRESS (a1.dr),
2974 				DR_BASE_ADDRESS (b1.dr))) != 0)
2975     return comp_res;
2976   if ((comp_res = compare_tree (DR_BASE_ADDRESS (a2.dr),
2977 				DR_BASE_ADDRESS (b2.dr))) != 0)
2978     return comp_res;
2979   if ((comp_res = compare_tree (DR_STEP (a1.dr), DR_STEP (b1.dr))) != 0)
2980     return comp_res;
2981   if ((comp_res = compare_tree (DR_STEP (a2.dr), DR_STEP (b2.dr))) != 0)
2982     return comp_res;
2983   if ((comp_res = compare_tree (DR_OFFSET (a1.dr), DR_OFFSET (b1.dr))) != 0)
2984     return comp_res;
2985   if ((comp_res = compare_tree (DR_INIT (a1.dr), DR_INIT (b1.dr))) != 0)
2986     return comp_res;
2987   if ((comp_res = compare_tree (DR_OFFSET (a2.dr), DR_OFFSET (b2.dr))) != 0)
2988     return comp_res;
2989   if ((comp_res = compare_tree (DR_INIT (a2.dr), DR_INIT (b2.dr))) != 0)
2990     return comp_res;
2991 
2992   return 0;
2993 }
2994 
2995 /* Function vect_vfa_segment_size.
2996 
2997    Create an expression that computes the size of the segment
2998    that will be accessed for a data reference.  The function takes into
2999    account that realignment loads may access one more vector.
3000 
3001    Input:
3002      DR: The data reference.
3003      LENGTH_FACTOR: segment length to consider.
3004 
3005    Return an expression whose value is the size of segment which will be
3006    Return an expression whose value is the size of the segment which will be
3007 
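
/* A minimal worked example with assumed numbers: for a data reference whose
   constant step is 4 bytes (an "int" access) and a LENGTH_FACTOR equal to a
   vectorization factor of 4, the segment size is 4 * 4 = 16 bytes; if the
   access additionally uses the dr_explicit_realign_optimized scheme with a
   16-byte vector type, one more vector is counted and the result is 32 bytes.  */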
3008 static tree
3009 vect_vfa_segment_size (struct data_reference *dr, tree length_factor)
3010 {
3011   tree segment_length;
3012 
3013   if (integer_zerop (DR_STEP (dr)))
3014     segment_length = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
3015   else
3016     segment_length = size_binop (MULT_EXPR,
3017 				 fold_convert (sizetype, DR_STEP (dr)),
3018 				 fold_convert (sizetype, length_factor));
3019 
3020   if (vect_supportable_dr_alignment (dr, false)
3021 	== dr_explicit_realign_optimized)
3022     {
3023       tree vector_size = TYPE_SIZE_UNIT
3024 			  (STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr))));
3025 
3026       segment_length = size_binop (PLUS_EXPR, segment_length, vector_size);
3027     }
3028   return segment_length;
3029 }
3030 
3031 /* Function vect_no_alias_p.
3032 
3033    Given data references A and B with equal base and offset, whether they
3034    alias can be decided at compilation time.  Return TRUE if they do
3035    not alias each other; return FALSE otherwise.  SEGMENT_LENGTH_A
3036    and SEGMENT_LENGTH_B are the memory lengths accessed by A and B
3037    respectively.  */
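
/* For instance, assuming positive steps, DR_INIT (a) == 0 with
   SEGMENT_LENGTH_A == 16 and DR_INIT (b) == 16 with SEGMENT_LENGTH_B == 16
   give the disjoint ranges [0, 16) and [16, 32), so the function returns
   true; with DR_INIT (b) == 8 the ranges overlap and it returns false.  */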
3038 
3039 static bool
3040 vect_no_alias_p (struct data_reference *a, struct data_reference *b,
3041                  tree segment_length_a, tree segment_length_b)
3042 {
3043   gcc_assert (TREE_CODE (DR_INIT (a)) == INTEGER_CST
3044 	      && TREE_CODE (DR_INIT (b)) == INTEGER_CST);
3045   if (tree_int_cst_equal (DR_INIT (a), DR_INIT (b)))
3046     return false;
3047 
3048   tree seg_a_min = DR_INIT (a);
3049   tree seg_a_max = fold_build2 (PLUS_EXPR, TREE_TYPE (seg_a_min),
3050 				seg_a_min, segment_length_a);
3051   /* For negative step, we need to adjust address range by TYPE_SIZE_UNIT
3052      bytes, e.g., int a[3] -> a[1] range is [a+4, a+16) instead of
3053      [a, a+12) */
3054   if (tree_int_cst_compare (DR_STEP (a), size_zero_node) < 0)
3055     {
3056       tree unit_size = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (a)));
3057       seg_a_min = fold_build2 (PLUS_EXPR, TREE_TYPE (seg_a_max),
3058 			       seg_a_max, unit_size);
3059       seg_a_max = fold_build2 (PLUS_EXPR, TREE_TYPE (DR_INIT (a)),
3060 			       DR_INIT (a), unit_size);
3061     }
3062   tree seg_b_min = DR_INIT (b);
3063   tree seg_b_max = fold_build2 (PLUS_EXPR, TREE_TYPE (seg_b_min),
3064 				seg_b_min, segment_length_b);
3065   if (tree_int_cst_compare (DR_STEP (b), size_zero_node) < 0)
3066     {
3067       tree unit_size = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (b)));
3068       seg_b_min = fold_build2 (PLUS_EXPR, TREE_TYPE (seg_b_max),
3069 			       seg_b_max, unit_size);
3070       seg_b_max = fold_build2 (PLUS_EXPR, TREE_TYPE (DR_INIT (b)),
3071 			       DR_INIT (b), unit_size);
3072     }
3073 
3074   if (tree_int_cst_le (seg_a_max, seg_b_min)
3075       || tree_int_cst_le (seg_b_max, seg_a_min))
3076     return true;
3077 
3078   return false;
3079 }
3080 
3081 /* Function vect_prune_runtime_alias_test_list.
3082 
3083    Prune a list of ddrs to be tested at run-time by versioning for alias.
3084    Merge several alias checks into one if possible.
3085    Return FALSE if the resulting list of ddrs is longer than allowed by
3086    PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE.  */
3087 
3088 bool
3089 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
3090 {
3091   vec<ddr_p> may_alias_ddrs =
3092     LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
3093   vec<dr_with_seg_len_pair_t>& comp_alias_ddrs =
3094     LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
3095   int vect_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3096   tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
3097 
3098   ddr_p ddr;
3099   unsigned int i;
3100   tree length_factor;
3101 
3102   if (dump_enabled_p ())
3103     dump_printf_loc (MSG_NOTE, vect_location,
3104                      "=== vect_prune_runtime_alias_test_list ===\n");
3105 
3106   if (may_alias_ddrs.is_empty ())
3107     return true;
3108 
3109   /* Basically, for each pair of dependent data refs store_ptr_0
3110      and load_ptr_0, we create an expression:
3111 
3112      ((store_ptr_0 + store_segment_length_0) <= load_ptr_0)
3113      || ((load_ptr_0 + load_segment_length_0) <= store_ptr_0)
3114 
3115      for aliasing checks.  However, in some cases we can decrease
3116      the number of checks by combining two checks into one.  For
3117      example, suppose we have another pair of data refs store_ptr_0
3118      and load_ptr_1, and if the following condition is satisfied:
3119 
3120      load_ptr_0 < load_ptr_1  &&
3121      load_ptr_1 - load_ptr_0 - load_segment_length_0 < store_segment_length_0
3122 
3123      (this condition means that, in each iteration of the vectorized loop,
3124      the accessed memory of store_ptr_0 cannot be between the memory
3125      of load_ptr_0 and load_ptr_1.)
3126 
3127      we can then use only the following expression to finish the
3128      aliasing checks between store_ptr_0 & load_ptr_0 and
3129      store_ptr_0 & load_ptr_1:
3130 
3131      ((store_ptr_0 + store_segment_length_0) <= load_ptr_0)
3132      || ((load_ptr_1 + load_segment_length_1) <= store_ptr_0)
3133 
3134      Note that we only consider merging when load_ptr_0 and load_ptr_1
3135      have the same base address.  */
3136 
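  /* As an assumed illustration, in a loop like

	for (i = 0; i < n; i++)
	  a[i] = b[i] + b[i + 1];

     where a and b are pointers that may alias, the loads b[i] and b[i + 1]
     share the same base address and differ only by a small constant offset,
     so the merging below widens one segment to cover both loads and emits a
     single runtime check against the store to a[i] instead of two.  */
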
3137   comp_alias_ddrs.create (may_alias_ddrs.length ());
3138 
3139   /* First, we collect all data ref pairs for aliasing checks.  */
3140   FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr)
3141     {
3142       int comp_res;
3143       struct data_reference *dr_a, *dr_b;
3144       gimple *dr_group_first_a, *dr_group_first_b;
3145       tree segment_length_a, segment_length_b;
3146       gimple *stmt_a, *stmt_b;
3147 
3148       dr_a = DDR_A (ddr);
3149       stmt_a = DR_STMT (DDR_A (ddr));
3150       dr_group_first_a = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_a));
3151       if (dr_group_first_a)
3152 	{
3153 	  stmt_a = dr_group_first_a;
3154 	  dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_a));
3155 	}
3156 
3157       dr_b = DDR_B (ddr);
3158       stmt_b = DR_STMT (DDR_B (ddr));
3159       dr_group_first_b = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_b));
3160       if (dr_group_first_b)
3161 	{
3162 	  stmt_b = dr_group_first_b;
3163 	  dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_b));
3164 	}
3165 
3166       if (!operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0))
3167 	length_factor = scalar_loop_iters;
3168       else
3169 	length_factor = size_int (vect_factor);
3170       segment_length_a = vect_vfa_segment_size (dr_a, length_factor);
3171       segment_length_b = vect_vfa_segment_size (dr_b, length_factor);
3172 
3173       comp_res = compare_tree (DR_BASE_ADDRESS (dr_a), DR_BASE_ADDRESS (dr_b));
3174       if (comp_res == 0)
3175 	comp_res = compare_tree (DR_OFFSET (dr_a), DR_OFFSET (dr_b));
3176 
3177       /* Alias is known at compilation time.  */
3178       if (comp_res == 0
3179 	  && TREE_CODE (DR_STEP (dr_a)) == INTEGER_CST
3180 	  && TREE_CODE (DR_STEP (dr_b)) == INTEGER_CST
3181 	  && TREE_CODE (segment_length_a) == INTEGER_CST
3182 	  && TREE_CODE (segment_length_b) == INTEGER_CST)
3183 	{
3184 	  if (vect_no_alias_p (dr_a, dr_b, segment_length_a, segment_length_b))
3185 	    continue;
3186 
3187 	  if (dump_enabled_p ())
3188 	    dump_printf_loc (MSG_NOTE, vect_location,
3189 			     "not vectorized: compilation time alias.\n");
3190 
3191 	  return false;
3192 	}
3193 
3194       dr_with_seg_len_pair_t dr_with_seg_len_pair
3195 	  (dr_with_seg_len (dr_a, segment_length_a),
3196 	   dr_with_seg_len (dr_b, segment_length_b));
3197 
3198       /* Canonicalize pairs by sorting the two DR members.  */
3199       if (comp_res > 0)
3200 	std::swap (dr_with_seg_len_pair.first, dr_with_seg_len_pair.second);
3201 
3202       comp_alias_ddrs.safe_push (dr_with_seg_len_pair);
3203     }
3204 
3205   /* Second, we sort the collected data ref pairs so that we can scan
3206      them once to combine all possible aliasing checks.  */
3207   comp_alias_ddrs.qsort (comp_dr_with_seg_len_pair);
3208 
3209   /* Third, we scan the sorted dr pairs and check if we can combine
3210      alias checks of two neighboring dr pairs.  */
3211   for (size_t i = 1; i < comp_alias_ddrs.length (); ++i)
3212     {
3213       /* Deal with two ddrs (dr_a1, dr_b1) and (dr_a2, dr_b2).  */
3214       dr_with_seg_len *dr_a1 = &comp_alias_ddrs[i-1].first,
3215 		      *dr_b1 = &comp_alias_ddrs[i-1].second,
3216 		      *dr_a2 = &comp_alias_ddrs[i].first,
3217 		      *dr_b2 = &comp_alias_ddrs[i].second;
3218 
3219       /* Remove duplicate data ref pairs.  */
3220       if (*dr_a1 == *dr_a2 && *dr_b1 == *dr_b2)
3221 	{
3222 	  if (dump_enabled_p ())
3223 	    {
3224 	      dump_printf_loc (MSG_NOTE, vect_location,
3225 			       "found equal ranges ");
3226 	      dump_generic_expr (MSG_NOTE, TDF_SLIM,
3227 				 DR_REF (dr_a1->dr));
3228 	      dump_printf (MSG_NOTE,  ", ");
3229 	      dump_generic_expr (MSG_NOTE, TDF_SLIM,
3230 				 DR_REF (dr_b1->dr));
3231 	      dump_printf (MSG_NOTE,  " and ");
3232 	      dump_generic_expr (MSG_NOTE, TDF_SLIM,
3233 				 DR_REF (dr_a2->dr));
3234 	      dump_printf (MSG_NOTE,  ", ");
3235 	      dump_generic_expr (MSG_NOTE, TDF_SLIM,
3236 				 DR_REF (dr_b2->dr));
3237 	      dump_printf (MSG_NOTE, "\n");
3238 	    }
3239 
3240 	  comp_alias_ddrs.ordered_remove (i--);
3241 	  continue;
3242 	}
3243 
3244       if (*dr_a1 == *dr_a2 || *dr_b1 == *dr_b2)
3245 	{
3246 	  /* We consider the case that DR_B1 and DR_B2 are the same memrefs,
3247 	     and DR_A1 and DR_A2 are two consecutive memrefs.  */
3248 	  if (*dr_a1 == *dr_a2)
3249 	    {
3250 	      std::swap (dr_a1, dr_b1);
3251 	      std::swap (dr_a2, dr_b2);
3252 	    }
3253 
3254 	  if (!operand_equal_p (DR_BASE_ADDRESS (dr_a1->dr),
3255 				DR_BASE_ADDRESS (dr_a2->dr), 0)
3256 	      || !operand_equal_p (DR_OFFSET (dr_a1->dr),
3257 				   DR_OFFSET (dr_a2->dr), 0)
3258 	      || !tree_fits_shwi_p (DR_INIT (dr_a1->dr))
3259 	      || !tree_fits_shwi_p (DR_INIT (dr_a2->dr)))
3260 	    continue;
3261 
3262 	  /* Make sure dr_a1 starts left of dr_a2.  */
3263 	  if (tree_int_cst_lt (DR_INIT (dr_a2->dr), DR_INIT (dr_a1->dr)))
3264 	    std::swap (*dr_a1, *dr_a2);
3265 
3266 	  bool do_remove = false;
3267 	  unsigned HOST_WIDE_INT diff
3268 	    = (tree_to_shwi (DR_INIT (dr_a2->dr))
3269                - tree_to_shwi (DR_INIT (dr_a1->dr)));
3270 
3271 	  /* If the left segment does not extend beyond the start of the
3272 	     right segment, the new segment length is that of the right
3273 	     plus the segment distance.  */
3274 	  if (tree_fits_uhwi_p (dr_a1->seg_len)
3275 	      && compare_tree_int (dr_a1->seg_len, diff) <= 0)
3276 	    {
3277 	      dr_a1->seg_len = size_binop (PLUS_EXPR, dr_a2->seg_len,
3278 					   size_int (diff));
3279 	      do_remove = true;
3280 	    }
3281 	  /* Generally the new segment length is the maximum of the
3282 	     left segment size and the right segment size plus the distance.
3283 	     ???  We can also build tree MAX_EXPR here but it's not clear this
3284 	     is profitable.  */
3285 	  else if (tree_fits_uhwi_p (dr_a1->seg_len)
3286 		   && tree_fits_uhwi_p (dr_a2->seg_len))
3287 	    {
3288 	      unsigned HOST_WIDE_INT seg_len_a1 = tree_to_uhwi (dr_a1->seg_len);
3289 	      unsigned HOST_WIDE_INT seg_len_a2 = tree_to_uhwi (dr_a2->seg_len);
3290 	      dr_a1->seg_len = size_int (MAX (seg_len_a1, diff + seg_len_a2));
3291 	      do_remove = true;
3292 	    }
3293 	  /* Now we check if the following condition is satisfied:
3294 
3295 	     DIFF - SEGMENT_LENGTH_A < SEGMENT_LENGTH_B
3296 
3297 	     where DIFF = DR_A2_INIT - DR_A1_INIT.  However,
3298 	     SEGMENT_LENGTH_A or SEGMENT_LENGTH_B may not be constant, so we
3299 	     have to make a best estimate.  We take a constant lower
3300 	     bound on SEGMENT_LENGTH_B, represented by MIN_SEG_LEN_B;
3301 	     then either of the following two conditions guarantees
3302 	     the one above:
3303 
3304 	     1: DIFF <= MIN_SEG_LEN_B
3305 	     2: DIFF - SEGMENT_LENGTH_A < MIN_SEG_LEN_B  */
3306 	  else
3307 	    {
3308 	      unsigned HOST_WIDE_INT min_seg_len_b
3309 		= (tree_fits_uhwi_p (dr_b1->seg_len)
3310 		   ? tree_to_uhwi (dr_b1->seg_len)
3311 		   : vect_factor);
3312 
3313 	      if (diff <= min_seg_len_b
3314 		  || (tree_fits_uhwi_p (dr_a1->seg_len)
3315 		      && diff - tree_to_uhwi (dr_a1->seg_len) < min_seg_len_b))
3316 		{
3317 		  dr_a1->seg_len = size_binop (PLUS_EXPR,
3318 					       dr_a2->seg_len, size_int (diff));
3319 		  do_remove = true;
3320 		}
3321 	    }
3322 
3323 	  if (do_remove)
3324 	    {
3325 	      if (dump_enabled_p ())
3326 		{
3327 		  dump_printf_loc (MSG_NOTE, vect_location,
3328 				   "merging ranges for ");
3329 		  dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a1->dr));
3330 		  dump_printf (MSG_NOTE,  ", ");
3331 		  dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b1->dr));
3332 		  dump_printf (MSG_NOTE,  " and ");
3333 		  dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a2->dr));
3334 		  dump_printf (MSG_NOTE,  ", ");
3335 		  dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b2->dr));
3336 		  dump_printf (MSG_NOTE, "\n");
3337 		}
3338 	      comp_alias_ddrs.ordered_remove (i--);
3339 	    }
3340 	}
3341     }
3342 
3343   dump_printf_loc (MSG_NOTE, vect_location,
3344 		   "improved number of alias checks from %d to %d\n",
3345 		   may_alias_ddrs.length (), comp_alias_ddrs.length ());
3346   if ((int) comp_alias_ddrs.length () >
3347       PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS))
3348     {
3349       if (dump_enabled_p ())
3350 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3351 			 "number of versioning for alias "
3352 			 "run-time tests exceeds %d "
3353 			 "(--param vect-max-version-for-alias-checks)\n",
3354 			 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
3355       return false;
3356     }
3357 
3358   /* All alias checks have been resolved at compilation time.  */
3359   if (!comp_alias_ddrs.length ())
3360     LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).truncate (0);
3361 
3362   return true;
3363 }
3364 
3365 /* Return true if a non-affine read or write in STMT is suitable for a
3366    gather load or scatter store.  Describe the operation in *INFO if so.  */
3367 
3368 bool
3369 vect_check_gather_scatter (gimple *stmt, loop_vec_info loop_vinfo,
3370 			   gather_scatter_info *info)
3371 {
3372   HOST_WIDE_INT scale = 1, pbitpos, pbitsize;
3373   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3374   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3375   struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3376   tree offtype = NULL_TREE;
3377   tree decl, base, off;
3378   machine_mode pmode;
3379   int punsignedp, reversep, pvolatilep = 0;
3380 
3381   base = DR_REF (dr);
3382   /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
3383      see if we can use the def stmt of the address.  */
3384   if (is_gimple_call (stmt)
3385       && gimple_call_internal_p (stmt)
3386       && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
3387 	  || gimple_call_internal_fn (stmt) == IFN_MASK_STORE)
3388       && TREE_CODE (base) == MEM_REF
3389       && TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME
3390       && integer_zerop (TREE_OPERAND (base, 1))
3391       && !expr_invariant_in_loop_p (loop, TREE_OPERAND (base, 0)))
3392     {
3393       gimple *def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (base, 0));
3394       if (is_gimple_assign (def_stmt)
3395 	  && gimple_assign_rhs_code (def_stmt) == ADDR_EXPR)
3396 	base = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
3397     }
3398 
3399   /* The gather and scatter builtins need an address of the form
3400      loop_invariant + vector * {1, 2, 4, 8}
3401      or
3402      loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
3403      Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
3404      of loop invariants/SSA_NAMEs defined in the loop, with casts,
3405      multiplications and additions in it.  To get a vector, we need
3406      a single SSA_NAME that will be defined in the loop and will
3407      contain everything that is not loop invariant and that can be
3408      vectorized.  The following code attempts to find such a preexisting
3409      SSA_NAME OFF and put the loop invariants into a tree BASE
3410      that can be gimplified before the loop.  */
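  /* As an assumed example, for a gather-style access such as

	for (i = 0; i < n; i++)
	  sum += data[idx[i]];

     the SSA_NAME computing idx[i] (possibly after a sign extension) is the
     OFF we are looking for, the element size of data becomes the scale, and
     &data together with any constant offset is collected into BASE.  */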
3411   base = get_inner_reference (base, &pbitsize, &pbitpos, &off, &pmode,
3412 			      &punsignedp, &reversep, &pvolatilep);
3413   gcc_assert (base && (pbitpos % BITS_PER_UNIT) == 0 && !reversep);
3414 
3415   if (TREE_CODE (base) == MEM_REF)
3416     {
3417       if (!integer_zerop (TREE_OPERAND (base, 1)))
3418 	{
3419 	  if (off == NULL_TREE)
3420 	    {
3421 	      offset_int moff = mem_ref_offset (base);
3422 	      off = wide_int_to_tree (sizetype, moff);
3423 	    }
3424 	  else
3425 	    off = size_binop (PLUS_EXPR, off,
3426 			      fold_convert (sizetype, TREE_OPERAND (base, 1)));
3427 	}
3428       base = TREE_OPERAND (base, 0);
3429     }
3430   else
3431     base = build_fold_addr_expr (base);
3432 
3433   if (off == NULL_TREE)
3434     off = size_zero_node;
3435 
3436 	  /* If base is not loop invariant, then if off is 0 we start with just
3437 	     the constant offset in the loop invariant BASE and continue with base
3438 	     as OFF; otherwise give up.
3439      We could handle that case by gimplifying the addition of base + off
3440      into some SSA_NAME and use that as off, but for now punt.  */
3441   if (!expr_invariant_in_loop_p (loop, base))
3442     {
3443       if (!integer_zerop (off))
3444 	return false;
3445       off = base;
3446       base = size_int (pbitpos / BITS_PER_UNIT);
3447     }
3448   /* Otherwise put base + constant offset into the loop invariant BASE
3449      and continue with OFF.  */
3450   else
3451     {
3452       base = fold_convert (sizetype, base);
3453       base = size_binop (PLUS_EXPR, base, size_int (pbitpos / BITS_PER_UNIT));
3454     }
3455 
3456   /* OFF at this point may be either an SSA_NAME or some tree expression
3457      from get_inner_reference.  Try to peel off loop invariants from it
3458      into BASE as long as possible.  */
3459   STRIP_NOPS (off);
3460   while (offtype == NULL_TREE)
3461     {
3462       enum tree_code code;
3463       tree op0, op1, add = NULL_TREE;
3464 
3465       if (TREE_CODE (off) == SSA_NAME)
3466 	{
3467 	  gimple *def_stmt = SSA_NAME_DEF_STMT (off);
3468 
3469 	  if (expr_invariant_in_loop_p (loop, off))
3470 	    return false;
3471 
3472 	  if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
3473 	    break;
3474 
3475 	  op0 = gimple_assign_rhs1 (def_stmt);
3476 	  code = gimple_assign_rhs_code (def_stmt);
3477 	  op1 = gimple_assign_rhs2 (def_stmt);
3478 	}
3479       else
3480 	{
3481 	  if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
3482 	    return false;
3483 	  code = TREE_CODE (off);
3484 	  extract_ops_from_tree (off, &code, &op0, &op1);
3485 	}
3486       switch (code)
3487 	{
3488 	case POINTER_PLUS_EXPR:
3489 	case PLUS_EXPR:
3490 	  if (expr_invariant_in_loop_p (loop, op0))
3491 	    {
3492 	      add = op0;
3493 	      off = op1;
3494 	    do_add:
3495 	      add = fold_convert (sizetype, add);
3496 	      if (scale != 1)
3497 		add = size_binop (MULT_EXPR, add, size_int (scale));
3498 	      base = size_binop (PLUS_EXPR, base, add);
3499 	      continue;
3500 	    }
3501 	  if (expr_invariant_in_loop_p (loop, op1))
3502 	    {
3503 	      add = op1;
3504 	      off = op0;
3505 	      goto do_add;
3506 	    }
3507 	  break;
3508 	case MINUS_EXPR:
3509 	  if (expr_invariant_in_loop_p (loop, op1))
3510 	    {
3511 	      add = fold_convert (sizetype, op1);
3512 	      add = size_binop (MINUS_EXPR, size_zero_node, add);
3513 	      off = op0;
3514 	      goto do_add;
3515 	    }
3516 	  break;
3517 	case MULT_EXPR:
3518 	  if (scale == 1 && tree_fits_shwi_p (op1))
3519 	    {
3520 	      scale = tree_to_shwi (op1);
3521 	      off = op0;
3522 	      continue;
3523 	    }
3524 	  break;
3525 	case SSA_NAME:
3526 	  off = op0;
3527 	  continue;
3528 	CASE_CONVERT:
3529 	  if (!POINTER_TYPE_P (TREE_TYPE (op0))
3530 	      && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
3531 	    break;
3532 	  if (TYPE_PRECISION (TREE_TYPE (op0))
3533 	      == TYPE_PRECISION (TREE_TYPE (off)))
3534 	    {
3535 	      off = op0;
3536 	      continue;
3537 	    }
3538 	  if (TYPE_PRECISION (TREE_TYPE (op0))
3539 	      < TYPE_PRECISION (TREE_TYPE (off)))
3540 	    {
3541 	      off = op0;
3542 	      offtype = TREE_TYPE (off);
3543 	      STRIP_NOPS (off);
3544 	      continue;
3545 	    }
3546 	  break;
3547 	default:
3548 	  break;
3549 	}
3550       break;
3551     }
3552 
3553   /* If at the end OFF still isn't an SSA_NAME or isn't
3554      defined in the loop, punt.  */
3555   if (TREE_CODE (off) != SSA_NAME
3556       || expr_invariant_in_loop_p (loop, off))
3557     return false;
3558 
3559   if (offtype == NULL_TREE)
3560     offtype = TREE_TYPE (off);
3561 
3562   if (DR_IS_READ (dr))
3563     decl = targetm.vectorize.builtin_gather (STMT_VINFO_VECTYPE (stmt_info),
3564 					     offtype, scale);
3565   else
3566     decl = targetm.vectorize.builtin_scatter (STMT_VINFO_VECTYPE (stmt_info),
3567 					      offtype, scale);
3568 
3569   if (decl == NULL_TREE)
3570     return false;
3571 
3572   info->decl = decl;
3573   info->base = base;
3574   info->offset = off;
3575   info->offset_dt = vect_unknown_def_type;
3576   info->offset_vectype = NULL_TREE;
3577   info->scale = scale;
3578   return true;
3579 }
3580 
3581 /* Function vect_analyze_data_refs.
3582 
3583   Find all the data references in the loop or basic block.
3584 
3585    The general structure of the analysis of data refs in the vectorizer is as
3586    follows:
3587    1- vect_analyze_data_refs(loop/bb): call
3588       compute_data_dependences_for_loop/bb to find and analyze all data-refs
3589       in the loop/bb and their dependences.
3590    2- vect_analyze_dependences(): apply dependence testing using ddrs.
3591    3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
3592    4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
3593 
3594 */
3595 
3596 bool
3597 vect_analyze_data_refs (vec_info *vinfo, int *min_vf)
3598 {
3599   struct loop *loop = NULL;
3600   unsigned int i;
3601   struct data_reference *dr;
3602   tree scalar_type;
3603 
3604   if (dump_enabled_p ())
3605     dump_printf_loc (MSG_NOTE, vect_location,
3606 		     "=== vect_analyze_data_refs ===\n");
3607 
3608   if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
3609     loop = LOOP_VINFO_LOOP (loop_vinfo);
3610 
3611   /* Go through the data-refs, check that the analysis succeeded.  Update
3612      pointer from stmt_vec_info struct to DR and vectype.  */
3613 
3614   vec<data_reference_p> datarefs = vinfo->datarefs;
3615   FOR_EACH_VEC_ELT (datarefs, i, dr)
3616     {
3617       gimple *stmt;
3618       stmt_vec_info stmt_info;
3619       tree base, offset, init;
3620       enum { SG_NONE, GATHER, SCATTER } gatherscatter = SG_NONE;
3621       bool simd_lane_access = false;
3622       int vf;
3623 
3624 again:
3625       if (!dr || !DR_REF (dr))
3626         {
3627           if (dump_enabled_p ())
3628 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3629 	                     "not vectorized: unhandled data-ref\n");
3630           return false;
3631         }
3632 
3633       stmt = DR_STMT (dr);
3634       stmt_info = vinfo_for_stmt (stmt);
3635 
3636       /* Discard clobbers from the dataref vector.  We will remove
3637          clobber stmts during vectorization.  */
3638       if (gimple_clobber_p (stmt))
3639 	{
3640 	  free_data_ref (dr);
3641 	  if (i == datarefs.length () - 1)
3642 	    {
3643 	      datarefs.pop ();
3644 	      break;
3645 	    }
3646 	  datarefs.ordered_remove (i);
3647 	  dr = datarefs[i];
3648 	  goto again;
3649 	}
3650 
3651       /* Check that analysis of the data-ref succeeded.  */
3652       if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
3653 	  || !DR_STEP (dr))
3654         {
3655 	  bool maybe_gather
3656 	    = DR_IS_READ (dr)
3657 	      && !TREE_THIS_VOLATILE (DR_REF (dr))
3658 	      && targetm.vectorize.builtin_gather != NULL;
3659 	  bool maybe_scatter
3660 	    = DR_IS_WRITE (dr)
3661 	      && !TREE_THIS_VOLATILE (DR_REF (dr))
3662 	      && targetm.vectorize.builtin_scatter != NULL;
3663 	  bool maybe_simd_lane_access
3664 	    = is_a <loop_vec_info> (vinfo) && loop->simduid;
3665 
3666 	  /* If the target supports vector gather loads or scatter stores, or if
3667 	     this might be a SIMD lane access, see if they can be used.  */
3668 	  if (is_a <loop_vec_info> (vinfo)
3669 	      && (maybe_gather || maybe_scatter || maybe_simd_lane_access)
3670 	      && !nested_in_vect_loop_p (loop, stmt))
3671 	    {
3672 	      struct data_reference *newdr
3673 		= create_data_ref (NULL, loop_containing_stmt (stmt),
3674 				   DR_REF (dr), stmt, maybe_scatter ? false : true);
3675 	      gcc_assert (newdr != NULL && DR_REF (newdr));
3676 	      if (DR_BASE_ADDRESS (newdr)
3677 		  && DR_OFFSET (newdr)
3678 		  && DR_INIT (newdr)
3679 		  && DR_STEP (newdr)
3680 		  && integer_zerop (DR_STEP (newdr)))
3681 		{
3682 		  if (maybe_simd_lane_access)
3683 		    {
3684 		      tree off = DR_OFFSET (newdr);
3685 		      STRIP_NOPS (off);
3686 		      if (TREE_CODE (DR_INIT (newdr)) == INTEGER_CST
3687 			  && TREE_CODE (off) == MULT_EXPR
3688 			  && tree_fits_uhwi_p (TREE_OPERAND (off, 1)))
3689 			{
3690 			  tree step = TREE_OPERAND (off, 1);
3691 			  off = TREE_OPERAND (off, 0);
3692 			  STRIP_NOPS (off);
3693 			  if (CONVERT_EXPR_P (off)
3694 			      && TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off,
3695 									  0)))
3696 				 < TYPE_PRECISION (TREE_TYPE (off)))
3697 			    off = TREE_OPERAND (off, 0);
3698 			  if (TREE_CODE (off) == SSA_NAME)
3699 			    {
3700 			      gimple *def = SSA_NAME_DEF_STMT (off);
3701 			      tree reft = TREE_TYPE (DR_REF (newdr));
3702 			      if (is_gimple_call (def)
3703 				  && gimple_call_internal_p (def)
3704 				  && (gimple_call_internal_fn (def)
3705 				      == IFN_GOMP_SIMD_LANE))
3706 				{
3707 				  tree arg = gimple_call_arg (def, 0);
3708 				  gcc_assert (TREE_CODE (arg) == SSA_NAME);
3709 				  arg = SSA_NAME_VAR (arg);
3710 				  if (arg == loop->simduid
3711 				      /* For now.  */
3712 				      && tree_int_cst_equal
3713 					   (TYPE_SIZE_UNIT (reft),
3714 					    step))
3715 				    {
3716 				      DR_OFFSET (newdr) = ssize_int (0);
3717 				      DR_STEP (newdr) = step;
3718 				      DR_ALIGNED_TO (newdr)
3719 					= size_int (BIGGEST_ALIGNMENT);
3720 				      dr = newdr;
3721 				      simd_lane_access = true;
3722 				    }
3723 				}
3724 			    }
3725 			}
3726 		    }
3727 		  if (!simd_lane_access && (maybe_gather || maybe_scatter))
3728 		    {
3729 		      dr = newdr;
3730 		      if (maybe_gather)
3731 			gatherscatter = GATHER;
3732 		      else
3733 			gatherscatter = SCATTER;
3734 		    }
3735 		}
3736 	      if (gatherscatter == SG_NONE && !simd_lane_access)
3737 		free_data_ref (newdr);
3738 	    }
3739 
3740 	  if (gatherscatter == SG_NONE && !simd_lane_access)
3741 	    {
3742 	      if (dump_enabled_p ())
3743 		{
3744 		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3745                                    "not vectorized: data ref analysis "
3746                                    "failed ");
3747 		  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3748 		}
3749 
3750 	      if (is_a <bb_vec_info> (vinfo))
3751 		break;
3752 
3753 	      return false;
3754 	    }
3755         }
3756 
3757       if (TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST)
3758         {
3759           if (dump_enabled_p ())
3760             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3761                              "not vectorized: base addr of dr is a "
3762                              "constant\n");
3763 
3764           if (is_a <bb_vec_info> (vinfo))
3765 	    break;
3766 
3767 	  if (gatherscatter != SG_NONE || simd_lane_access)
3768 	    free_data_ref (dr);
3769 	  return false;
3770         }
3771 
3772       if (TREE_THIS_VOLATILE (DR_REF (dr)))
3773         {
3774           if (dump_enabled_p ())
3775             {
3776               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3777                                "not vectorized: volatile type ");
3778               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3779             }
3780 
3781           if (is_a <bb_vec_info> (vinfo))
3782 	    break;
3783 
3784           return false;
3785         }
3786 
3787       if (stmt_can_throw_internal (stmt))
3788         {
3789           if (dump_enabled_p ())
3790             {
3791               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3792                                "not vectorized: statement can throw an "
3793                                "exception ");
3794               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3795             }
3796 
3797           if (is_a <bb_vec_info> (vinfo))
3798 	    break;
3799 
3800 	  if (gatherscatter != SG_NONE || simd_lane_access)
3801 	    free_data_ref (dr);
3802           return false;
3803         }
3804 
3805       if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
3806 	  && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
3807 	{
3808           if (dump_enabled_p ())
3809             {
3810               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3811                                "not vectorized: statement is bitfield "
3812                                "access ");
3813               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3814             }
3815 
3816           if (is_a <bb_vec_info> (vinfo))
3817 	    break;
3818 
3819 	  if (gatherscatter != SG_NONE || simd_lane_access)
3820 	    free_data_ref (dr);
3821           return false;
3822 	}
3823 
3824       base = unshare_expr (DR_BASE_ADDRESS (dr));
3825       offset = unshare_expr (DR_OFFSET (dr));
3826       init = unshare_expr (DR_INIT (dr));
3827 
3828       if (is_gimple_call (stmt)
3829 	  && (!gimple_call_internal_p (stmt)
3830 	      || (gimple_call_internal_fn (stmt) != IFN_MASK_LOAD
3831 		  && gimple_call_internal_fn (stmt) != IFN_MASK_STORE)))
3832 	{
3833 	  if (dump_enabled_p ())
3834 	    {
3835 	      dump_printf_loc (MSG_MISSED_OPTIMIZATION,  vect_location,
3836 	                       "not vectorized: dr in a call ");
3837 	      dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3838 	    }
3839 
3840 	  if (is_a <bb_vec_info> (vinfo))
3841 	    break;
3842 
3843 	  if (gatherscatter != SG_NONE || simd_lane_access)
3844 	    free_data_ref (dr);
3845 	  return false;
3846 	}
3847 
3848       /* Update DR field in stmt_vec_info struct.  */
3849 
3850       /* If the dataref is in an inner-loop of the loop that is considered
3851 	 for vectorization, we also want to analyze the access relative to
3852 	 the outer-loop (DR contains information only relative to the
3853 	 inner-most enclosing loop).  We do that by building a reference to the
3854 	 first location accessed by the inner-loop, and analyzing it relative to
3855 	 the outer-loop.  */
3856       if (loop && nested_in_vect_loop_p (loop, stmt))
3857 	{
3858 	  tree outer_step, outer_base, outer_init;
3859 	  HOST_WIDE_INT pbitsize, pbitpos;
3860 	  tree poffset;
3861 	  machine_mode pmode;
3862 	  int punsignedp, preversep, pvolatilep;
3863 	  affine_iv base_iv, offset_iv;
3864 	  tree dinit;
3865 
3866 	  /* Build a reference to the first location accessed by the
3867 	     inner-loop: *(BASE+INIT).  (The first location is actually
3868 	     BASE+INIT+OFFSET, but we add OFFSET separately later).  */
3869           tree inner_base = build_fold_indirect_ref
3870                                 (fold_build_pointer_plus (base, init));
3871 
3872 	  if (dump_enabled_p ())
3873 	    {
3874 	      dump_printf_loc (MSG_NOTE, vect_location,
3875                                "analyze in outer-loop: ");
3876 	      dump_generic_expr (MSG_NOTE, TDF_SLIM, inner_base);
3877 	      dump_printf (MSG_NOTE, "\n");
3878 	    }
3879 
3880 	  outer_base = get_inner_reference (inner_base, &pbitsize, &pbitpos,
3881 					    &poffset, &pmode, &punsignedp,
3882 					    &preversep, &pvolatilep);
3883 	  gcc_assert (outer_base != NULL_TREE);
3884 
3885 	  if (pbitpos % BITS_PER_UNIT != 0)
3886 	    {
3887 	      if (dump_enabled_p ())
3888 		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3889                                  "failed: bit offset alignment.\n");
3890 	      return false;
3891 	    }
3892 
3893 	  if (preversep)
3894 	    {
3895 	      if (dump_enabled_p ())
3896 		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3897 				 "failed: reverse storage order.\n");
3898 	      return false;
3899 	    }
3900 
3901 	  outer_base = build_fold_addr_expr (outer_base);
3902 	  if (!simple_iv (loop, loop_containing_stmt (stmt), outer_base,
3903                           &base_iv, false))
3904 	    {
3905 	      if (dump_enabled_p ())
3906 		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3907                                  "failed: evolution of base is not affine.\n");
3908 	      return false;
3909 	    }
3910 
3911 	  if (offset)
3912 	    {
3913 	      if (poffset)
3914 		poffset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset), offset,
3915                                        poffset);
3916 	      else
3917 		poffset = offset;
3918 	    }
3919 
3920 	  if (!poffset)
3921 	    {
3922 	      offset_iv.base = ssize_int (0);
3923 	      offset_iv.step = ssize_int (0);
3924 	    }
3925 	  else if (!simple_iv (loop, loop_containing_stmt (stmt), poffset,
3926                                &offset_iv, false))
3927 	    {
3928 	      if (dump_enabled_p ())
3929 	        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3930                                  "evolution of offset is not affine.\n");
3931 	      return false;
3932 	    }
3933 
3934 	  outer_init = ssize_int (pbitpos / BITS_PER_UNIT);
3935 	  split_constant_offset (base_iv.base, &base_iv.base, &dinit);
3936 	  outer_init =  size_binop (PLUS_EXPR, outer_init, dinit);
3937 	  split_constant_offset (offset_iv.base, &offset_iv.base, &dinit);
3938 	  outer_init =  size_binop (PLUS_EXPR, outer_init, dinit);
3939 
3940 	  outer_step = size_binop (PLUS_EXPR,
3941 				fold_convert (ssizetype, base_iv.step),
3942 				fold_convert (ssizetype, offset_iv.step));
3943 
3944 	  STMT_VINFO_DR_STEP (stmt_info) = outer_step;
3945 	  /* FIXME: Use canonicalize_base_object_address (base_iv.base); */
3946 	  STMT_VINFO_DR_BASE_ADDRESS (stmt_info) = base_iv.base;
3947 	  STMT_VINFO_DR_INIT (stmt_info) = outer_init;
3948 	  STMT_VINFO_DR_OFFSET (stmt_info) =
3949 				fold_convert (ssizetype, offset_iv.base);
3950 	  STMT_VINFO_DR_ALIGNED_TO (stmt_info) =
3951 				size_int (highest_pow2_factor (offset_iv.base));
3952 
3953           if (dump_enabled_p ())
3954 	    {
3955 	      dump_printf_loc (MSG_NOTE, vect_location,
3956                                "\touter base_address: ");
3957 	      dump_generic_expr (MSG_NOTE, TDF_SLIM,
3958                                  STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
3959 	      dump_printf (MSG_NOTE, "\n\touter offset from base address: ");
3960 	      dump_generic_expr (MSG_NOTE, TDF_SLIM,
3961                                  STMT_VINFO_DR_OFFSET (stmt_info));
3962 	      dump_printf (MSG_NOTE,
3963                            "\n\touter constant offset from base address: ");
3964 	      dump_generic_expr (MSG_NOTE, TDF_SLIM,
3965                                  STMT_VINFO_DR_INIT (stmt_info));
3966 	      dump_printf (MSG_NOTE, "\n\touter step: ");
3967 	      dump_generic_expr (MSG_NOTE, TDF_SLIM,
3968                                  STMT_VINFO_DR_STEP (stmt_info));
3969 	      dump_printf (MSG_NOTE, "\n\touter aligned to: ");
3970 	      dump_generic_expr (MSG_NOTE, TDF_SLIM,
3971                                  STMT_VINFO_DR_ALIGNED_TO (stmt_info));
3972 	      dump_printf (MSG_NOTE, "\n");
3973 	    }
3974 	}
3975 
3976       if (STMT_VINFO_DATA_REF (stmt_info))
3977         {
3978           if (dump_enabled_p ())
3979             {
3980               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3981                                "not vectorized: more than one data ref "
3982                                "in stmt: ");
3983               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3984             }
3985 
3986           if (is_a <bb_vec_info> (vinfo))
3987 	    break;
3988 
3989 	  if (gatherscatter != SG_NONE || simd_lane_access)
3990 	    free_data_ref (dr);
3991           return false;
3992         }
3993 
3994       STMT_VINFO_DATA_REF (stmt_info) = dr;
3995       if (simd_lane_access)
3996 	{
3997 	  STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) = true;
3998 	  free_data_ref (datarefs[i]);
3999 	  datarefs[i] = dr;
4000 	}
4001 
4002       if (TREE_CODE (DR_BASE_ADDRESS (dr)) == ADDR_EXPR
4003 	  && VAR_P (TREE_OPERAND (DR_BASE_ADDRESS (dr), 0))
4004 	  && DECL_NONALIASED (TREE_OPERAND (DR_BASE_ADDRESS (dr), 0)))
4005 	{
4006           if (dump_enabled_p ())
4007             {
4008               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4009                                "not vectorized: base object not addressable "
4010 			       "for stmt: ");
4011               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4012             }
4013           if (is_a <bb_vec_info> (vinfo))
4014 	    {
4015 	      /* In BB vectorization the ref can still participate
4016 	         in dependence analysis, we just can't vectorize it.  */
4017 	      STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4018 	      continue;
4019 	    }
4020 	  return false;
4021 	}
4022 
4023       /* Set vectype for STMT.  */
4024       scalar_type = TREE_TYPE (DR_REF (dr));
4025       STMT_VINFO_VECTYPE (stmt_info)
4026 	= get_vectype_for_scalar_type (scalar_type);
4027       if (!STMT_VINFO_VECTYPE (stmt_info))
4028         {
4029           if (dump_enabled_p ())
4030             {
4031               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4032                                "not vectorized: no vectype for stmt: ");
4033               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4034               dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: ");
4035               dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS,
4036                                  scalar_type);
4037               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
4038             }
4039 
4040           if (is_a <bb_vec_info> (vinfo))
4041 	    {
4042 	      /* No vector type is fine, the ref can still participate
4043 	         in dependence analysis, we just can't vectorize it.  */
4044 	      STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4045 	      continue;
4046 	    }
4047 
4048 	  if (gatherscatter != SG_NONE || simd_lane_access)
4049 	    {
4050 	      STMT_VINFO_DATA_REF (stmt_info) = NULL;
4051 	      if (gatherscatter != SG_NONE)
4052 		free_data_ref (dr);
4053 	    }
4054 	  return false;
4055         }
4056       else
4057 	{
4058 	  if (dump_enabled_p ())
4059 	    {
4060 	      dump_printf_loc (MSG_NOTE, vect_location,
4061 			       "got vectype for stmt: ");
4062 	      dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
4063 	      dump_generic_expr (MSG_NOTE, TDF_SLIM,
4064 				 STMT_VINFO_VECTYPE (stmt_info));
4065 	      dump_printf (MSG_NOTE, "\n");
4066 	    }
4067 	}
4068 
4069       /* Adjust the minimal vectorization factor according to the
4070 	 vector type.  */
4071       vf = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
4072       if (vf > *min_vf)
4073 	*min_vf = vf;
4074 
4075       if (gatherscatter != SG_NONE)
4076 	{
4077 	  gather_scatter_info gs_info;
4078 	  if (!vect_check_gather_scatter (stmt, as_a <loop_vec_info> (vinfo),
4079 					  &gs_info)
4080 	      || !get_vectype_for_scalar_type (TREE_TYPE (gs_info.offset)))
4081 	    {
4082 	      STMT_VINFO_DATA_REF (stmt_info) = NULL;
4083 	      free_data_ref (dr);
4084 	      if (dump_enabled_p ())
4085 		{
4086 		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4087 				   (gatherscatter == GATHER) ?
4088 				   "not vectorized: not suitable for gather "
4089 				   "load " :
4090 				   "not vectorized: not suitable for scatter "
4091 				   "store ");
4092 		  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4093 		}
4094 	      return false;
4095 	    }
4096 
4097 	  free_data_ref (datarefs[i]);
4098 	  datarefs[i] = dr;
4099 	  STMT_VINFO_GATHER_SCATTER_P (stmt_info) = gatherscatter;
4100 	}
4101 
4102       else if (is_a <loop_vec_info> (vinfo)
4103 	       && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
4104 	{
4105 	  if (nested_in_vect_loop_p (loop, stmt))
4106 	    {
4107 	      if (dump_enabled_p ())
4108 		{
4109 		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4110                                    "not vectorized: not suitable for strided "
4111                                    "load ");
4112 		  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4113 		}
4114 	      return false;
4115 	    }
4116 	  STMT_VINFO_STRIDED_P (stmt_info) = true;
4117 	}
4118     }
4119 
4120   /* If we stopped analysis at the first dataref we could not analyze
4121      when trying to vectorize a basic-block, mark the rest of the datarefs
4122      as not vectorizable and truncate the vector of datarefs.  That
4123      avoids spending useless time analyzing their dependences.  */
4124   if (i != datarefs.length ())
4125     {
4126       gcc_assert (is_a <bb_vec_info> (vinfo));
4127       for (unsigned j = i; j < datarefs.length (); ++j)
4128 	{
4129 	  data_reference_p dr = datarefs[j];
4130           STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
4131 	  free_data_ref (dr);
4132 	}
4133       datarefs.truncate (i);
4134     }
4135 
4136   return true;
4137 }
4138 
4139 
4140 /* Function vect_get_new_vect_var.
4141 
4142    Returns a name for a new variable.  The current naming scheme uses the
4143    prefix "vect", "stmp", "mask" or "vectp" (depending on the value of
4144    VAR_KIND) for vectorizer generated variables, and appends NAME to that
4145    prefix if provided.  */
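
/* For example, assuming a pointer type PTR_TYPE and a data reference whose
   base is named "in", vect_get_new_vect_var (PTR_TYPE, vect_pointer_var, "in")
   creates a temporary whose name in dumps starts with "vectp_in".  */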
4146 
4147 tree
4148 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
4149 {
4150   const char *prefix;
4151   tree new_vect_var;
4152 
4153   switch (var_kind)
4154   {
4155   case vect_simple_var:
4156     prefix = "vect";
4157     break;
4158   case vect_scalar_var:
4159     prefix = "stmp";
4160     break;
4161   case vect_mask_var:
4162     prefix = "mask";
4163     break;
4164   case vect_pointer_var:
4165     prefix = "vectp";
4166     break;
4167   default:
4168     gcc_unreachable ();
4169   }
4170 
4171   if (name)
4172     {
4173       char* tmp = concat (prefix, "_", name, NULL);
4174       new_vect_var = create_tmp_reg (type, tmp);
4175       free (tmp);
4176     }
4177   else
4178     new_vect_var = create_tmp_reg (type, prefix);
4179 
4180   return new_vect_var;
4181 }
4182 
4183 /* Like vect_get_new_vect_var but return an SSA name.  */
4184 
4185 tree
4186 vect_get_new_ssa_name (tree type, enum vect_var_kind var_kind, const char *name)
4187 {
4188   const char *prefix;
4189   tree new_vect_var;
4190 
4191   switch (var_kind)
4192   {
4193   case vect_simple_var:
4194     prefix = "vect";
4195     break;
4196   case vect_scalar_var:
4197     prefix = "stmp";
4198     break;
4199   case vect_pointer_var:
4200     prefix = "vectp";
4201     break;
4202   default:
4203     gcc_unreachable ();
4204   }
4205 
4206   if (name)
4207     {
4208       char* tmp = concat (prefix, "_", name, NULL);
4209       new_vect_var = make_temp_ssa_name (type, NULL, tmp);
4210       free (tmp);
4211     }
4212   else
4213     new_vect_var = make_temp_ssa_name (type, NULL, prefix);
4214 
4215   return new_vect_var;
4216 }
4217 
4218 /* Duplicate ptr info and set alignment/misalignment on NAME from DR.  */
4219 
4220 static void
4221 vect_duplicate_ssa_name_ptr_info (tree name, data_reference *dr,
4222 				  stmt_vec_info stmt_info)
4223 {
4224   duplicate_ssa_name_ptr_info (name, DR_PTR_INFO (dr));
4225   unsigned int align = TYPE_ALIGN_UNIT (STMT_VINFO_VECTYPE (stmt_info));
4226   int misalign = DR_MISALIGNMENT (dr);
4227   if (misalign == -1)
4228     mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name));
4229   else
4230     set_ptr_info_alignment (SSA_NAME_PTR_INFO (name), align, misalign);
4231 }
4232 
4233 /* Function vect_create_addr_base_for_vector_ref.
4234 
4235    Create an expression that computes the address of the first memory location
4236    that will be accessed for a data reference.
4237 
4238    Input:
4239    STMT: The statement containing the data reference.
4240    NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
4241    OFFSET: Optional.  If supplied, it is added to the initial address.
4242    LOOP:    Specify relative to which loop-nest the address should be computed.
4243             For example, when the dataref is in an inner-loop nested in an
4244 	    outer-loop that is now being vectorized, LOOP can be either the
4245 	    outer-loop, or the inner-loop.  The first memory location accessed
4246 	    by the following dataref ('in' points to short):
4247 
4248 		for (i=0; i<N; i++)
4249 		   for (j=0; j<M; j++)
4250 		     s += in[i+j]
4251 
4252 	    is as follows:
4253 	    if LOOP=i_loop:	&in		(relative to i_loop)
4254 	    if LOOP=j_loop: 	&in+i*2B	(relative to j_loop)
4255    BYTE_OFFSET: Optional, defaulted to NULL.  If supplied, it is added to the
4256 	    initial address.  Unlike OFFSET, which is the number of elements to
4257 	    be added, BYTE_OFFSET is measured in bytes.
4258 
4259    Output:
4260    1. Return an SSA_NAME whose value is the address of the memory location of
4261       the first vector of the data reference.
4262    2. If new_stmt_list is not NULL_TREE after return then the caller must insert
4263       these statement(s) which define the returned SSA_NAME.
4264 
4265    FORNOW: We are only handling array accesses with step 1.  */
4266 
4267 tree
4268 vect_create_addr_base_for_vector_ref (gimple *stmt,
4269 				      gimple_seq *new_stmt_list,
4270 				      tree offset,
4271 				      struct loop *loop,
4272 				      tree byte_offset)
4273 {
4274   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4275   struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4276   tree data_ref_base;
4277   const char *base_name;
4278   tree addr_base;
4279   tree dest;
4280   gimple_seq seq = NULL;
4281   tree base_offset;
4282   tree init;
4283   tree vect_ptr_type;
4284   tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
4285   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4286 
4287   if (loop_vinfo && loop && loop != (gimple_bb (stmt))->loop_father)
4288     {
4289       struct loop *outer_loop = LOOP_VINFO_LOOP (loop_vinfo);
4290 
4291       gcc_assert (nested_in_vect_loop_p (outer_loop, stmt));
4292 
4293       data_ref_base = unshare_expr (STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
4294       base_offset = unshare_expr (STMT_VINFO_DR_OFFSET (stmt_info));
4295       init = unshare_expr (STMT_VINFO_DR_INIT (stmt_info));
4296     }
4297   else
4298     {
4299       data_ref_base = unshare_expr (DR_BASE_ADDRESS (dr));
4300       base_offset = unshare_expr (DR_OFFSET (dr));
4301       init = unshare_expr (DR_INIT (dr));
4302     }
4303 
4304   if (loop_vinfo)
4305     base_name = get_name (data_ref_base);
4306   else
4307     {
4308       base_offset = ssize_int (0);
4309       init = ssize_int (0);
4310       base_name = get_name (DR_REF (dr));
4311     }
4312 
4313   /* Create base_offset */
4314   base_offset = size_binop (PLUS_EXPR,
4315 			    fold_convert (sizetype, base_offset),
4316 			    fold_convert (sizetype, init));
4317 
4318   if (offset)
4319     {
4320       offset = fold_build2 (MULT_EXPR, sizetype,
4321 			    fold_convert (sizetype, offset), step);
4322       base_offset = fold_build2 (PLUS_EXPR, sizetype,
4323 				 base_offset, offset);
4324     }
4325   if (byte_offset)
4326     {
4327       byte_offset = fold_convert (sizetype, byte_offset);
4328       base_offset = fold_build2 (PLUS_EXPR, sizetype,
4329 				 base_offset, byte_offset);
4330     }
4331 
4332   /* base + base_offset */
4333   if (loop_vinfo)
4334     addr_base = fold_build_pointer_plus (data_ref_base, base_offset);
4335   else
4336     {
4337       addr_base = build1 (ADDR_EXPR,
4338 			  build_pointer_type (TREE_TYPE (DR_REF (dr))),
4339 			  unshare_expr (DR_REF (dr)));
4340     }
4341 
4342   vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));
4343   dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name);
4344   addr_base = force_gimple_operand (addr_base, &seq, true, dest);
4345   gimple_seq_add_seq (new_stmt_list, seq);
4346 
4347   if (DR_PTR_INFO (dr)
4348       && TREE_CODE (addr_base) == SSA_NAME
4349       && !SSA_NAME_PTR_INFO (addr_base))
4350     {
4351       vect_duplicate_ssa_name_ptr_info (addr_base, dr, stmt_info);
4352       if (offset || byte_offset)
4353 	mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr_base));
4354     }
4355 
4356   if (dump_enabled_p ())
4357     {
4358       dump_printf_loc (MSG_NOTE, vect_location, "created ");
4359       dump_generic_expr (MSG_NOTE, TDF_SLIM, addr_base);
4360       dump_printf (MSG_NOTE, "\n");
4361     }
4362 
4363   return addr_base;
4364 }
4365 
4366 
4367 /* Function vect_create_data_ref_ptr.
4368 
4369    Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
4370    location accessed in the loop by STMT, along with the def-use update
4371    chain to appropriately advance the pointer through the loop iterations.
4372    Also set aliasing information for the pointer.  This pointer is used by
4373    the callers to this function to create a memory reference expression for
4374    vector load/store access.
4375 
4376    Input:
4377    1. STMT: a stmt that references memory. Expected to be of the form
4378          GIMPLE_ASSIGN <name, data-ref> or
4379 	 GIMPLE_ASSIGN <data-ref, name>.
4380    2. AGGR_TYPE: the type of the reference, which should be either a vector
4381         or an array.
4382    3. AT_LOOP: the loop where the vector memref is to be created.
4383    4. OFFSET (optional): an offset to be added to the initial address accessed
4384         by the data-ref in STMT.
4385    5. BSI: location where the new stmts are to be placed if there is no loop
4386    6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
4387         pointing to the initial address.
4388    7. BYTE_OFFSET (optional, defaults to NULL): a byte offset to be added
4389 	to the initial address accessed by the data-ref in STMT.  This is
4390 	similar to OFFSET, but OFFSET is counted in elements, while BYTE_OFFSET
4391 	is counted in bytes.
4392 
4393    Output:
4394    1. Declare a new ptr to vector_type, and have it point to the base of the
4395       data reference (initial addressed accessed by the data reference).
4396       data reference (the initial address accessed by the data reference).
4397 
4398       v8hi *ap;
4399       ap = (v8hi *)initial_address;
4400 
4401       if OFFSET is not supplied:
4402          initial_address = &a[init];
4403       if OFFSET is supplied:
4404          initial_address = &a[init + OFFSET];
4405       if BYTE_OFFSET is supplied:
4406 	 initial_address = &a[init] + BYTE_OFFSET;
4407 
4408       Return the initial_address in INITIAL_ADDRESS.
4409 
4410    2. If ONLY_INIT is true, just return the initial pointer.  Otherwise, also
4411       update the pointer in each iteration of the loop.
4412 
4413       Return the increment stmt that updates the pointer in PTR_INCR.
4414 
4415    3. Set INV_P to true if the access pattern of the data reference in the
4416       vectorized loop is invariant.  Set it to false otherwise.
4417 
4418    4. Return the pointer.  */
4419 
4420 tree
4421 vect_create_data_ref_ptr (gimple *stmt, tree aggr_type, struct loop *at_loop,
4422 			  tree offset, tree *initial_address,
4423 			  gimple_stmt_iterator *gsi, gimple **ptr_incr,
4424 			  bool only_init, bool *inv_p, tree byte_offset)
4425 {
4426   const char *base_name;
4427   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4428   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4429   struct loop *loop = NULL;
4430   bool nested_in_vect_loop = false;
4431   struct loop *containing_loop = NULL;
4432   tree aggr_ptr_type;
4433   tree aggr_ptr;
4434   tree new_temp;
4435   gimple_seq new_stmt_list = NULL;
4436   edge pe = NULL;
4437   basic_block new_bb;
4438   tree aggr_ptr_init;
4439   struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4440   tree aptr;
4441   gimple_stmt_iterator incr_gsi;
4442   bool insert_after;
4443   tree indx_before_incr, indx_after_incr;
4444   gimple *incr;
4445   tree step;
4446   bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
4447 
4448   gcc_assert (TREE_CODE (aggr_type) == ARRAY_TYPE
4449 	      || TREE_CODE (aggr_type) == VECTOR_TYPE);
4450 
4451   if (loop_vinfo)
4452     {
4453       loop = LOOP_VINFO_LOOP (loop_vinfo);
4454       nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
4455       containing_loop = (gimple_bb (stmt))->loop_father;
4456       pe = loop_preheader_edge (loop);
4457     }
4458   else
4459     {
4460       gcc_assert (bb_vinfo);
4461       only_init = true;
4462       *ptr_incr = NULL;
4463     }
4464 
4465   /* Check the step (evolution) of the load in LOOP, and record
4466      whether it's invariant.  */
4467   if (nested_in_vect_loop)
4468     step = STMT_VINFO_DR_STEP (stmt_info);
4469   else
4470     step = DR_STEP (STMT_VINFO_DATA_REF (stmt_info));
4471 
4472   if (integer_zerop (step))
4473     *inv_p = true;
4474   else
4475     *inv_p = false;
4476 
4477   /* Create an expression for the first address accessed by this load
4478      in LOOP.  */
4479   base_name = get_name (DR_BASE_ADDRESS (dr));
4480 
4481   if (dump_enabled_p ())
4482     {
4483       tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
4484       dump_printf_loc (MSG_NOTE, vect_location,
4485                        "create %s-pointer variable to type: ",
4486 		       get_tree_code_name (TREE_CODE (aggr_type)));
4487       dump_generic_expr (MSG_NOTE, TDF_SLIM, aggr_type);
4488       if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
4489         dump_printf (MSG_NOTE, "  vectorizing an array ref: ");
4490       else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)
4491         dump_printf (MSG_NOTE, "  vectorizing a vector ref: ");
4492       else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
4493         dump_printf (MSG_NOTE, "  vectorizing a record based array ref: ");
4494       else
4495         dump_printf (MSG_NOTE, "  vectorizing a pointer ref: ");
4496       dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_BASE_OBJECT (dr));
4497       dump_printf (MSG_NOTE, "\n");
4498     }
4499 
4500   /* (1) Create the new aggregate-pointer variable.
4501      Vector and array types inherit the alias set of their component
4502      type by default, so we need to use a ref-all pointer if the data
4503      reference does not conflict with the created aggregate data
4504      reference because it is not addressable.  */
4505   bool need_ref_all = false;
4506   if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4507 			      get_alias_set (DR_REF (dr))))
4508     need_ref_all = true;
4509   /* Likewise for any of the data references in the stmt group.  */
4510   else if (STMT_VINFO_GROUP_SIZE (stmt_info) > 1)
4511     {
4512       gimple *orig_stmt = STMT_VINFO_GROUP_FIRST_ELEMENT (stmt_info);
4513       do
4514 	{
4515 	  stmt_vec_info sinfo = vinfo_for_stmt (orig_stmt);
4516 	  struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo);
4517 	  if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4518 				      get_alias_set (DR_REF (sdr))))
4519 	    {
4520 	      need_ref_all = true;
4521 	      break;
4522 	    }
4523 	  orig_stmt = STMT_VINFO_GROUP_NEXT_ELEMENT (sinfo);
4524 	}
4525       while (orig_stmt);
4526     }
4527   aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode,
4528 					       need_ref_all);
4529   aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name);
4530 
4531 
4532   /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
4533      vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
4534      def-use update cycles for the pointer: one relative to the outer-loop
4535      (LOOP), which is what steps (2) and (3) below do.  The other is relative
4536      to the inner-loop (which is the inner-most loop containing the dataref),
4537      and this is done by step (4) below.
4538 
4539      When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
4540      inner-most loop, and so steps (2),(3) work the same, and step (4) is
4541      redundant.  Steps (2),(3) create the following:
4542 
4543 	vp0 = &base_addr;
4544 	LOOP:	vp1 = phi(vp0,vp2)
4545 		...
4546 		...
4547 		vp2 = vp1 + step
4548 		goto LOOP
4549 
4550      If there is an inner-loop nested in LOOP, then step (4) will also be
4551      applied, and an additional update in the inner-loop will be created:
4552 
4553 	vp0 = &base_addr;
4554 	LOOP:   vp1 = phi(vp0,vp2)
4555 		...
4556         inner:     vp3 = phi(vp1,vp4)
4557 	           vp4 = vp3 + inner_step
4558 	           if () goto inner
4559 		...
4560 		vp2 = vp1 + step
4561 		if () goto LOOP   */
4562 
4563   /* (2) Calculate the initial address of the aggregate-pointer, and set
4564      the aggregate-pointer to point to it before the loop.  */
4565 
4566   /* Create: &(base[init_val+offset]) + byte_offset in the loop preheader.  */
4567 
4568   new_temp = vect_create_addr_base_for_vector_ref (stmt, &new_stmt_list,
4569 						   offset, loop, byte_offset);
4570   if (new_stmt_list)
4571     {
4572       if (pe)
4573         {
4574           new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
4575           gcc_assert (!new_bb);
4576         }
4577       else
4578         gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);
4579     }
4580 
4581   *initial_address = new_temp;
4582   aggr_ptr_init = new_temp;
4583 
4584   /* (3) Handle the updating of the aggregate-pointer inside the loop.
4585      This is needed when ONLY_INIT is false, and also when AT_LOOP is the
4586      inner-loop nested in LOOP (during outer-loop vectorization).  */
4587 
4588   /* No update in loop is required.  */
4589   if (only_init && (!loop_vinfo || at_loop == loop))
4590     aptr = aggr_ptr_init;
4591   else
4592     {
4593       /* The step of the aggregate pointer is the type size.  */
4594       tree iv_step = TYPE_SIZE_UNIT (aggr_type);
4595       /* One exception to the above is when the scalar step of the load in
4596 	 LOOP is zero. In this case the step here is also zero.  */
4597       if (*inv_p)
4598 	iv_step = size_zero_node;
4599       else if (tree_int_cst_sgn (step) == -1)
4600 	iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
4601 
4602       standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4603 
4604       create_iv (aggr_ptr_init,
4605 		 fold_convert (aggr_ptr_type, iv_step),
4606 		 aggr_ptr, loop, &incr_gsi, insert_after,
4607 		 &indx_before_incr, &indx_after_incr);
4608       incr = gsi_stmt (incr_gsi);
4609       set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo));
4610 
4611       /* Copy the points-to information if it exists. */
4612       if (DR_PTR_INFO (dr))
4613 	{
4614 	  vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr, stmt_info);
4615 	  vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr, stmt_info);
4616 	}
4617       if (ptr_incr)
4618 	*ptr_incr = incr;
4619 
4620       aptr = indx_before_incr;
4621     }
4622 
4623   if (!nested_in_vect_loop || only_init)
4624     return aptr;
4625 
4626 
4627   /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
4628      nested in LOOP, if it exists.  */
4629 
4630   gcc_assert (nested_in_vect_loop);
4631   if (!only_init)
4632     {
4633       standard_iv_increment_position (containing_loop, &incr_gsi,
4634 				      &insert_after);
4635       create_iv (aptr, fold_convert (aggr_ptr_type, DR_STEP (dr)), aggr_ptr,
4636 		 containing_loop, &incr_gsi, insert_after, &indx_before_incr,
4637 		 &indx_after_incr);
4638       incr = gsi_stmt (incr_gsi);
4639       set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo));
4640 
4641       /* Copy the points-to information if it exists. */
4642       if (DR_PTR_INFO (dr))
4643 	{
4644 	  vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr, stmt_info);
4645 	  vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr, stmt_info);
4646 	}
4647       if (ptr_incr)
4648 	*ptr_incr = incr;
4649 
4650       return indx_before_incr;
4651     }
4652   else
4653     gcc_unreachable ();
4654 }
4655 
4656 
4657 /* Function bump_vector_ptr
4658 
4659    Increment a pointer (to a vector type) by vector-size. If requested,
4660    i.e. if PTR_INCR is given, then also connect the new increment stmt
4661    to the existing def-use update-chain of the pointer, by modifying
4662    the PTR_INCR as illustrated below:
4663 
4664    The pointer def-use update-chain before this function:
4665                         DATAREF_PTR = phi (p_0, p_2)
4666                         ....
4667         PTR_INCR:       p_2 = DATAREF_PTR + step
4668 
4669    The pointer def-use update-chain after this function:
4670                         DATAREF_PTR = phi (p_0, p_2)
4671                         ....
4672                         NEW_DATAREF_PTR = DATAREF_PTR + BUMP
4673                         ....
4674         PTR_INCR:       p_2 = NEW_DATAREF_PTR + step
4675 
4676    Input:
4677    DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
4678                  in the loop.
4679    PTR_INCR - optional. The stmt that updates the pointer in each iteration of
4680 	      the loop.  The increment amount across iterations is expected
4681 	      to be vector_size.
4682    BSI - location where the new update stmt is to be placed.
4683    STMT - the original scalar memory-access stmt that is being vectorized.
4684    BUMP - optional. The offset by which to bump the pointer. If not given,
4685 	  the offset is assumed to be vector_size.
4686 
4687    Output: Return NEW_DATAREF_PTR as illustrated above.
4688 
4689 */
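
/* For illustration, a hypothetical call that bumps by the default
   amount (one vector).  Passing NULL_TREE for BUMP makes the function
   fall back to TYPE_SIZE_UNIT of the stmt's vectype; the variable
   names are placeholders:

     tree new_ptr = bump_vector_ptr (dataref_ptr, ptr_incr, gsi,
                                     stmt, NULL_TREE);  */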
4690 
4691 tree
4692 bump_vector_ptr (tree dataref_ptr, gimple *ptr_incr, gimple_stmt_iterator *gsi,
4693 		 gimple *stmt, tree bump)
4694 {
4695   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4696   struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4697   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4698   tree update = TYPE_SIZE_UNIT (vectype);
4699   gassign *incr_stmt;
4700   ssa_op_iter iter;
4701   use_operand_p use_p;
4702   tree new_dataref_ptr;
4703 
4704   if (bump)
4705     update = bump;
4706 
4707   if (TREE_CODE (dataref_ptr) == SSA_NAME)
4708     new_dataref_ptr = copy_ssa_name (dataref_ptr);
4709   else
4710     new_dataref_ptr = make_ssa_name (TREE_TYPE (dataref_ptr));
4711   incr_stmt = gimple_build_assign (new_dataref_ptr, POINTER_PLUS_EXPR,
4712 				   dataref_ptr, update);
4713   vect_finish_stmt_generation (stmt, incr_stmt, gsi);
4714 
4715   /* Copy the points-to information if it exists. */
4716   if (DR_PTR_INFO (dr))
4717     {
4718       duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
4719       mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
4720     }
4721 
4722   if (!ptr_incr)
4723     return new_dataref_ptr;
4724 
4725   /* Update the vector-pointer's cross-iteration increment.  */
4726   FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
4727     {
4728       tree use = USE_FROM_PTR (use_p);
4729 
4730       if (use == dataref_ptr)
4731         SET_USE (use_p, new_dataref_ptr);
4732       else
4733         gcc_assert (tree_int_cst_compare (use, update) == 0);
4734     }
4735 
4736   return new_dataref_ptr;
4737 }
4738 
4739 
4740 /* Function vect_create_destination_var.
4741 
4742    Create a new temporary of type VECTYPE.  */
4743 
4744 tree
4745 vect_create_destination_var (tree scalar_dest, tree vectype)
4746 {
4747   tree vec_dest;
4748   const char *name;
4749   char *new_name;
4750   tree type;
4751   enum vect_var_kind kind;
4752 
4753   kind = vectype
4754     ? VECTOR_BOOLEAN_TYPE_P (vectype)
4755     ? vect_mask_var
4756     : vect_simple_var
4757     : vect_scalar_var;
4758   type = vectype ? vectype : TREE_TYPE (scalar_dest);
4759 
4760   gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
4761 
4762   name = get_name (scalar_dest);
4763   if (name)
4764     new_name = xasprintf ("%s_%u", name, SSA_NAME_VERSION (scalar_dest));
4765   else
4766     new_name = xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest));
4767   vec_dest = vect_get_new_vect_var (type, kind, new_name);
4768   free (new_name);
4769 
4770   return vec_dest;
4771 }
4772 
4773 /* Function vect_grouped_store_supported.
4774 
4775    Returns TRUE if interleave high and interleave low permutations
4776    are supported, and FALSE otherwise.  */
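
/* For illustration, the shape of a typical (hypothetical) caller-side
   check before committing to an interleaved store of GROUP_SIZE
   vectors -- prefer store-lanes, fall back to the permutations tested
   here; the surrounding names are not taken from this file:

     if (!vect_store_lanes_supported (vectype, group_size)
         && !vect_grouped_store_supported (vectype, group_size))
       return false;   // interleaving not possible on this target
   */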
4777 
4778 bool
4779 vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
4780 {
4781   machine_mode mode = TYPE_MODE (vectype);
4782 
4783   /* vect_permute_store_chain requires the group size to be equal to 3 or
4784      be a power of two.  */
4785   if (count != 3 && exact_log2 (count) == -1)
4786     {
4787       if (dump_enabled_p ())
4788 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4789 			 "the size of the group of accesses"
4790 			 " is not a power of 2 or not equal to 3\n");
4791       return false;
4792     }
4793 
4794   /* Check that the permutation is supported.  */
4795   if (VECTOR_MODE_P (mode))
4796     {
4797       unsigned int i, nelt = GET_MODE_NUNITS (mode);
4798       unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
4799 
4800       if (count == 3)
4801 	{
4802 	  unsigned int j0 = 0, j1 = 0, j2 = 0;
4803 	  unsigned int i, j;
4804 
4805 	  for (j = 0; j < 3; j++)
4806 	    {
4807 	      int nelt0 = ((3 - j) * nelt) % 3;
4808 	      int nelt1 = ((3 - j) * nelt + 1) % 3;
4809 	      int nelt2 = ((3 - j) * nelt + 2) % 3;
4810 	      for (i = 0; i < nelt; i++)
4811 		{
4812 		  if (3 * i + nelt0 < nelt)
4813 		    sel[3 * i + nelt0] = j0++;
4814 		  if (3 * i + nelt1 < nelt)
4815 		    sel[3 * i + nelt1] = nelt + j1++;
4816 		  if (3 * i + nelt2 < nelt)
4817 		    sel[3 * i + nelt2] = 0;
4818 		}
4819 	      if (!can_vec_perm_p (mode, false, sel))
4820 		{
4821 		  if (dump_enabled_p ())
4822 		    dump_printf (MSG_MISSED_OPTIMIZATION,
4823 				 "permutation op not supported by target.\n");
4824 		  return false;
4825 		}
4826 
4827 	      for (i = 0; i < nelt; i++)
4828 		{
4829 		  if (3 * i + nelt0 < nelt)
4830 		    sel[3 * i + nelt0] = 3 * i + nelt0;
4831 		  if (3 * i + nelt1 < nelt)
4832 		    sel[3 * i + nelt1] = 3 * i + nelt1;
4833 		  if (3 * i + nelt2 < nelt)
4834 		    sel[3 * i + nelt2] = nelt + j2++;
4835 		}
4836 	      if (!can_vec_perm_p (mode, false, sel))
4837 		{
4838 		  if (dump_enabled_p ())
4839 		    dump_printf (MSG_MISSED_OPTIMIZATION,
4840 				 "permutation op not supported by target.\n");
4841 		  return false;
4842 		}
4843 	    }
4844 	  return true;
4845 	}
4846       else
4847 	{
4848 	  /* If length is not equal to 3 then only power of 2 is supported.  */
4849 	  gcc_assert (pow2p_hwi (count));
4850 
4851 	  for (i = 0; i < nelt / 2; i++)
4852 	    {
4853 	      sel[i * 2] = i;
4854 	      sel[i * 2 + 1] = i + nelt;
4855 	    }
4856 	    if (can_vec_perm_p (mode, false, sel))
4857 	      {
4858 		for (i = 0; i < nelt; i++)
4859 		  sel[i] += nelt / 2;
4860 		if (can_vec_perm_p (mode, false, sel))
4861 		  return true;
4862 	      }
4863 	}
4864     }
4865 
4866   if (dump_enabled_p ())
4867     dump_printf (MSG_MISSED_OPTIMIZATION,
4868 		 "permutation op not supported by target.\n");
4869   return false;
4870 }
4871 
4872 
4873 /* Return TRUE if vec_store_lanes is available for COUNT vectors of
4874    type VECTYPE.  */
4875 
4876 bool
4877 vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
4878 {
4879   return vect_lanes_optab_supported_p ("vec_store_lanes",
4880 				       vec_store_lanes_optab,
4881 				       vectype, count);
4882 }
4883 
4884 
4885 /* Function vect_permute_store_chain.
4886 
4887    Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
4888    a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
4889    the data correctly for the stores.  Return the final references for stores
4890    in RESULT_CHAIN.
4891 
4892    E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
4893    The input is 4 vectors each containing 8 elements.  We assign a number to
4894    each element, the input sequence is:
4895 
4896    1st vec:   0  1  2  3  4  5  6  7
4897    2nd vec:   8  9 10 11 12 13 14 15
4898    3rd vec:  16 17 18 19 20 21 22 23
4899    4th vec:  24 25 26 27 28 29 30 31
4900 
4901    The output sequence should be:
4902 
4903    1st vec:  0  8 16 24  1  9 17 25
4904    2nd vec:  2 10 18 26  3 11 19 27
4905    3rd vec:  4 12 20 28  5 13 21 29
4906    4th vec:  6 14 22 30  7 15 23 31
4907 
4908    i.e., we interleave the contents of the four vectors in their order.
4909 
4910    We use interleave_high/low instructions to create such output.  The input of
4911    each interleave_high/low operation is two vectors:
4912    1st vec    2nd vec
4913    0 1 2 3    4 5 6 7
4914    the even elements of the result vector are obtained left-to-right from the
4915    high/low elements of the first vector.  The odd elements of the result are
4916    obtained left-to-right from the high/low elements of the second vector.
4917    The output of interleave_high will be:   0 4 1 5
4918    and of interleave_low:                   2 6 3 7
4919 
4920 
4921    The permutation is done in log LENGTH stages.  In each stage interleave_high
4922    and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
4923    where the first argument is taken from the first half of DR_CHAIN and the
4924    second argument from its second half.
4925    In our example,
4926 
4927    I1: interleave_high (1st vec, 3rd vec)
4928    I2: interleave_low (1st vec, 3rd vec)
4929    I3: interleave_high (2nd vec, 4th vec)
4930    I4: interleave_low (2nd vec, 4th vec)
4931 
4932    The output for the first stage is:
4933 
4934    I1:  0 16  1 17  2 18  3 19
4935    I2:  4 20  5 21  6 22  7 23
4936    I3:  8 24  9 25 10 26 11 27
4937    I4: 12 28 13 29 14 30 15 31
4938 
4939    The output of the second stage, i.e. the final result is:
4940 
4941    I1:  0  8 16 24  1  9 17 25
4942    I2:  2 10 18 26  3 11 19 27
4943    I3:  4 12 20 28  5 13 21 29
4944    I4:  6 14 22 30  7 15 23 31.  */
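
/* For illustration only (plain C, not GCC internals): a model of what
   the VEC_PERM_EXPR statements built below compute, together with the
   interleave masks used for the power-of-two case.  Indices below NELT
   select from the first operand, indices from NELT upward from the
   second; NELT and the function names are assumptions of this sketch:

     #define NELT 8

     static void
     vec_perm_model (const int *v1, const int *v2,
                     const unsigned char *sel, int *out)
     {
       // out[i] = VEC_PERM_EXPR <v1, v2, sel>[i]
       for (unsigned int i = 0; i < NELT; i++)
         out[i] = sel[i] < NELT ? v1[sel[i]] : v2[sel[i] - NELT];
     }

     static void
     interleave_masks (unsigned char *sel_high, unsigned char *sel_low)
     {
       // Mirrors the mask construction in the power-of-two branch below.
       for (unsigned int i = 0; i < NELT / 2; i++)
         {
           sel_high[i * 2] = i;
           sel_high[i * 2 + 1] = i + NELT;
           sel_low[i * 2] = i + NELT / 2;
           sel_low[i * 2 + 1] = i + NELT / 2 + NELT;
         }
     }

   Feeding the 1st vec {0..7} and the 3rd vec {16..23} of the example
   above through vec_perm_model with SEL_HIGH yields
   {0 16 1 17 2 18 3 19}, i.e. the I1 row of the first stage.  */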
4945 
4946 void
4947 vect_permute_store_chain (vec<tree> dr_chain,
4948 			  unsigned int length,
4949 			  gimple *stmt,
4950 			  gimple_stmt_iterator *gsi,
4951 			  vec<tree> *result_chain)
4952 {
4953   tree vect1, vect2, high, low;
4954   gimple *perm_stmt;
4955   tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
4956   tree perm_mask_low, perm_mask_high;
4957   tree data_ref;
4958   tree perm3_mask_low, perm3_mask_high;
4959   unsigned int i, n, log_length = exact_log2 (length);
4960   unsigned int j, nelt = TYPE_VECTOR_SUBPARTS (vectype);
4961   unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
4962 
4963   result_chain->quick_grow (length);
4964   memcpy (result_chain->address (), dr_chain.address (),
4965 	  length * sizeof (tree));
4966 
4967   if (length == 3)
4968     {
4969       unsigned int j0 = 0, j1 = 0, j2 = 0;
4970 
4971       for (j = 0; j < 3; j++)
4972         {
4973 	  int nelt0 = ((3 - j) * nelt) % 3;
4974 	  int nelt1 = ((3 - j) * nelt + 1) % 3;
4975 	  int nelt2 = ((3 - j) * nelt + 2) % 3;
4976 
4977 	  for (i = 0; i < nelt; i++)
4978 	    {
4979 	      if (3 * i + nelt0 < nelt)
4980 		sel[3 * i + nelt0] = j0++;
4981 	      if (3 * i + nelt1 < nelt)
4982 		sel[3 * i + nelt1] = nelt + j1++;
4983 	      if (3 * i + nelt2 < nelt)
4984 		sel[3 * i + nelt2] = 0;
4985 	    }
4986 	  perm3_mask_low = vect_gen_perm_mask_checked (vectype, sel);
4987 
4988 	  for (i = 0; i < nelt; i++)
4989 	    {
4990 	      if (3 * i + nelt0 < nelt)
4991 		sel[3 * i + nelt0] = 3 * i + nelt0;
4992 	      if (3 * i + nelt1 < nelt)
4993 		sel[3 * i + nelt1] = 3 * i + nelt1;
4994 	      if (3 * i + nelt2 < nelt)
4995 		sel[3 * i + nelt2] = nelt + j2++;
4996 	    }
4997 	  perm3_mask_high = vect_gen_perm_mask_checked (vectype, sel);
4998 
4999 	  vect1 = dr_chain[0];
5000 	  vect2 = dr_chain[1];
5001 
5002 	  /* Create interleaving stmt:
5003 	     low = VEC_PERM_EXPR <vect1, vect2,
5004 				  {j, nelt, *, j + 1, nelt + j + 1, *,
5005 				   j + 2, nelt + j + 2, *, ...}>  */
5006 	  data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5007 	  perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
5008 					   vect2, perm3_mask_low);
5009 	  vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5010 
5011 	  vect1 = data_ref;
5012 	  vect2 = dr_chain[2];
5013 	  /* Create interleaving stmt:
5014 	     low = VEC_PERM_EXPR <vect1, vect2,
5015 				  {0, 1, nelt + j, 3, 4, nelt + j + 1,
5016 				   6, 7, nelt + j + 2, ...}>  */
5017 	  data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5018 	  perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
5019 					   vect2, perm3_mask_high);
5020 	  vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5021 	  (*result_chain)[j] = data_ref;
5022 	}
5023     }
5024   else
5025     {
5026       /* If length is not equal to 3 then only power of 2 is supported.  */
5027       gcc_assert (pow2p_hwi (length));
5028 
5029       for (i = 0, n = nelt / 2; i < n; i++)
5030 	{
5031 	  sel[i * 2] = i;
5032 	  sel[i * 2 + 1] = i + nelt;
5033 	}
5034 	perm_mask_high = vect_gen_perm_mask_checked (vectype, sel);
5035 
5036 	for (i = 0; i < nelt; i++)
5037 	  sel[i] += nelt / 2;
5038 	perm_mask_low = vect_gen_perm_mask_checked (vectype, sel);
5039 
5040 	for (i = 0, n = log_length; i < n; i++)
5041 	  {
5042 	    for (j = 0; j < length/2; j++)
5043 	      {
5044 		vect1 = dr_chain[j];
5045 		vect2 = dr_chain[j+length/2];
5046 
5047 		/* Create interleaving stmt:
5048 		   high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
5049 							...}>  */
5050 		high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
5051 		perm_stmt = gimple_build_assign (high, VEC_PERM_EXPR, vect1,
5052 						 vect2, perm_mask_high);
5053 		vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5054 		(*result_chain)[2*j] = high;
5055 
5056 		/* Create interleaving stmt:
5057 		   low = VEC_PERM_EXPR <vect1, vect2,
5058 					{nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
5059 					 ...}>  */
5060 		low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
5061 		perm_stmt = gimple_build_assign (low, VEC_PERM_EXPR, vect1,
5062 						 vect2, perm_mask_low);
5063 		vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5064 		(*result_chain)[2*j+1] = low;
5065 	      }
5066 	    memcpy (dr_chain.address (), result_chain->address (),
5067 		    length * sizeof (tree));
5068 	  }
5069     }
5070 }
5071 
5072 /* Function vect_setup_realignment
5073 
5074    This function is called when vectorizing an unaligned load using
5075    the dr_explicit_realign[_optimized] scheme.
5076    This function generates the following code at the loop prolog:
5077 
5078       p = initial_addr;
5079    x  msq_init = *(floor(p));   # prolog load
5080       realignment_token = call target_builtin;
5081     loop:
5082    x  msq = phi (msq_init, ---)
5083 
5084    The stmts marked with x are generated only for the case of
5085    dr_explicit_realign_optimized.
5086 
5087    The code above sets up a new (vector) pointer, pointing to the first
5088    location accessed by STMT, and a "floor-aligned" load using that pointer.
5089    It also generates code to compute the "realignment-token" (if the relevant
5090    target hook was defined), and creates a phi-node at the loop-header bb
5091    whose arguments are the result of the prolog-load (created by this
5092    function) and the result of a load that takes place in the loop (to be
5093    created by the caller to this function).
5094 
5095    For the case of dr_explicit_realign_optimized:
5096    The caller to this function uses the phi-result (msq) to create the
5097    realignment code inside the loop, and sets up the missing phi argument,
5098    as follows:
5099     loop:
5100       msq = phi (msq_init, lsq)
5101       lsq = *(floor(p'));        # load in loop
5102       result = realign_load (msq, lsq, realignment_token);
5103 
5104    For the case of dr_explicit_realign:
5105     loop:
5106       msq = *(floor(p)); 	# load in loop
5107       p' = p + (VS-1);
5108       lsq = *(floor(p'));	# load in loop
5109       result = realign_load (msq, lsq, realignment_token);
5110 
5111    Input:
5112    STMT - (scalar) load stmt to be vectorized. This load accesses
5113           a memory location that may be unaligned.
5114    BSI - place where new code is to be inserted.
5115    ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
5116 			      is used.
5117 
5118    Output:
5119    REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
5120                        target hook, if defined.
5121    Return value - the result of the loop-header phi node.  */
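
/* For illustration only: a rough scalar model of the realignment
   scheme sketched above.  The real REALIGN_LOAD semantics and the
   realignment token are target-specific; here the token is simply the
   element misalignment, BUF stands for memory, I is the (possibly
   unaligned) element index of the access and VS is the number of
   elements per vector.  All names are assumptions of this sketch:

     static void
     realign_load_model (const int *buf, unsigned int i, unsigned int vs,
                         int *result)
     {
       unsigned int off = i % vs;               // realignment token
       const int *msq = buf + (i - off);        // msq = *(floor (p))
       const int *lsq = msq + (off ? vs : 0);   // lsq = *(floor (p + vs - 1))
       for (unsigned int k = 0; k < vs; k++)
         result[k] = off + k < vs ? msq[off + k] : lsq[off + k - vs];
     }

   I.e. the realigned vector is the tail of MSQ followed by the head of
   LSQ, selected according to the token.  */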
5122 
5123 tree
5124 vect_setup_realignment (gimple *stmt, gimple_stmt_iterator *gsi,
5125                         tree *realignment_token,
5126 			enum dr_alignment_support alignment_support_scheme,
5127 			tree init_addr,
5128 			struct loop **at_loop)
5129 {
5130   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5131   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5132   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5133   struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
5134   struct loop *loop = NULL;
5135   edge pe = NULL;
5136   tree scalar_dest = gimple_assign_lhs (stmt);
5137   tree vec_dest;
5138   gimple *inc;
5139   tree ptr;
5140   tree data_ref;
5141   basic_block new_bb;
5142   tree msq_init = NULL_TREE;
5143   tree new_temp;
5144   gphi *phi_stmt;
5145   tree msq = NULL_TREE;
5146   gimple_seq stmts = NULL;
5147   bool inv_p;
5148   bool compute_in_loop = false;
5149   bool nested_in_vect_loop = false;
5150   struct loop *containing_loop = (gimple_bb (stmt))->loop_father;
5151   struct loop *loop_for_initial_load = NULL;
5152 
5153   if (loop_vinfo)
5154     {
5155       loop = LOOP_VINFO_LOOP (loop_vinfo);
5156       nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
5157     }
5158 
5159   gcc_assert (alignment_support_scheme == dr_explicit_realign
5160 	      || alignment_support_scheme == dr_explicit_realign_optimized);
5161 
5162   /* We need to generate three things:
5163      1. the misalignment computation
5164      2. the extra vector load (for the optimized realignment scheme).
5165      3. the phi node for the two vectors from which the realignment is
5166       done (for the optimized realignment scheme).  */
5167 
5168   /* 1. Determine where to generate the misalignment computation.
5169 
5170      If INIT_ADDR is NULL_TREE, this indicates that the misalignment
5171      calculation will be generated by this function, outside the loop (in the
5172      preheader).  Otherwise, INIT_ADDR has already been computed for us by the
5173      caller, inside the loop.
5174 
5175      Background: If the misalignment remains fixed throughout the iterations of
5176      the loop, then both realignment schemes are applicable, and also the
5177      misalignment computation can be done outside LOOP.  This is because we are
5178      vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
5179      are a multiple of VS (the Vector Size), and therefore the misalignment in
5180      different vectorized LOOP iterations is always the same.
5181      The problem arises only if the memory access is in an inner-loop nested
5182      inside LOOP, which is now being vectorized using outer-loop vectorization.
5183      This is the only case when the misalignment of the memory access may not
5184      remain fixed throughout the iterations of the inner-loop (as explained in
5185      detail in vect_supportable_dr_alignment).  In this case, not only is the
5186      optimized realignment scheme not applicable, but also the misalignment
5187      computation (and generation of the realignment token that is passed to
5188      REALIGN_LOAD) have to be done inside the loop.
5189 
5190      In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
5191      or not, which in turn determines if the misalignment is computed inside
5192      the inner-loop, or outside LOOP.  */
5193 
5194   if (init_addr != NULL_TREE || !loop_vinfo)
5195     {
5196       compute_in_loop = true;
5197       gcc_assert (alignment_support_scheme == dr_explicit_realign);
5198     }
5199 
5200 
5201   /* 2. Determine where to generate the extra vector load.
5202 
5203      For the optimized realignment scheme, instead of generating two vector
5204      loads in each iteration, we generate a single extra vector load in the
5205      preheader of the loop, and in each iteration reuse the result of the
5206      vector load from the previous iteration.  In case the memory access is in
5207      an inner-loop nested inside LOOP, which is now being vectorized using
5208      outer-loop vectorization, we need to determine whether this initial vector
5209      load should be generated at the preheader of the inner-loop, or can be
5210      generated at the preheader of LOOP.  If the memory access has no evolution
5211      in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
5212      to be generated inside LOOP (in the preheader of the inner-loop).  */
5213 
5214   if (nested_in_vect_loop)
5215     {
5216       tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
5217       bool invariant_in_outerloop =
5218             (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
5219       loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
5220     }
5221   else
5222     loop_for_initial_load = loop;
5223   if (at_loop)
5224     *at_loop = loop_for_initial_load;
5225 
5226   if (loop_for_initial_load)
5227     pe = loop_preheader_edge (loop_for_initial_load);
5228 
5229   /* 3. For the case of the optimized realignment, create the first vector
5230       load at the loop preheader.  */
5231 
5232   if (alignment_support_scheme == dr_explicit_realign_optimized)
5233     {
5234       /* Create msq_init = *(floor(p1)) in the loop preheader  */
5235       gassign *new_stmt;
5236 
5237       gcc_assert (!compute_in_loop);
5238       vec_dest = vect_create_destination_var (scalar_dest, vectype);
5239       ptr = vect_create_data_ref_ptr (stmt, vectype, loop_for_initial_load,
5240 				      NULL_TREE, &init_addr, NULL, &inc,
5241 				      true, &inv_p);
5242       if (TREE_CODE (ptr) == SSA_NAME)
5243 	new_temp = copy_ssa_name (ptr);
5244       else
5245 	new_temp = make_ssa_name (TREE_TYPE (ptr));
5246       new_stmt = gimple_build_assign
5247 		   (new_temp, BIT_AND_EXPR, ptr,
5248 		    build_int_cst (TREE_TYPE (ptr),
5249 				   -(HOST_WIDE_INT)TYPE_ALIGN_UNIT (vectype)));
5250       new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5251       gcc_assert (!new_bb);
5252       data_ref
5253 	= build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
5254 		  build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
5255       new_stmt = gimple_build_assign (vec_dest, data_ref);
5256       new_temp = make_ssa_name (vec_dest, new_stmt);
5257       gimple_assign_set_lhs (new_stmt, new_temp);
5258       if (pe)
5259         {
5260           new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5261           gcc_assert (!new_bb);
5262         }
5263       else
5264          gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5265 
5266       msq_init = gimple_assign_lhs (new_stmt);
5267     }
5268 
5269   /* 4. Create realignment token using a target builtin, if available.
5270       It is done either inside the containing loop, or before LOOP (as
5271       determined above).  */
5272 
5273   if (targetm.vectorize.builtin_mask_for_load)
5274     {
5275       gcall *new_stmt;
5276       tree builtin_decl;
5277 
5278       /* Compute INIT_ADDR - the initial address accessed by this memref.  */
5279       if (!init_addr)
5280 	{
5281 	  /* Generate the INIT_ADDR computation outside LOOP.  */
5282 	  init_addr = vect_create_addr_base_for_vector_ref (stmt, &stmts,
5283 							NULL_TREE, loop);
5284           if (loop)
5285             {
5286    	      pe = loop_preheader_edge (loop);
5287 	      new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
5288 	      gcc_assert (!new_bb);
5289             }
5290           else
5291              gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
5292 	}
5293 
5294       builtin_decl = targetm.vectorize.builtin_mask_for_load ();
5295       new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
5296       vec_dest =
5297 	vect_create_destination_var (scalar_dest,
5298 				     gimple_call_return_type (new_stmt));
5299       new_temp = make_ssa_name (vec_dest, new_stmt);
5300       gimple_call_set_lhs (new_stmt, new_temp);
5301 
5302       if (compute_in_loop)
5303 	gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5304       else
5305 	{
5306 	  /* Generate the misalignment computation outside LOOP.  */
5307 	  pe = loop_preheader_edge (loop);
5308 	  new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5309 	  gcc_assert (!new_bb);
5310 	}
5311 
5312       *realignment_token = gimple_call_lhs (new_stmt);
5313 
5314       /* The result of the CALL_EXPR to this builtin is determined from
5315          the value of the parameter and no global variables are touched
5316          which makes the builtin a "const" function.  Requiring the
5317          builtin to have the "const" attribute makes it unnecessary
5318          to call mark_call_clobbered.  */
5319       gcc_assert (TREE_READONLY (builtin_decl));
5320     }
5321 
5322   if (alignment_support_scheme == dr_explicit_realign)
5323     return msq;
5324 
5325   gcc_assert (!compute_in_loop);
5326   gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
5327 
5328 
5329   /* 5. Create msq = phi <msq_init, lsq> in loop  */
5330 
5331   pe = loop_preheader_edge (containing_loop);
5332   vec_dest = vect_create_destination_var (scalar_dest, vectype);
5333   msq = make_ssa_name (vec_dest);
5334   phi_stmt = create_phi_node (msq, containing_loop->header);
5335   add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);
5336 
5337   return msq;
5338 }
5339 
5340 
5341 /* Function vect_grouped_load_supported.
5342 
5343    COUNT is the size of the load group (the number of statements plus the
5344    number of gaps).  SINGLE_ELEMENT_P is true if there is actually
5345    only one statement, with a gap of COUNT - 1.
5346 
5347    Returns true if a suitable permute exists.  */
5348 
5349 bool
5350 vect_grouped_load_supported (tree vectype, bool single_element_p,
5351 			     unsigned HOST_WIDE_INT count)
5352 {
5353   machine_mode mode = TYPE_MODE (vectype);
5354 
5355   /* If this is single-element interleaving with an element distance
5356      that leaves unused vector loads around, punt - we would at least create
5357      very sub-optimal code in that case (and blow up memory,
5358      see PR65518).  */
5359   if (single_element_p && count > TYPE_VECTOR_SUBPARTS (vectype))
5360     {
5361       if (dump_enabled_p ())
5362 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5363 			 "single-element interleaving not supported "
5364 			 "for not adjacent vector loads\n");
5365       return false;
5366     }
5367 
5368   /* vect_permute_load_chain requires the group size to be equal to 3 or
5369      be a power of two.  */
5370   if (count != 3 && exact_log2 (count) == -1)
5371     {
5372       if (dump_enabled_p ())
5373 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5374 			 "the size of the group of accesses"
5375 			 " is not a power of 2 or not equal to 3\n");
5376       return false;
5377     }
5378 
5379   /* Check that the permutation is supported.  */
5380   if (VECTOR_MODE_P (mode))
5381     {
5382       unsigned int i, j, nelt = GET_MODE_NUNITS (mode);
5383       unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
5384 
5385       if (count == 3)
5386 	{
5387 	  unsigned int k;
5388 	  for (k = 0; k < 3; k++)
5389 	    {
5390 	      for (i = 0; i < nelt; i++)
5391 		if (3 * i + k < 2 * nelt)
5392 		  sel[i] = 3 * i + k;
5393 		else
5394 		  sel[i] = 0;
5395 	      if (!can_vec_perm_p (mode, false, sel))
5396 		{
5397 		  if (dump_enabled_p ())
5398 		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5399 				     "shuffle of 3 loads is not supported by"
5400 				     " target\n");
5401 		  return false;
5402 		}
5403 	      for (i = 0, j = 0; i < nelt; i++)
5404 		if (3 * i + k < 2 * nelt)
5405 		  sel[i] = i;
5406 		else
5407 		  sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5408 	      if (!can_vec_perm_p (mode, false, sel))
5409 		{
5410 		  if (dump_enabled_p ())
5411 		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5412 				     "shuffle of 3 loads is not supported by"
5413 				     " target\n");
5414 		  return false;
5415 		}
5416 	    }
5417 	  return true;
5418 	}
5419       else
5420 	{
5421 	  /* If length is not equal to 3 then only power of 2 is supported.  */
5422 	  gcc_assert (pow2p_hwi (count));
5423 	  for (i = 0; i < nelt; i++)
5424 	    sel[i] = i * 2;
5425 	  if (can_vec_perm_p (mode, false, sel))
5426 	    {
5427 	      for (i = 0; i < nelt; i++)
5428 		sel[i] = i * 2 + 1;
5429 	      if (can_vec_perm_p (mode, false, sel))
5430 		return true;
5431 	    }
5432         }
5433     }
5434 
5435   if (dump_enabled_p ())
5436     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5437 		     "extract even/odd not supported by target\n");
5438   return false;
5439 }
5440 
5441 /* Return TRUE if vec_load_lanes is available for COUNT vectors of
5442    type VECTYPE.  */
5443 
5444 bool
5445 vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
5446 {
5447   return vect_lanes_optab_supported_p ("vec_load_lanes",
5448 				       vec_load_lanes_optab,
5449 				       vectype, count);
5450 }
5451 
5452 /* Function vect_permute_load_chain.
5453 
5454    Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
5455    a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
5456    the input data correctly.  Return the final references for loads in
5457    RESULT_CHAIN.
5458 
5459    E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5460    The input is 4 vectors each containing 8 elements. We assign a number to each
5461    element, the input sequence is:
5462 
5463    1st vec:   0  1  2  3  4  5  6  7
5464    2nd vec:   8  9 10 11 12 13 14 15
5465    3rd vec:  16 17 18 19 20 21 22 23
5466    4th vec:  24 25 26 27 28 29 30 31
5467 
5468    The output sequence should be:
5469 
5470    1st vec:  0 4  8 12 16 20 24 28
5471    2nd vec:  1 5  9 13 17 21 25 29
5472    3rd vec:  2 6 10 14 18 22 26 30
5473    4th vec:  3 7 11 15 19 23 27 31
5474 
5475    i.e., the first output vector should contain the first elements of each
5476    interleaving group, etc.
5477 
5478    We use extract_even/odd instructions to create such output.  The input of
5479    each extract_even/odd operation is two vectors
5480    1st vec    2nd vec
5481    0 1 2 3    4 5 6 7
5482 
5483    and the output is the vector of extracted even/odd elements.  The output of
5484    extract_even will be:   0 2 4 6
5485    and of extract_odd:     1 3 5 7
5486 
5487 
5488    The permutation is done in log LENGTH stages.  In each stage extract_even
5489    and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
5490    their order.  In our example,
5491 
5492    E1: extract_even (1st vec, 2nd vec)
5493    E2: extract_odd (1st vec, 2nd vec)
5494    E3: extract_even (3rd vec, 4th vec)
5495    E4: extract_odd (3rd vec, 4th vec)
5496 
5497    The output for the first stage will be:
5498 
5499    E1:  0  2  4  6  8 10 12 14
5500    E2:  1  3  5  7  9 11 13 15
5501    E3: 16 18 20 22 24 26 28 30
5502    E4: 17 19 21 23 25 27 29 31
5503 
5504    In order to proceed and create the correct sequence for the next stage (or
5505    for the correct output, if the second stage is the last one, as in our
5506    example), we first put the output of extract_even operation and then the
5507    output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
5508    The input for the second stage is:
5509 
5510    1st vec (E1):  0  2  4  6  8 10 12 14
5511    2nd vec (E3): 16 18 20 22 24 26 28 30
5512    3rd vec (E2):  1  3  5  7  9 11 13 15
5513    4th vec (E4): 17 19 21 23 25 27 29 31
5514 
5515    The output of the second stage:
5516 
5517    E1: 0 4  8 12 16 20 24 28
5518    E2: 2 6 10 14 18 22 26 30
5519    E3: 1 5  9 13 17 21 25 29
5520    E4: 3 7 11 15 19 23 27 31
5521 
5522    And RESULT_CHAIN after reordering:
5523 
5524    1st vec (E1):  0 4  8 12 16 20 24 28
5525    2nd vec (E3):  1 5  9 13 17 21 25 29
5526    3rd vec (E2):  2 6 10 14 18 22 26 30
5527    4th vec (E4):  3 7 11 15 19 23 27 31.  */
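
/* For illustration only (plain C, not GCC internals): a model of the
   whole extract-even/odd pipeline for the example above (LENGTH 4,
   8 elements per vector).  NELT, LEN and the function names are
   assumptions of this sketch:

     #define NELT 8
     #define LEN  4

     static void
     extract_model (const int *v1, const int *v2, int odd, int *out)
     {
       // Model of VEC_PERM_EXPR with the even/odd mask built below
       // (sel[i] = 2*i or 2*i + 1; indices >= NELT select from V2).
       for (unsigned int i = 0; i < NELT; i++)
         {
           unsigned int s = 2 * i + odd;
           out[i] = s < NELT ? v1[s] : v2[s - NELT];
         }
     }

     static void
     permute_load_chain_model (int chain[LEN][NELT])
     {
       int result[LEN][NELT];
       for (unsigned int stage = 0; stage < 2; stage++)   // log2 (LEN) stages
         {
           for (unsigned int j = 0; j < LEN; j += 2)
             {
               extract_model (chain[j], chain[j + 1], 0, result[j / 2]);
               extract_model (chain[j], chain[j + 1], 1,
                              result[j / 2 + LEN / 2]);
             }
           for (unsigned int j = 0; j < LEN; j++)          // result -> chain
             for (unsigned int k = 0; k < NELT; k++)
               chain[j][k] = result[j][k];
         }
     }

   Starting from {0..7}, {8..15}, {16..23}, {24..31} this produces the
   four output vectors {0 4 ... 28}, {1 5 ... 29}, {2 6 ... 30} and
   {3 7 ... 31} listed above.  */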
5528 
5529 static void
5530 vect_permute_load_chain (vec<tree> dr_chain,
5531 			 unsigned int length,
5532 			 gimple *stmt,
5533 			 gimple_stmt_iterator *gsi,
5534 			 vec<tree> *result_chain)
5535 {
5536   tree data_ref, first_vect, second_vect;
5537   tree perm_mask_even, perm_mask_odd;
5538   tree perm3_mask_low, perm3_mask_high;
5539   gimple *perm_stmt;
5540   tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5541   unsigned int i, j, log_length = exact_log2 (length);
5542   unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype);
5543   unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
5544 
5545   result_chain->quick_grow (length);
5546   memcpy (result_chain->address (), dr_chain.address (),
5547 	  length * sizeof (tree));
5548 
5549   if (length == 3)
5550     {
5551       unsigned int k;
5552 
5553       for (k = 0; k < 3; k++)
5554 	{
5555 	  for (i = 0; i < nelt; i++)
5556 	    if (3 * i + k < 2 * nelt)
5557 	      sel[i] = 3 * i + k;
5558 	    else
5559 	      sel[i] = 0;
5560 	  perm3_mask_low = vect_gen_perm_mask_checked (vectype, sel);
5561 
5562 	  for (i = 0, j = 0; i < nelt; i++)
5563 	    if (3 * i + k < 2 * nelt)
5564 	      sel[i] = i;
5565 	    else
5566 	      sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5567 
5568 	  perm3_mask_high = vect_gen_perm_mask_checked (vectype, sel);
5569 
5570 	  first_vect = dr_chain[0];
5571 	  second_vect = dr_chain[1];
5572 
5573 	  /* Create interleaving stmt (low part of):
5574 	     low = VEC_PERM_EXPR <first_vect, second_vect, {k, 3 + k, 6 + k,
5575 							     ...}>  */
5576 	  data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5577 	  perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5578 					   second_vect, perm3_mask_low);
5579 	  vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5580 
5581 	  /* Create interleaving stmt (high part of):
5582 	     high = VEC_PERM_EXPR <first_vect, second_vect, {k, 3 + k, 6 + k,
5583 							      ...}>  */
5584 	  first_vect = data_ref;
5585 	  second_vect = dr_chain[2];
5586 	  data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5587 	  perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5588 					   second_vect, perm3_mask_high);
5589 	  vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5590 	  (*result_chain)[k] = data_ref;
5591 	}
5592     }
5593   else
5594     {
5595       /* If length is not equal to 3 then only power of 2 is supported.  */
5596       gcc_assert (pow2p_hwi (length));
5597 
5598       for (i = 0; i < nelt; ++i)
5599 	sel[i] = i * 2;
5600       perm_mask_even = vect_gen_perm_mask_checked (vectype, sel);
5601 
5602       for (i = 0; i < nelt; ++i)
5603 	sel[i] = i * 2 + 1;
5604       perm_mask_odd = vect_gen_perm_mask_checked (vectype, sel);
5605 
5606       for (i = 0; i < log_length; i++)
5607 	{
5608 	  for (j = 0; j < length; j += 2)
5609 	    {
5610 	      first_vect = dr_chain[j];
5611 	      second_vect = dr_chain[j+1];
5612 
5613 	      /* data_ref = permute_even (first_data_ref, second_data_ref);  */
5614 	      data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
5615 	      perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5616 					       first_vect, second_vect,
5617 					       perm_mask_even);
5618 	      vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5619 	      (*result_chain)[j/2] = data_ref;
5620 
5621 	      /* data_ref = permute_odd (first_data_ref, second_data_ref);  */
5622 	      data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
5623 	      perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5624 					       first_vect, second_vect,
5625 					       perm_mask_odd);
5626 	      vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5627 	      (*result_chain)[j/2+length/2] = data_ref;
5628 	    }
5629 	  memcpy (dr_chain.address (), result_chain->address (),
5630 		  length * sizeof (tree));
5631 	}
5632     }
5633 }
5634 
5635 /* Function vect_shift_permute_load_chain.
5636 
5637    Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate
5638    sequence of stmts to reorder the input data accordingly.
5639    Return the final references for loads in RESULT_CHAIN.
5640    Return true if successful, false otherwise.
5641 
5642    E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
5643    The input is 3 vectors each containing 8 elements.  We assign a
5644    number to each element, the input sequence is:
5645 
5646    1st vec:   0  1  2  3  4  5  6  7
5647    2nd vec:   8  9 10 11 12 13 14 15
5648    3rd vec:  16 17 18 19 20 21 22 23
5649 
5650    The output sequence should be:
5651 
5652    1st vec:  0 3 6  9 12 15 18 21
5653    2nd vec:  1 4 7 10 13 16 19 22
5654    3rd vec:  2 5 8 11 14 17 20 23
5655 
5656    We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.
5657 
5658    First we shuffle all 3 vectors to get correct elements order:
5659 
5660    1st vec:  ( 0  3  6) ( 1  4  7) ( 2  5)
5661    2nd vec:  ( 8 11 14) ( 9 12 15) (10 13)
5662    3rd vec:  (16 19 22) (17 20 23) (18 21)
5663 
5664    Next we unite and shift vector 3 times:
5665 
5666    1st step:
5667      shift right by 6 the concatenation of:
5668      "1st vec" and  "2nd vec"
5669        ( 0  3  6) ( 1  4  7) |( 2  5) _ ( 8 11 14) ( 9 12 15)| (10 13)
5670      "2nd vec" and  "3rd vec"
5671        ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
5672      "3rd vec" and  "1st vec"
5673        (16 19 22) (17 20 23) |(18 21) _ ( 0  3  6) ( 1  4  7)| ( 2  5)
5674 			     | New vectors                   |
5675 
5676      So that now new vectors are:
5677 
5678      1st vec:  ( 2  5) ( 8 11 14) ( 9 12 15)
5679      2nd vec:  (10 13) (16 19 22) (17 20 23)
5680      3rd vec:  (18 21) ( 0  3  6) ( 1  4  7)
5681 
5682    2nd step:
5683      shift right by 5 the concatenation of:
5684      "1st vec" and  "3rd vec"
5685        ( 2  5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0  3  6)| ( 1  4  7)
5686      "2nd vec" and  "1st vec"
5687        (10 13) (16 19 22) |(17 20 23) _ ( 2  5) ( 8 11 14)| ( 9 12 15)
5688      "3rd vec" and  "2nd vec"
5689        (18 21) ( 0  3  6) |( 1  4  7) _ (10 13) (16 19 22)| (17 20 23)
5690 			  | New vectors                   |
5691 
5692      So that now new vectors are:
5693 
5694      1st vec:  ( 9 12 15) (18 21) ( 0  3  6)
5695      2nd vec:  (17 20 23) ( 2  5) ( 8 11 14)
5696      3rd vec:  ( 1  4  7) (10 13) (16 19 22) READY
5697 
5698    3rd step:
5699      shift right by 5 the concatenation of:
5700      "1st vec" and  "1st vec"
5701        ( 9 12 15) (18 21) |( 0  3  6) _ ( 9 12 15) (18 21)| ( 0  3  6)
5702      shift right by 3 the concatenation of:
5703      "2nd vec" and  "2nd vec"
5704                (17 20 23) |( 2  5) ( 8 11 14) _ (17 20 23)| ( 2  5) ( 8 11 14)
5705 			  | New vectors                   |
5706 
5707      So that now all vectors are READY:
5708      1st vec:  ( 0  3  6) ( 9 12 15) (18 21)
5709      2nd vec:  ( 2  5) ( 8 11 14) (17 20 23)
5710      3rd vec:  ( 1  4  7) (10 13) (16 19 22)
5711 
5712    This algorithm is faster than the one in vect_permute_load_chain if:
5713      1.  "shift of a concatenation" is faster than general permutation.
5714 	 This is usually so.
5715      2.  The TARGET machine cannot execute vector instructions in parallel.
5716 	 This is because each step of the algorithm depends on the previous one.
5717 	 The algorithm in vect_permute_load_chain is much more parallel.
5718 
5719    The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.
5720 */
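
/* For illustration only (plain C, not GCC internals): the first
   shuffle mask built below for the LENGTH == 3 case, which regroups a
   loaded vector into the (0 3 6) (1 4 7) (2 5) chunks used in the
   walk-through above.  The function name is an assumption of this
   sketch; the loop mirrors the mask construction in the body below:

     static void
     perm3_mask_model (unsigned int nelt, unsigned char *sel)
     {
       unsigned int k = 0, l = 0;
       for (unsigned int i = 0; i < nelt; i++)
         {
           if (3 * k + (l % 3) >= nelt)
             {
               k = 0;
               l += 3 - (nelt % 3);
             }
           sel[i] = 3 * k + (l % 3);
           k++;
         }
     }

   For NELT == 8 this yields {0 3 6 1 4 7 2 5}, matching the constant
   documented in the body of the function.  */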
5721 
5722 static bool
5723 vect_shift_permute_load_chain (vec<tree> dr_chain,
5724 			       unsigned int length,
5725 			       gimple *stmt,
5726 			       gimple_stmt_iterator *gsi,
5727 			       vec<tree> *result_chain)
5728 {
5729   tree vect[3], vect_shift[3], data_ref, first_vect, second_vect;
5730   tree perm2_mask1, perm2_mask2, perm3_mask;
5731   tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask;
5732   gimple *perm_stmt;
5733 
5734   tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5735   unsigned int i;
5736   unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype);
5737   unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
5738   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5739   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5740 
5741   result_chain->quick_grow (length);
5742   memcpy (result_chain->address (), dr_chain.address (),
5743 	  length * sizeof (tree));
5744 
5745   if (pow2p_hwi (length) && LOOP_VINFO_VECT_FACTOR (loop_vinfo) > 4)
5746     {
5747       unsigned int j, log_length = exact_log2 (length);
5748       for (i = 0; i < nelt / 2; ++i)
5749 	sel[i] = i * 2;
5750       for (i = 0; i < nelt / 2; ++i)
5751 	sel[nelt / 2 + i] = i * 2 + 1;
5752       if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5753 	{
5754 	  if (dump_enabled_p ())
5755 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5756 			     "shuffle of 2 fields structure is not \
5757 			      supported by target\n");
5758 	  return false;
5759 	}
5760       perm2_mask1 = vect_gen_perm_mask_checked (vectype, sel);
5761 
5762       for (i = 0; i < nelt / 2; ++i)
5763 	sel[i] = i * 2 + 1;
5764       for (i = 0; i < nelt / 2; ++i)
5765 	sel[nelt / 2 + i] = i * 2;
5766       if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5767 	{
5768 	  if (dump_enabled_p ())
5769 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5770 			     "shuffle of 2 fields structure is not \
5771 			      supported by target\n");
5772 	  return false;
5773 	}
5774       perm2_mask2 = vect_gen_perm_mask_checked (vectype, sel);
5775 
5776       /* Generating permutation constant to shift all elements.
5777 	 For vector length 8 it is {4 5 6 7 8 9 10 11}.  */
5778       for (i = 0; i < nelt; i++)
5779 	sel[i] = nelt / 2 + i;
5780       if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5781 	{
5782 	  if (dump_enabled_p ())
5783 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5784 			     "shift permutation is not supported by target\n");
5785 	  return false;
5786 	}
5787       shift1_mask = vect_gen_perm_mask_checked (vectype, sel);
5788 
5789       /* Generating permutation constant to select vector from 2.
5790 	 For vector length 8 it is {0 1 2 3 12 13 14 15}.  */
5791       for (i = 0; i < nelt / 2; i++)
5792 	sel[i] = i;
5793       for (i = nelt / 2; i < nelt; i++)
5794 	sel[i] = nelt + i;
5795       if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5796 	{
5797 	  if (dump_enabled_p ())
5798 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5799 			     "select is not supported by target\n");
5800 	  return false;
5801 	}
5802       select_mask = vect_gen_perm_mask_checked (vectype, sel);
5803 
5804       for (i = 0; i < log_length; i++)
5805 	{
5806 	  for (j = 0; j < length; j += 2)
5807 	    {
5808 	      first_vect = dr_chain[j];
5809 	      second_vect = dr_chain[j + 1];
5810 
5811 	      data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
5812 	      perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5813 					       first_vect, first_vect,
5814 					       perm2_mask1);
5815 	      vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5816 	      vect[0] = data_ref;
5817 
5818 	      data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
5819 	      perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5820 					       second_vect, second_vect,
5821 					       perm2_mask2);
5822 	      vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5823 	      vect[1] = data_ref;
5824 
5825 	      data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift");
5826 	      perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5827 					       vect[0], vect[1], shift1_mask);
5828 	      vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5829 	      (*result_chain)[j/2 + length/2] = data_ref;
5830 
5831 	      data_ref = make_temp_ssa_name (vectype, NULL, "vect_select");
5832 	      perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5833 					       vect[0], vect[1], select_mask);
5834 	      vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5835 	      (*result_chain)[j/2] = data_ref;
5836 	    }
5837 	  memcpy (dr_chain.address (), result_chain->address (),
5838 		  length * sizeof (tree));
5839 	}
5840       return true;
5841     }
5842   if (length == 3 && LOOP_VINFO_VECT_FACTOR (loop_vinfo) > 2)
5843     {
5844       unsigned int k = 0, l = 0;
5845 
5846       /* Generating permutation constant to get all elements in right order.
5847 	 For vector length 8 it is {0 3 6 1 4 7 2 5}.  */
5848       for (i = 0; i < nelt; i++)
5849 	{
5850 	  if (3 * k + (l % 3) >= nelt)
5851 	    {
5852 	      k = 0;
5853 	      l += (3 - (nelt % 3));
5854 	    }
5855 	  sel[i] = 3 * k + (l % 3);
5856 	  k++;
5857 	}
5858       if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5859 	{
5860 	  if (dump_enabled_p ())
5861 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5862 			     "shuffle of 3 fields structure is not \
5863 			      supported by target\n");
5864 	  return false;
5865 	}
5866       perm3_mask = vect_gen_perm_mask_checked (vectype, sel);
5867 
5868       /* Generating permutation constant to shift all elements.
5869 	 For vector length 8 it is {6 7 8 9 10 11 12 13}.  */
5870       for (i = 0; i < nelt; i++)
5871 	sel[i] = 2 * (nelt / 3) + (nelt % 3) + i;
5872       if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5873 	{
5874 	  if (dump_enabled_p ())
5875 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5876 			     "shift permutation is not supported by target\n");
5877 	  return false;
5878 	}
5879       shift1_mask = vect_gen_perm_mask_checked (vectype, sel);
5880 
5881       /* Generating permutation constant to shift all elements.
5882 	 For vector length 8 it is {5 6 7 8 9 10 11 12}.  */
5883       for (i = 0; i < nelt; i++)
5884 	sel[i] = 2 * (nelt / 3) + 1 + i;
5885       if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5886 	{
5887 	  if (dump_enabled_p ())
5888 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5889 			     "shift permutation is not supported by target\n");
5890 	  return false;
5891 	}
5892       shift2_mask = vect_gen_perm_mask_checked (vectype, sel);
5893 
5894       /* Generating permutation constant to shift all elements.
5895 	 For vector length 8 it is {3 4 5 6 7 8 9 10}.  */
5896       for (i = 0; i < nelt; i++)
5897 	sel[i] = (nelt / 3) + (nelt % 3) / 2 + i;
5898       if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5899 	{
5900 	  if (dump_enabled_p ())
5901 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5902 			     "shift permutation is not supported by target\n");
5903 	  return false;
5904 	}
5905       shift3_mask = vect_gen_perm_mask_checked (vectype, sel);
5906 
5907       /* Generating permutation constant to shift all elements.
5908 	 For vector length 8 it is {5 6 7 8 9 10 11 12}.  */
5909       for (i = 0; i < nelt; i++)
5910 	sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i;
5911       if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5912 	{
5913 	  if (dump_enabled_p ())
5914 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5915 			     "shift permutation is not supported by target\n");
5916 	  return false;
5917 	}
5918       shift4_mask = vect_gen_perm_mask_checked (vectype, sel);
5919 
5920       for (k = 0; k < 3; k++)
5921 	{
5922 	  data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3");
5923 	  perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5924 					   dr_chain[k], dr_chain[k],
5925 					   perm3_mask);
5926 	  vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5927 	  vect[k] = data_ref;
5928 	}
5929 
5930       for (k = 0; k < 3; k++)
5931 	{
5932 	  data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1");
5933 	  perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5934 					   vect[k % 3], vect[(k + 1) % 3],
5935 					   shift1_mask);
5936 	  vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5937 	  vect_shift[k] = data_ref;
5938 	}
5939 
5940       for (k = 0; k < 3; k++)
5941 	{
5942 	  data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2");
5943 	  perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5944 					   vect_shift[(4 - k) % 3],
5945 					   vect_shift[(3 - k) % 3],
5946 					   shift2_mask);
5947 	  vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5948 	  vect[k] = data_ref;
5949 	}
5950 
5951       (*result_chain)[3 - (nelt % 3)] = vect[2];
5952 
5953       data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3");
5954       perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[0],
5955 				       vect[0], shift3_mask);
5956       vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5957       (*result_chain)[nelt % 3] = data_ref;
5958 
5959       data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4");
5960       perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[1],
5961 				       vect[1], shift4_mask);
5962       vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5963       (*result_chain)[0] = data_ref;
5964       return true;
5965     }
5966   return false;
5967 }
5968 
5969 /* Function vect_transform_grouped_load.
5970 
5971    Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
5972    to perform their permutation and ascribe the result vectorized statements to
5973    the scalar statements.
5974 */
5975 
5976 void
5977 vect_transform_grouped_load (gimple *stmt, vec<tree> dr_chain, int size,
5978 			     gimple_stmt_iterator *gsi)
5979 {
5980   machine_mode mode;
5981   vec<tree> result_chain = vNULL;
5982 
5983   /* DR_CHAIN contains input data-refs that are a part of the interleaving.
5984      RESULT_CHAIN is the output of vect_permute_load_chain, it contains permuted
5985      vectors, that are ready for vector computation.  */
5986   result_chain.create (size);
5987 
5988   /* If the reassociation width for the vector type is 2 or greater, the target
5989      machine can execute 2 or more vector instructions in parallel.  Otherwise
5990      try to get the chain for the load group using vect_shift_permute_load_chain.  */
5991   mode = TYPE_MODE (STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt)));
5992   if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1
5993       || pow2p_hwi (size)
5994       || !vect_shift_permute_load_chain (dr_chain, size, stmt,
5995 					 gsi, &result_chain))
5996     vect_permute_load_chain (dr_chain, size, stmt, gsi, &result_chain);
5997   vect_record_grouped_load_vectors (stmt, result_chain);
5998   result_chain.release ();
5999 }
6000 
6001 /* RESULT_CHAIN contains the output of a group of grouped loads that were
6002    generated as part of the vectorization of STMT.  Assign the statement
6003    for each vector to the associated scalar statement.  */
6004 
6005 void
6006 vect_record_grouped_load_vectors (gimple *stmt, vec<tree> result_chain)
6007 {
6008   gimple *first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt));
6009   gimple *next_stmt, *new_stmt;
6010   unsigned int i, gap_count;
6011   tree tmp_data_ref;
6012 
6013   /* Put a permuted data-ref in the VECTORIZED_STMT field.
6014      Since we scan the chain starting from its first node, the order of the
6015      statements corresponds to the order of data-refs in RESULT_CHAIN.  */
6016   next_stmt = first_stmt;
6017   gap_count = 1;
6018   FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
6019     {
6020       if (!next_stmt)
6021 	break;
6022 
6023       /* Skip the gaps.  Loads created for the gaps will be removed later by
6024        the dead code elimination pass.  No need to check for the first stmt in
6025        the group, since it always exists.
6026        GROUP_GAP is the number of steps in elements from the previous
6027        access (if there is no gap, GROUP_GAP is 1).  We skip loads that
6028        correspond to the gaps.  */
6029       if (next_stmt != first_stmt
6030           && gap_count < GROUP_GAP (vinfo_for_stmt (next_stmt)))
6031       {
6032         gap_count++;
6033         continue;
6034       }
6035 
6036       while (next_stmt)
6037         {
6038 	  new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
6039 	  /* We assume that if VEC_STMT is not NULL, this is a case of multiple
6040 	     copies, and we put the new vector statement in the first available
6041 	     RELATED_STMT.  */
6042 	  if (!STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)))
6043 	    STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)) = new_stmt;
6044 	  else
6045             {
6046               if (!GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
6047                 {
6048 		  gimple *prev_stmt =
6049 		    STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt));
6050 		  gimple *rel_stmt =
6051 		    STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt));
6052 	          while (rel_stmt)
6053 		    {
6054 		      prev_stmt = rel_stmt;
6055 		      rel_stmt =
6056                         STMT_VINFO_RELATED_STMT (vinfo_for_stmt (rel_stmt));
6057 		    }
6058 
6059   	          STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt)) =
6060                     new_stmt;
6061                 }
6062             }
6063 
6064 	  next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
6065 	  gap_count = 1;
6066 	  /* If NEXT_STMT accesses the same DR as the previous statement,
6067 	     put the same TMP_DATA_REF as its vectorized statement; otherwise
6068 	     get the next data-ref from RESULT_CHAIN.  */
6069 	  if (!next_stmt || !GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
6070 	    break;
6071         }
6072     }
6073 }
6074 
6075 /* Function vect_can_force_dr_alignment_p.
6076 
6077    Returns whether the alignment of DECL can be forced to an ALIGNMENT-bit
6078    boundary.  */
6079 
6080 bool
6081 vect_can_force_dr_alignment_p (const_tree decl, unsigned int alignment)
6082 {
6083   if (!VAR_P (decl))
6084     return false;
6085 
6086   if (decl_in_symtab_p (decl)
6087       && !symtab_node::get (decl)->can_increase_alignment_p ())
6088     return false;
6089 
6090   if (TREE_STATIC (decl))
6091     return (alignment <= MAX_OFILE_ALIGNMENT);
6092   else
6093     return (alignment <= MAX_STACK_ALIGNMENT);
6094 }
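/* A hedged, simplified usage sketch (hypothetical helper, not taken from this
   file): callers are expected to raise DECL_ALIGN only after the predicate
   above has approved the requested bit alignment.  Real callers additionally
   route decls that live in the symbol table through
   symtab_node::increase_alignment.  */
static void
maybe_force_decl_alignment (tree decl, unsigned int align_in_bits)
{
  if (vect_can_force_dr_alignment_p (decl, align_in_bits)
      && DECL_ALIGN (decl) < align_in_bits)
    {
      SET_DECL_ALIGN (decl, align_in_bits);
      /* Treat the new alignment like a user-specified one so that later
	 passes do not reduce it again.  */
      DECL_USER_ALIGN (decl) = 1;
    }
}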
6095 
6096 
6097 /* Return whether the data reference DR is supported with respect to its
6098    alignment.
6099    If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even
6100    if it is aligned, i.e., check if it is possible to vectorize it with a
6101    different alignment.  */
6102 
6103 enum dr_alignment_support
6104 vect_supportable_dr_alignment (struct data_reference *dr,
6105                                bool check_aligned_accesses)
6106 {
6107   gimple *stmt = DR_STMT (dr);
6108   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
6109   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6110   machine_mode mode = TYPE_MODE (vectype);
6111   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6112   struct loop *vect_loop = NULL;
6113   bool nested_in_vect_loop = false;
6114 
6115   if (aligned_access_p (dr) && !check_aligned_accesses)
6116     return dr_aligned;
6117 
6118   /* For now assume all conditional loads/stores support unaligned
6119      access without any special code.  */
6120   if (is_gimple_call (stmt)
6121       && gimple_call_internal_p (stmt)
6122       && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
6123 	  || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
6124     return dr_unaligned_supported;
6125 
6126   if (loop_vinfo)
6127     {
6128       vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
6129       nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt);
6130     }
6131 
6132   /* Possibly unaligned access.  */
6133 
6134   /* We can choose between using the implicit realignment scheme (generating
6135      a misaligned_move stmt) and the explicit realignment scheme (generating
6136      aligned loads with a REALIGN_LOAD).  There are two variants to the
6137      explicit realignment scheme: optimized, and unoptimized.
6138      We can optimize the realignment only if the step between consecutive
6139      vector loads is equal to the vector size.  Since the vector memory
6140      accesses advance in steps of VS (Vector Size) in the vectorized loop, it
6141      is guaranteed that the misalignment amount remains the same throughout the
6142      execution of the vectorized loop.  Therefore, we can create the
6143      "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
6144      at the loop preheader.
6145 
6146      However, in the case of outer-loop vectorization, when vectorizing a
6147      memory access in the inner-loop nested within the LOOP that is now being
6148      vectorized, while it is guaranteed that the misalignment of the
6149      vectorized memory access will remain the same in different outer-loop
6150      iterations, it is *not* guaranteed that it will remain the same throughout
6151      the execution of the inner-loop.  This is because the inner-loop advances
6152      with the original scalar step (and not in steps of VS).  If the inner-loop
6153      step happens to be a multiple of VS, then the misalignment remains fixed
6154      and we can use the optimized realignment scheme.  For example:
6155 
6156       for (i=0; i<N; i++)
6157         for (j=0; j<M; j++)
6158           s += a[i+j];
6159 
6160      When vectorizing the i-loop in the above example, the step between
6161      consecutive vector loads is 1, and so the misalignment does not remain
6162      fixed across the execution of the inner-loop, and the realignment cannot
6163      be optimized (as illustrated in the following pseudo vectorized loop):
6164 
6165       for (i=0; i<N; i+=4)
6166         for (j=0; j<M; j++){
6167           vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
6168                          // when j is {0,1,2,3,4,5,6,7,...} respectively.
6169                          // (assuming that we start from an aligned address).
6170           }
6171 
6172      We therefore have to use the unoptimized realignment scheme:
6173 
6174       for (i=0; i<N; i+=4)
6175         for (j=k; j<M; j+=4)
6176           vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
6177                          // that the misalignment of the initial address is
6178                          // 0).
6179 
6180      The loop can then be vectorized as follows:
6181 
6182       for (k=0; k<4; k++){
6183         rt = get_realignment_token (&vp[k]);
6184         for (i=0; i<N; i+=4){
6185           v1 = vp[i+k];
6186           for (j=k; j<M; j+=4){
6187             v2 = vp[i+j+VS-1];
6188             va = REALIGN_LOAD <v1,v2,rt>;
6189             vs += va;
6190             v1 = v2;
6191           }
6192         }
6193     } */
6194 
6195   if (DR_IS_READ (dr))
6196     {
6197       bool is_packed = false;
6198       tree type = TREE_TYPE (DR_REF (dr));
6199 
6200       if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
6201 	  && (!targetm.vectorize.builtin_mask_for_load
6202 	      || targetm.vectorize.builtin_mask_for_load ()))
6203 	{
6204 	  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6205 
6206 	  /* If we are doing SLP then the accesses need not have the
6207 	     same alignment, instead it depends on the SLP group size.  */
6208 	  if (loop_vinfo
6209 	      && STMT_SLP_TYPE (stmt_info)
6210 	      && (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
6211 		  * GROUP_SIZE (vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info)))
6212 		  % TYPE_VECTOR_SUBPARTS (vectype) != 0))
6213 	    ;
6214 	  else if (!loop_vinfo
6215 		   || (nested_in_vect_loop
6216 		       && (TREE_INT_CST_LOW (DR_STEP (dr))
6217 			   != GET_MODE_SIZE (TYPE_MODE (vectype)))))
6218 	    return dr_explicit_realign;
6219 	  else
6220 	    return dr_explicit_realign_optimized;
6221 	}
6222       if (!known_alignment_for_access_p (dr))
6223 	is_packed = not_size_aligned (DR_REF (dr));
6224 
6225       if (targetm.vectorize.support_vector_misalignment
6226 	    (mode, type, DR_MISALIGNMENT (dr), is_packed))
6227 	/* Can't software pipeline the loads, but can at least do them.  */
6228 	return dr_unaligned_supported;
6229     }
6230   else
6231     {
6232       bool is_packed = false;
6233       tree type = TREE_TYPE (DR_REF (dr));
6234 
6235       if (!known_alignment_for_access_p (dr))
6236 	is_packed = not_size_aligned (DR_REF (dr));
6237 
6238       if (targetm.vectorize.support_vector_misalignment
6239 	    (mode, type, DR_MISALIGNMENT (dr), is_packed))
6240 	return dr_unaligned_supported;
6241     }
6242 
6243   /* Unsupported.  */
6244   return dr_unaligned_unsupported;
6245 }
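/* A minimal, standalone sketch (not part of GCC) that reproduces the
   misalignment pattern discussed in the comment inside
   vect_supportable_dr_alignment: when the i-loop is vectorized with VS == 4
   elements while the inner j-loop keeps the original scalar step, the
   misalignment of &vp[i+j] cycles through {0,1,2,3,0,1,2,3,...} as j
   advances, assuming the base address is aligned.  */
#include <stdio.h>

int
main (void)
{
  const int vs = 4;		/* elements per vector (VS) */
  for (int i = 0; i < 8; i += vs)
    for (int j = 0; j < 8; j++)
      /* The element offset modulo VS is the misalignment in elements.  */
      printf ("i=%d j=%d misalignment=%d\n", i, j, (i + j) % vs);
  return 0;
}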
6246