xref: /netbsd-src/external/gpl3/gcc.old/dist/gcc/tree-vect-data-refs.c (revision f3cfa6f6ce31685c6c4a758bc430e69eb99f50a4)
1 /* Data References Analysis and Manipulation Utilities for Vectorization.
2    Copyright (C) 2003-2016 Free Software Foundation, Inc.
3    Contributed by Dorit Naishlos <dorit@il.ibm.com>
4    and Ira Rosen <irar@il.ibm.com>
5 
6 This file is part of GCC.
7 
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
12 
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
16 for more details.
17 
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3.  If not see
20 <http://www.gnu.org/licenses/>.  */
21 
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "predict.h"
31 #include "tm_p.h"
32 #include "ssa.h"
33 #include "optabs-tree.h"
34 #include "cgraph.h"
35 #include "dumpfile.h"
36 #include "alias.h"
37 #include "fold-const.h"
38 #include "stor-layout.h"
39 #include "tree-eh.h"
40 #include "gimplify.h"
41 #include "gimple-iterator.h"
42 #include "gimplify-me.h"
43 #include "tree-ssa-loop-ivopts.h"
44 #include "tree-ssa-loop-manip.h"
45 #include "tree-ssa-loop.h"
46 #include "cfgloop.h"
47 #include "tree-scalar-evolution.h"
48 #include "tree-vectorizer.h"
49 #include "expr.h"
50 #include "builtins.h"
51 #include "params.h"
52 
53 /* Return true if load- or store-lanes optab OPTAB is implemented for
54    COUNT vectors of type VECTYPE.  NAME is the name of OPTAB.  */
55 
56 static bool
57 vect_lanes_optab_supported_p (const char *name, convert_optab optab,
58 			      tree vectype, unsigned HOST_WIDE_INT count)
59 {
60   machine_mode mode, array_mode;
61   bool limit_p;
62 
63   mode = TYPE_MODE (vectype);
64   limit_p = !targetm.array_mode_supported_p (mode, count);
65   array_mode = mode_for_size (count * GET_MODE_BITSIZE (mode),
66 			      MODE_INT, limit_p);
67 
68   if (array_mode == BLKmode)
69     {
70       if (dump_enabled_p ())
71 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
72                          "no array mode for %s[" HOST_WIDE_INT_PRINT_DEC "]\n",
73                          GET_MODE_NAME (mode), count);
74       return false;
75     }
76 
77   if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
78     {
79       if (dump_enabled_p ())
80 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
81                          "cannot use %s<%s><%s>\n", name,
82                          GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
83       return false;
84     }
85 
86   if (dump_enabled_p ())
87     dump_printf_loc (MSG_NOTE, vect_location,
88                      "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode),
89                      GET_MODE_NAME (mode));
90 
91   return true;
92 }
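
/* For example (an illustrative sketch, not tied to any particular target):
   with VECTYPE being a 4 x 32-bit integer vector (a 128-bit mode) and
   COUNT == 2, the array mode looked up above is an integer mode of
   2 * 128 = 256 bits.  If no such integer mode exists, or the target does
   not implement the load/store-lanes optab for that <array, vector> mode
   pair, the function returns false and the caller simply does not use
   load/store-lanes for the access group.  */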
93 
94 
95 /* Return the smallest scalar part of STMT.
96    This is used to determine the vectype of the stmt.  We generally set the
97    vectype according to the type of the result (lhs).  For stmts whose
98    result type is different from the type of the arguments (e.g., demotion,
99    promotion), vectype will be reset appropriately (later).  Note that we have
100    to visit the smallest datatype in this function, because that determines
101    the VF.  If the smallest datatype in the loop were present only as the rhs
102    of a promotion operation, we would miss it.
103    Such a case, where a variable of this datatype does not appear in the lhs
104    anywhere in the loop, can only occur if it's an invariant, e.g.
105    'int_x = (int) short_inv', which we'd expect to have been optimized away by
106    invariant motion.  However, we cannot rely on invariant motion to always
107    take invariants out of the loop, and so in the case of promotion we also
108    have to check the rhs.
109    LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
110    types.  */
111 
112 tree
113 vect_get_smallest_scalar_type (gimple *stmt, HOST_WIDE_INT *lhs_size_unit,
114                                HOST_WIDE_INT *rhs_size_unit)
115 {
116   tree scalar_type = gimple_expr_type (stmt);
117   HOST_WIDE_INT lhs, rhs;
118 
119   lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
120 
121   if (is_gimple_assign (stmt)
122       && (gimple_assign_cast_p (stmt)
123           || gimple_assign_rhs_code (stmt) == WIDEN_MULT_EXPR
124           || gimple_assign_rhs_code (stmt) == WIDEN_LSHIFT_EXPR
125           || gimple_assign_rhs_code (stmt) == FLOAT_EXPR))
126     {
127       tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
128 
129       rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
130       if (rhs < lhs)
131         scalar_type = rhs_type;
132     }
133 
134   *lhs_size_unit = lhs;
135   *rhs_size_unit = rhs;
136   return scalar_type;
137 }
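
/* A small illustrative example (hypothetical code, not from the sources):
   for the loop

	short *a;  int *b;
	for (i = 0; i < n; i++)
	  b[i] = (int) a[i];	<-- promotion from short to int

   the assignment's result type is 'int' (4 bytes) but its rhs type is
   'short' (2 bytes), so the function above returns 'short' with
   *LHS_SIZE_UNIT == 4 and *RHS_SIZE_UNIT == 2; the smaller type is what
   determines the vectorization factor.  */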
138 
139 
140 /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
141    tested at run-time.  Return TRUE if DDR was successfully inserted.
142    Return false if versioning is not supported.  */
143 
144 static bool
145 vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
146 {
147   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
148 
149   if ((unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS) == 0)
150     return false;
151 
152   if (dump_enabled_p ())
153     {
154       dump_printf_loc (MSG_NOTE, vect_location,
155                        "mark for run-time aliasing test between ");
156       dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_A (ddr)));
157       dump_printf (MSG_NOTE,  " and ");
158       dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_B (ddr)));
159       dump_printf (MSG_NOTE, "\n");
160     }
161 
162   if (optimize_loop_nest_for_size_p (loop))
163     {
164       if (dump_enabled_p ())
165 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
166                          "versioning not supported when optimizing"
167                          " for size.\n");
168       return false;
169     }
170 
171   /* FORNOW: We don't support versioning with outer-loop vectorization.  */
172   if (loop->inner)
173     {
174       if (dump_enabled_p ())
175 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
176                          "versioning not yet supported for outer-loops.\n");
177       return false;
178     }
179 
180   /* FORNOW: We don't support creating runtime alias tests for non-constant
181      step.  */
182   if (TREE_CODE (DR_STEP (DDR_A (ddr))) != INTEGER_CST
183       || TREE_CODE (DR_STEP (DDR_B (ddr))) != INTEGER_CST)
184     {
185       if (dump_enabled_p ())
186 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
187                          "versioning not yet supported for non-constant "
188                          "step\n");
189       return false;
190     }
191 
192   LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
193   return true;
194 }
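
/* The DDRs collected above are only marked here; the actual run-time test
   is emitted later when the loop is versioned.  Conceptually (a sketch,
   with made-up names, of the kind of guard versioning creates for two
   data references A and B with segment lengths LEN_A and LEN_B):

	if (&A + LEN_A <= &B || &B + LEN_B <= &A)
	  vectorized loop;
	else
	  scalar loop;

   i.e. the vector loop runs only when the two accessed segments cannot
   overlap.  */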
195 
196 
197 /* Function vect_analyze_data_ref_dependence.
198 
199    Return TRUE if there (might) exist a dependence between a memory-reference
200    DRA and a memory-reference DRB.  When versioning for alias can instead check
201    the dependence at run-time, return FALSE.  Adjust *MAX_VF according to
202    the data dependence.  */
203 
204 static bool
205 vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
206                                   loop_vec_info loop_vinfo, int *max_vf)
207 {
208   unsigned int i;
209   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
210   struct data_reference *dra = DDR_A (ddr);
211   struct data_reference *drb = DDR_B (ddr);
212   stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
213   stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
214   lambda_vector dist_v;
215   unsigned int loop_depth;
216 
217   /* In loop analysis all data references should be vectorizable.  */
218   if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
219       || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
220     gcc_unreachable ();
221 
222   /* Independent data accesses.  */
223   if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
224     return false;
225 
226   if (dra == drb
227       || (DR_IS_READ (dra) && DR_IS_READ (drb)))
228     return false;
229 
230   /* Even if we have an anti-dependence then, as the vectorized loop covers at
231      least two scalar iterations, there is always also a true dependence.
232      As the vectorizer does not re-order loads and stores we can ignore
233      the anti-dependence if TBAA can disambiguate both DRs similar to the
234      case with known negative distance anti-dependences (positive
235      distance anti-dependences would violate TBAA constraints).  */
236   if (((DR_IS_READ (dra) && DR_IS_WRITE (drb))
237        || (DR_IS_WRITE (dra) && DR_IS_READ (drb)))
238       && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)),
239 				 get_alias_set (DR_REF (drb))))
240     return false;
241 
242   /* Unknown data dependence.  */
243   if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
244     {
245       /* If user asserted safelen consecutive iterations can be
246 	 executed concurrently, assume independence.  */
247       if (loop->safelen >= 2)
248 	{
249 	  if (loop->safelen < *max_vf)
250 	    *max_vf = loop->safelen;
251 	  LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
252 	  return false;
253 	}
254 
255       if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
256 	  || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
257 	{
258 	  if (dump_enabled_p ())
259 	    {
260 	      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
261 			       "versioning for alias not supported for: "
262 			       "can't determine dependence between ");
263 	      dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
264 				 DR_REF (dra));
265 	      dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
266 	      dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
267 				 DR_REF (drb));
268 	      dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
269 	    }
270 	  return true;
271 	}
272 
273       if (dump_enabled_p ())
274 	{
275 	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
276 			   "versioning for alias required: "
277 			   "can't determine dependence between ");
278 	  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
279 			     DR_REF (dra));
280 	  dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
281 	  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
282 			     DR_REF (drb));
283 	  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
284 	}
285 
286       /* Add to list of ddrs that need to be tested at run-time.  */
287       return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
288     }
289 
290   /* Known data dependence.  */
291   if (DDR_NUM_DIST_VECTS (ddr) == 0)
292     {
293       /* If user asserted safelen consecutive iterations can be
294 	 executed concurrently, assume independence.  */
295       if (loop->safelen >= 2)
296 	{
297 	  if (loop->safelen < *max_vf)
298 	    *max_vf = loop->safelen;
299 	  LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
300 	  return false;
301 	}
302 
303       if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
304 	  || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
305 	{
306 	  if (dump_enabled_p ())
307 	    {
308 	      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
309 			       "versioning for alias not supported for: "
310 			       "bad dist vector for ");
311 	      dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
312 				 DR_REF (dra));
313 	      dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
314 	      dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
315 				 DR_REF (drb));
316 	      dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
317 	    }
318 	  return true;
319 	}
320 
321       if (dump_enabled_p ())
322         {
323           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
324                            "versioning for alias required: "
325                            "bad dist vector for ");
326           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
327           dump_printf (MSG_MISSED_OPTIMIZATION,  " and ");
328           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
329           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
330         }
331       /* Add to list of ddrs that need to be tested at run-time.  */
332       return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
333     }
334 
335   loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
336   FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
337     {
338       int dist = dist_v[loop_depth];
339 
340       if (dump_enabled_p ())
341 	dump_printf_loc (MSG_NOTE, vect_location,
342                          "dependence distance  = %d.\n", dist);
343 
344       if (dist == 0)
345 	{
346 	  if (dump_enabled_p ())
347 	    {
348 	      dump_printf_loc (MSG_NOTE, vect_location,
349 	                       "dependence distance == 0 between ");
350 	      dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
351 	      dump_printf (MSG_NOTE, " and ");
352 	      dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
353 	      dump_printf (MSG_NOTE, "\n");
354 	    }
355 
356 	  /* When we perform grouped accesses and perform implicit CSE
357 	     by detecting equal accesses and doing disambiguation with
358 	     runtime alias tests like for
359 	        .. = a[i];
360 		.. = a[i+1];
361 		a[i] = ..;
362 		a[i+1] = ..;
363 		*p = ..;
364 		.. = a[i];
365 		.. = a[i+1];
366 	     where we will end up loading { a[i], a[i+1] } once, make
367 	     sure that inserting group loads before the first load and
368 	     stores after the last store will do the right thing.
369 	     Similar for groups like
370 	        a[i] = ...;
371 		... = a[i];
372 		a[i+1] = ...;
373 	     where loads from the group interleave with the store.  */
374 	  if (STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
375 	      || STMT_VINFO_GROUPED_ACCESS (stmtinfo_b))
376 	    {
377 	      gimple *earlier_stmt;
378 	      earlier_stmt = get_earlier_stmt (DR_STMT (dra), DR_STMT (drb));
379 	      if (DR_IS_WRITE
380 		    (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt))))
381 		{
382 		  if (dump_enabled_p ())
383 		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
384 				     "READ_WRITE dependence in interleaving."
385 				     "\n");
386 		  return true;
387 		}
388 	    }
389 
390 	  continue;
391 	}
392 
393       if (dist > 0 && DDR_REVERSED_P (ddr))
394 	{
395 	  /* If DDR_REVERSED_P the order of the data-refs in DDR was
396 	     reversed (to make distance vector positive), and the actual
397 	     distance is negative.  */
398 	  if (dump_enabled_p ())
399 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
400 	                     "dependence distance negative.\n");
401 	  /* Record a negative dependence distance to later limit the
402 	     amount of stmt copying / unrolling we can perform.
403 	     Only need to handle read-after-write dependence.  */
404 	  if (DR_IS_READ (drb)
405 	      && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0
406 		  || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist))
407 	    STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist;
408 	  continue;
409 	}
410 
411       if (abs (dist) >= 2
412 	  && abs (dist) < *max_vf)
413 	{
414 	  /* The dependence distance requires reduction of the maximal
415 	     vectorization factor.  */
416 	  *max_vf = abs (dist);
417 	  if (dump_enabled_p ())
418 	    dump_printf_loc (MSG_NOTE, vect_location,
419 	                     "adjusting maximal vectorization factor to %i\n",
420 	                     *max_vf);
421 	}
422 
423       if (abs (dist) >= *max_vf)
424 	{
425 	  /* Dependence distance does not create dependence, as far as
426 	     vectorization is concerned, in this case.  */
427 	  if (dump_enabled_p ())
428 	    dump_printf_loc (MSG_NOTE, vect_location,
429 	                     "dependence distance >= VF.\n");
430 	  continue;
431 	}
432 
433       if (dump_enabled_p ())
434 	{
435 	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
436 	               "not vectorized, possible dependence "
437 	               "between data-refs ");
438 	  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
439 	  dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
440 	  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
441 	  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
442 	}
443 
444       return true;
445     }
446 
447   return false;
448 }
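
/* A concrete illustration of the distance handling above (hypothetical
   loop, not from the sources):

	for (i = 0; i < n; i++)
	  a[i + 2] = a[i] + 1;

   The read a[i] and the write a[i+2] have dependence distance 2, so
   *MAX_VF is clamped to 2; at a vectorization factor of at most 2 the
   value written in one vector iteration is not needed until a later
   vector iteration, and the dependence no longer constrains
   vectorization.  */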
449 
450 /* Function vect_analyze_data_ref_dependences.
451 
452    Examine all the data references in the loop, and make sure there do not
453    exist any data dependences between them.  Set *MAX_VF according to
454    the maximum vectorization factor the data dependences allow.  */
455 
456 bool
457 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo, int *max_vf)
458 {
459   unsigned int i;
460   struct data_dependence_relation *ddr;
461 
462   if (dump_enabled_p ())
463     dump_printf_loc (MSG_NOTE, vect_location,
464                      "=== vect_analyze_data_ref_dependences ===\n");
465 
466   LOOP_VINFO_DDRS (loop_vinfo)
467     .create (LOOP_VINFO_DATAREFS (loop_vinfo).length ()
468 	     * LOOP_VINFO_DATAREFS (loop_vinfo).length ());
469   LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true;
470   if (!compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
471 				&LOOP_VINFO_DDRS (loop_vinfo),
472 				LOOP_VINFO_LOOP_NEST (loop_vinfo), true))
473     return false;
474 
475   FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
476     if (vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf))
477       return false;
478 
479   return true;
480 }
481 
482 
483 /* Function vect_slp_analyze_data_ref_dependence.
484 
485    Return TRUE if there (might) exist a dependence between a memory-reference
486    DRA and a memory-reference DRB.  Unlike the loop variant there is no
487    *MAX_VF to adjust and no run-time alias versioning to fall back on, so
488    any possible dependence makes the SLP instance unvectorizable.  */
489 
490 static bool
491 vect_slp_analyze_data_ref_dependence (struct data_dependence_relation *ddr)
492 {
493   struct data_reference *dra = DDR_A (ddr);
494   struct data_reference *drb = DDR_B (ddr);
495 
496   /* We need to check dependences of statements marked as unvectorizable
497      as well; they can still prohibit vectorization.  */
498 
499   /* Independent data accesses.  */
500   if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
501     return false;
502 
503   if (dra == drb)
504     return false;
505 
506   /* Read-read is OK.  */
507   if (DR_IS_READ (dra) && DR_IS_READ (drb))
508     return false;
509 
510   /* If dra and drb are part of the same interleaving chain consider
511      them independent.  */
512   if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (DR_STMT (dra)))
513       && (GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dra)))
514 	  == GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (drb)))))
515     return false;
516 
517   /* Unknown data dependence.  */
518   if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
519     {
520       if  (dump_enabled_p ())
521 	{
522 	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
523 			   "can't determine dependence between ");
524 	  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
525 	  dump_printf (MSG_MISSED_OPTIMIZATION,  " and ");
526 	  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
527 	  dump_printf (MSG_MISSED_OPTIMIZATION,  "\n");
528 	}
529     }
530   else if (dump_enabled_p ())
531     {
532       dump_printf_loc (MSG_NOTE, vect_location,
533 		       "determined dependence between ");
534       dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
535       dump_printf (MSG_NOTE, " and ");
536       dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
537       dump_printf (MSG_NOTE,  "\n");
538     }
539 
540   return true;
541 }
542 
543 
544 /* Analyze dependences involved in the transform of SLP NODE.  STORES
545    contains the vector of scalar stores of this instance if we are
546    disambiguating the loads.  */
547 
548 static bool
549 vect_slp_analyze_node_dependences (slp_instance instance, slp_tree node,
550 				   vec<gimple *> stores, gimple *last_store)
551 {
552   /* This walks over all stmts involved in the SLP load/store done
553      in NODE verifying we can sink them up to the last stmt in the
554      group.  */
555   gimple *last_access = vect_find_last_scalar_stmt_in_slp (node);
556   for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
557     {
558       gimple *access = SLP_TREE_SCALAR_STMTS (node)[k];
559       if (access == last_access)
560 	continue;
561       data_reference *dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (access));
562       for (gimple_stmt_iterator gsi = gsi_for_stmt (access);
563 	   gsi_stmt (gsi) != last_access; gsi_next (&gsi))
564 	{
565 	  gimple *stmt = gsi_stmt (gsi);
566 	  if (! gimple_vuse (stmt)
567 	      || (DR_IS_READ (dr_a) && ! gimple_vdef (stmt)))
568 	    continue;
569 
570 	  /* If we couldn't record a (single) data reference for this
571 	     stmt we have to give up.  */
572 	  /* ???  Here and below if dependence analysis fails we can resort
573 	     to the alias oracle which can handle more kinds of stmts.  */
574 	  data_reference *dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt));
575 	  if (!dr_b)
576 	    return false;
577 
578 	  /* If we run into a store of this same instance (we've just
579 	     marked those) then delay dependence checking until we run
580 	     into the last store because this is where it will have
581 	     been sunk to (and we verify if we can do that as well).  */
582 	  if (gimple_visited_p (stmt))
583 	    {
584 	      if (stmt != last_store)
585 		continue;
586 	      unsigned i;
587 	      gimple *store;
588 	      FOR_EACH_VEC_ELT (stores, i, store)
589 		{
590 		  data_reference *store_dr
591 		    = STMT_VINFO_DATA_REF (vinfo_for_stmt (store));
592 		  ddr_p ddr = initialize_data_dependence_relation
593 				(dr_a, store_dr, vNULL);
594 		  if (vect_slp_analyze_data_ref_dependence (ddr))
595 		    {
596 		      free_dependence_relation (ddr);
597 		      return false;
598 		    }
599 		  free_dependence_relation (ddr);
600 		}
601 	    }
602 
603 	  ddr_p ddr = initialize_data_dependence_relation (dr_a, dr_b, vNULL);
604 	  if (vect_slp_analyze_data_ref_dependence (ddr))
605 	    {
606 	      free_dependence_relation (ddr);
607 	      return false;
608 	    }
609 	  free_dependence_relation (ddr);
610 	}
611     }
612   return true;
613 }
614 
615 
616 /* Function vect_slp_analyze_instance_dependence.
617 
618    Examine all the data references involved in SLP instance INSTANCE and
619    make sure there do not exist any data dependences that would prevent
620    vectorizing it.  */
621 
622 bool
623 vect_slp_analyze_instance_dependence (slp_instance instance)
624 {
625   if (dump_enabled_p ())
626     dump_printf_loc (MSG_NOTE, vect_location,
627                      "=== vect_slp_analyze_instance_dependence ===\n");
628 
629   /* The stores of this instance are at the root of the SLP tree.  */
630   slp_tree store = SLP_INSTANCE_TREE (instance);
631   if (! STMT_VINFO_DATA_REF (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (store)[0])))
632     store = NULL;
633 
634   /* Verify we can sink stores to the vectorized stmt insert location.  */
635   gimple *last_store = NULL;
636   if (store)
637     {
638       if (! vect_slp_analyze_node_dependences (instance, store, vNULL, NULL))
639 	return false;
640 
641       /* Mark stores in this instance and remember the last one.  */
642       last_store = vect_find_last_scalar_stmt_in_slp (store);
643       for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
644 	gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k], true);
645     }
646 
647   bool res = true;
648 
649   /* Verify we can sink loads to the vectorized stmt insert location,
650      special-casing stores of this instance.  */
651   slp_tree load;
652   unsigned int i;
653   FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load)
654     if (! vect_slp_analyze_node_dependences (instance, load,
655 					     store
656 					     ? SLP_TREE_SCALAR_STMTS (store)
657 					     : vNULL, last_store))
658       {
659 	res = false;
660 	break;
661       }
662 
663   /* Unset the visited flag.  */
664   if (store)
665     for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
666       gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k], false);
667 
668   return res;
669 }
670 
671 /* Function vect_compute_data_ref_alignment
672 
673    Compute the misalignment of the data reference DR.
674 
675    Output:
676    1. If during the misalignment computation it is found that the data reference
677       cannot be vectorized then false is returned.
678    2. DR_MISALIGNMENT (DR) is defined.
679 
680    FOR NOW: No analysis is actually performed. Misalignment is calculated
681    only for trivial cases. TODO.  */
682 
683 bool
684 vect_compute_data_ref_alignment (struct data_reference *dr)
685 {
686   gimple *stmt = DR_STMT (dr);
687   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
688   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
689   struct loop *loop = NULL;
690   tree ref = DR_REF (dr);
691   tree vectype;
692   tree base, base_addr;
693   tree misalign = NULL_TREE;
694   tree aligned_to;
695   tree step;
696   unsigned HOST_WIDE_INT alignment;
697 
698   if (dump_enabled_p ())
699     dump_printf_loc (MSG_NOTE, vect_location,
700                      "vect_compute_data_ref_alignment:\n");
701 
702   if (loop_vinfo)
703     loop = LOOP_VINFO_LOOP (loop_vinfo);
704 
705   /* Initialize misalignment to unknown.  */
706   SET_DR_MISALIGNMENT (dr, -1);
707 
708   if (tree_fits_shwi_p (DR_STEP (dr)))
709     misalign = DR_INIT (dr);
710   aligned_to = DR_ALIGNED_TO (dr);
711   base_addr = DR_BASE_ADDRESS (dr);
712   vectype = STMT_VINFO_VECTYPE (stmt_info);
713 
714   /* In case the dataref is in an inner-loop of the loop that is being
715      vectorized (LOOP), we use the base and misalignment information
716      relative to the outer-loop (LOOP).  This is ok only if the misalignment
717      stays the same throughout the execution of the inner-loop, which is why
718      we have to check that the stride of the dataref in the inner-loop is a
719      multiple of the vector size.  */
720   if (loop && nested_in_vect_loop_p (loop, stmt))
721     {
722       tree step = DR_STEP (dr);
723 
724       if (tree_fits_shwi_p (step)
725 	  && tree_to_shwi (step) % GET_MODE_SIZE (TYPE_MODE (vectype)) == 0)
726         {
727           if (dump_enabled_p ())
728             dump_printf_loc (MSG_NOTE, vect_location,
729                              "inner step divides the vector-size.\n");
730 	  misalign = STMT_VINFO_DR_INIT (stmt_info);
731 	  aligned_to = STMT_VINFO_DR_ALIGNED_TO (stmt_info);
732 	  base_addr = STMT_VINFO_DR_BASE_ADDRESS (stmt_info);
733         }
734       else
735 	{
736 	  if (dump_enabled_p ())
737 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
738 	                     "inner step doesn't divide the vector-size.\n");
739 	  misalign = NULL_TREE;
740 	}
741     }
742 
743   /* Similarly we can only use base and misalignment information relative to
744      an innermost loop if the misalignment stays the same throughout the
745      execution of the loop.  As above, this is the case if the stride times
746      the vectorization factor is a multiple of the vector size.  */
747   else
748     {
749       tree step = DR_STEP (dr);
750       unsigned vf = loop ? LOOP_VINFO_VECT_FACTOR (loop_vinfo) : 1;
751 
752       if (tree_fits_shwi_p (step)
753 	  && ((tree_to_shwi (step) * vf)
754 	      % GET_MODE_SIZE (TYPE_MODE (vectype)) != 0))
755 	{
756 	  if (dump_enabled_p ())
757 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
758 	                     "step doesn't divide the vector-size.\n");
759 	  misalign = NULL_TREE;
760 	}
761     }
762 
763   /* To look at alignment of the base we have to preserve an inner MEM_REF
764      as that carries alignment information of the actual access.  */
765   base = ref;
766   while (handled_component_p (base))
767     base = TREE_OPERAND (base, 0);
768   unsigned int base_alignment = 0;
769   unsigned HOST_WIDE_INT base_bitpos;
770   get_object_alignment_1 (base, &base_alignment, &base_bitpos);
771   /* As data-ref analysis strips the MEM_REF down to its base operand
772      to form DR_BASE_ADDRESS and adds the offset to DR_INIT we have to
773      adjust things to make base_alignment valid as the alignment of
774      DR_BASE_ADDRESS.  */
775   if (TREE_CODE (base) == MEM_REF)
776     {
777       /* Note all this only works if DR_BASE_ADDRESS is the same as
778 	 MEM_REF operand zero, otherwise DR/SCEV analysis might have factored
779 	 in other offsets.  We need to rework DR to compute the alignment
780 	 of DR_BASE_ADDRESS as long as all information is still available.  */
781       if (operand_equal_p (TREE_OPERAND (base, 0), base_addr, 0))
782 	{
783 	  base_bitpos -= mem_ref_offset (base).to_short_addr () * BITS_PER_UNIT;
784 	  base_bitpos &= (base_alignment - 1);
785 	}
786       else
787 	base_bitpos = BITS_PER_UNIT;
788     }
789   if (base_bitpos != 0)
790     base_alignment = base_bitpos & -base_bitpos;
791   /* Also look at the alignment of the base address DR analysis
792      computed.  */
793   unsigned int base_addr_alignment = get_pointer_alignment (base_addr);
794   if (base_addr_alignment > base_alignment)
795     base_alignment = base_addr_alignment;
796 
797   if (base_alignment >= TYPE_ALIGN (TREE_TYPE (vectype)))
798     DR_VECT_AUX (dr)->base_element_aligned = true;
799 
800   alignment = TYPE_ALIGN_UNIT (vectype);
801 
802   if ((compare_tree_int (aligned_to, alignment) < 0)
803       || !misalign)
804     {
805       if (dump_enabled_p ())
806 	{
807 	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
808 	                   "Unknown alignment for access: ");
809 	  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
810 	  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
811 	}
812       return true;
813     }
814 
815   if (base_alignment < TYPE_ALIGN (vectype))
816     {
817       base = base_addr;
818       if (TREE_CODE (base) == ADDR_EXPR)
819 	base = TREE_OPERAND (base, 0);
820       if (!vect_can_force_dr_alignment_p (base, TYPE_ALIGN (vectype)))
821 	{
822 	  if (dump_enabled_p ())
823 	    {
824 	      dump_printf_loc (MSG_NOTE, vect_location,
825 	                       "can't force alignment of ref: ");
826 	      dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
827 	      dump_printf (MSG_NOTE, "\n");
828 	    }
829 	  return true;
830 	}
831 
832       /* Force the alignment of the decl.
833 	 NOTE: This is the only change to the code we make during
834 	 the analysis phase, before deciding to vectorize the loop.  */
835       if (dump_enabled_p ())
836         {
837           dump_printf_loc (MSG_NOTE, vect_location, "force alignment of ");
838           dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
839           dump_printf (MSG_NOTE, "\n");
840         }
841 
842       DR_VECT_AUX (dr)->base_decl = base;
843       DR_VECT_AUX (dr)->base_misaligned = true;
844       DR_VECT_AUX (dr)->base_element_aligned = true;
845     }
846 
847   if (loop && nested_in_vect_loop_p (loop, stmt))
848     step = STMT_VINFO_DR_STEP (stmt_info);
849   else
850     step = DR_STEP (dr);
851   /* If this is a backward running DR then the first access in the larger
852      vectype actually is N-1 elements before the address in the DR.
853      Adjust misalign accordingly.  */
854   if (tree_int_cst_sgn (step) < 0)
855     {
856       tree offset = ssize_int (TYPE_VECTOR_SUBPARTS (vectype) - 1);
857       /* DR_STEP(dr) is the same as -TYPE_SIZE of the scalar type,
858 	 otherwise we wouldn't be here.  */
859       offset = fold_build2 (MULT_EXPR, ssizetype, offset, step);
860       /* PLUS because STEP was negative.  */
861       misalign = size_binop (PLUS_EXPR, misalign, offset);
862     }
863 
864   SET_DR_MISALIGNMENT (dr,
865 		       wi::mod_floor (misalign, alignment, SIGNED).to_uhwi ());
866 
867   if (dump_enabled_p ())
868     {
869       dump_printf_loc (MSG_NOTE, vect_location,
870                        "misalign = %d bytes of ref ", DR_MISALIGNMENT (dr));
871       dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
872       dump_printf (MSG_NOTE, "\n");
873     }
874 
875   return true;
876 }
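
/* Worked example for the computation above (illustrative numbers only):
   assume a 16-byte vector type and a data-ref whose DR_ALIGNED_TO shows
   the base to be aligned to at least 16 bytes, with DR_INIT == 20.  The
   recorded misalignment is then 20 mod 16 == 4 bytes.  For a backward
   running access over 4-byte elements the offset of the remaining
   TYPE_VECTOR_SUBPARTS - 1 == 3 elements, 3 * -4 == -12 bytes, is added
   first, giving (20 - 12) mod 16 == 8.  */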
877 
878 
879 /* Function vect_update_misalignment_for_peel
880 
881    DR - the data reference whose misalignment is to be adjusted.
882    DR_PEEL - the data reference whose misalignment is being made
883              zero in the vector loop by the peel.
884    NPEEL - the number of iterations in the peel loop if the misalignment
885            of DR_PEEL is known at compile time.  */
886 
887 static void
888 vect_update_misalignment_for_peel (struct data_reference *dr,
889                                    struct data_reference *dr_peel, int npeel)
890 {
891   unsigned int i;
892   vec<dr_p> same_align_drs;
893   struct data_reference *current_dr;
894   int dr_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr))));
895   int dr_peel_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr_peel))));
896   stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
897   stmt_vec_info peel_stmt_info = vinfo_for_stmt (DR_STMT (dr_peel));
898 
899   /* For interleaved data accesses the step in the loop must be multiplied by
900      the size of the interleaving group.  */
901   if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
902     dr_size *= GROUP_SIZE (vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info)));
903   if (STMT_VINFO_GROUPED_ACCESS (peel_stmt_info))
904     dr_peel_size *= GROUP_SIZE (peel_stmt_info);
905 
906   /* It can be assumed that the data refs with the same alignment as dr_peel
907      are aligned in the vector loop.  */
908   same_align_drs
909     = STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (DR_STMT (dr_peel)));
910   FOR_EACH_VEC_ELT (same_align_drs, i, current_dr)
911     {
912       if (current_dr != dr)
913         continue;
914       gcc_assert (DR_MISALIGNMENT (dr) / dr_size ==
915                   DR_MISALIGNMENT (dr_peel) / dr_peel_size);
916       SET_DR_MISALIGNMENT (dr, 0);
917       return;
918     }
919 
920   if (known_alignment_for_access_p (dr)
921       && known_alignment_for_access_p (dr_peel))
922     {
923       bool negative = tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0;
924       int misal = DR_MISALIGNMENT (dr);
925       tree vectype = STMT_VINFO_VECTYPE (stmt_info);
926       misal += negative ? -npeel * dr_size : npeel * dr_size;
927       misal &= (TYPE_ALIGN (vectype) / BITS_PER_UNIT) - 1;
928       SET_DR_MISALIGNMENT (dr, misal);
929       return;
930     }
931 
932   if (dump_enabled_p ())
933     dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment to -1.\n");
934   SET_DR_MISALIGNMENT (dr, -1);
935 }
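
/* Numeric sketch of the adjustment above (made-up values): with 4-byte
   elements, a 16-byte vector type and NPEEL == 2, a data-ref that was
   misaligned by 12 bytes before peeling becomes misaligned by
   (12 + 2 * 4) & 15 == 4 bytes afterwards; a data-ref in the same-align
   list of DR_PEEL is simply reset to 0.  */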
936 
937 
938 /* Function verify_data_ref_alignment
939 
940    Return TRUE if DR can be handled with respect to alignment.  */
941 
942 static bool
943 verify_data_ref_alignment (data_reference_p dr)
944 {
945   enum dr_alignment_support supportable_dr_alignment
946     = vect_supportable_dr_alignment (dr, false);
947   if (!supportable_dr_alignment)
948     {
949       if (dump_enabled_p ())
950 	{
951 	  if (DR_IS_READ (dr))
952 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
953 			     "not vectorized: unsupported unaligned load.");
954 	  else
955 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
956 			     "not vectorized: unsupported unaligned "
957 			     "store.");
958 
959 	  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
960 			     DR_REF (dr));
961 	  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
962 	}
963       return false;
964     }
965 
966   if (supportable_dr_alignment != dr_aligned && dump_enabled_p ())
967     dump_printf_loc (MSG_NOTE, vect_location,
968 		     "Vectorizing an unaligned access.\n");
969 
970   return true;
971 }
972 
973 /* Function vect_verify_datarefs_alignment
974 
975    Return TRUE if all data references in the loop can be
976    handled with respect to alignment.  */
977 
978 bool
979 vect_verify_datarefs_alignment (loop_vec_info vinfo)
980 {
981   vec<data_reference_p> datarefs = vinfo->datarefs;
982   struct data_reference *dr;
983   unsigned int i;
984 
985   FOR_EACH_VEC_ELT (datarefs, i, dr)
986     {
987       gimple *stmt = DR_STMT (dr);
988       stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
989 
990       if (!STMT_VINFO_RELEVANT_P (stmt_info))
991 	continue;
992 
993       /* For interleaving, only the alignment of the first access matters.   */
994       if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
995 	  && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
996 	continue;
997 
998       /* Strided accesses perform only component accesses, alignment is
999 	 irrelevant for them.  */
1000       if (STMT_VINFO_STRIDED_P (stmt_info)
1001 	  && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1002 	continue;
1003 
1004       if (! verify_data_ref_alignment (dr))
1005 	return false;
1006     }
1007 
1008   return true;
1009 }
1010 
1011 /* Given a memory reference EXP, return whether its alignment is less
1012    than its size.  */
1013 
1014 static bool
1015 not_size_aligned (tree exp)
1016 {
1017   if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp))))
1018     return true;
1019 
1020   return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp)))
1021 	  > get_object_alignment (exp));
1022 }
1023 
1024 /* Function vector_alignment_reachable_p
1025 
1026    Return true if vector alignment for DR is reachable by peeling
1027    a few loop iterations.  Return false otherwise.  */
1028 
1029 static bool
1030 vector_alignment_reachable_p (struct data_reference *dr)
1031 {
1032   gimple *stmt = DR_STMT (dr);
1033   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1034   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1035 
1036   if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1037     {
1038       /* For interleaved access we peel only if the number of iterations in
1039 	 the prolog loop ({VF - misalignment}) is a multiple of the
1040 	 number of interleaved accesses.  */
1041       int elem_size, mis_in_elements;
1042       int nelements = TYPE_VECTOR_SUBPARTS (vectype);
1043 
1044       /* FORNOW: handle only known alignment.  */
1045       if (!known_alignment_for_access_p (dr))
1046 	return false;
1047 
1048       elem_size = GET_MODE_SIZE (TYPE_MODE (vectype)) / nelements;
1049       mis_in_elements = DR_MISALIGNMENT (dr) / elem_size;
1050 
1051       if ((nelements - mis_in_elements) % GROUP_SIZE (stmt_info))
1052 	return false;
1053     }
1054 
1055   /* If misalignment is known at compile time then allow peeling
1056      only if natural alignment is reachable through peeling.  */
1057   if (known_alignment_for_access_p (dr) && !aligned_access_p (dr))
1058     {
1059       HOST_WIDE_INT elmsize =
1060 		int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1061       if (dump_enabled_p ())
1062 	{
1063 	  dump_printf_loc (MSG_NOTE, vect_location,
1064 	                   "data size =" HOST_WIDE_INT_PRINT_DEC, elmsize);
1065 	  dump_printf (MSG_NOTE,
1066 	               ". misalignment = %d.\n", DR_MISALIGNMENT (dr));
1067 	}
1068       if (DR_MISALIGNMENT (dr) % elmsize)
1069 	{
1070 	  if (dump_enabled_p ())
1071 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1072 	                     "data size does not divide the misalignment.\n");
1073 	  return false;
1074 	}
1075     }
1076 
1077   if (!known_alignment_for_access_p (dr))
1078     {
1079       tree type = TREE_TYPE (DR_REF (dr));
1080       bool is_packed = not_size_aligned (DR_REF (dr));
1081       if (dump_enabled_p ())
1082 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1083 	                 "Unknown misalignment, %snaturally aligned\n",
1084 			 is_packed ? "not " : "");
1085       return targetm.vectorize.vector_alignment_reachable (type, is_packed);
1086     }
1087 
1088   return true;
1089 }
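
/* Example of the divisibility test above (illustrative): with 4-byte
   elements, a known misalignment of 8 bytes can be fixed by peeling
   (8 is a multiple of the element size, so some number of scalar
   iterations lands on an aligned address), whereas a misalignment of
   2 bytes cannot, since no whole number of 4-byte steps will ever
   reach a vector-aligned address.  */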
1090 
1091 
1092 /* Calculate the cost of the memory access represented by DR.  */
1093 
1094 static void
1095 vect_get_data_access_cost (struct data_reference *dr,
1096                            unsigned int *inside_cost,
1097                            unsigned int *outside_cost,
1098 			   stmt_vector_for_cost *body_cost_vec)
1099 {
1100   gimple *stmt = DR_STMT (dr);
1101   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1102   int nunits = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
1103   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1104   int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1105   int ncopies = vf / nunits;
1106 
1107   if (DR_IS_READ (dr))
1108     vect_get_load_cost (dr, ncopies, true, inside_cost, outside_cost,
1109 			NULL, body_cost_vec, false);
1110   else
1111     vect_get_store_cost (dr, ncopies, inside_cost, body_cost_vec);
1112 
1113   if (dump_enabled_p ())
1114     dump_printf_loc (MSG_NOTE, vect_location,
1115                      "vect_get_data_access_cost: inside_cost = %d, "
1116                      "outside_cost = %d.\n", *inside_cost, *outside_cost);
1117 }
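
/* E.g. (illustrative): with a vectorization factor of 8 and a vector type
   holding 4 elements, ncopies == 2, so the cost of this access is charged
   for two vector loads or stores per vectorized iteration.  */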
1118 
1119 
1120 typedef struct _vect_peel_info
1121 {
1122   int npeel;
1123   struct data_reference *dr;
1124   unsigned int count;
1125 } *vect_peel_info;
1126 
1127 typedef struct _vect_peel_extended_info
1128 {
1129   struct _vect_peel_info peel_info;
1130   unsigned int inside_cost;
1131   unsigned int outside_cost;
1132   stmt_vector_for_cost body_cost_vec;
1133 } *vect_peel_extended_info;
1134 
1135 
1136 /* Peeling hashtable helpers.  */
1137 
1138 struct peel_info_hasher : free_ptr_hash <_vect_peel_info>
1139 {
1140   static inline hashval_t hash (const _vect_peel_info *);
1141   static inline bool equal (const _vect_peel_info *, const _vect_peel_info *);
1142 };
1143 
1144 inline hashval_t
1145 peel_info_hasher::hash (const _vect_peel_info *peel_info)
1146 {
1147   return (hashval_t) peel_info->npeel;
1148 }
1149 
1150 inline bool
1151 peel_info_hasher::equal (const _vect_peel_info *a, const _vect_peel_info *b)
1152 {
1153   return (a->npeel == b->npeel);
1154 }
1155 
1156 
1157 /* Insert DR into peeling hash table with NPEEL as key.  */
1158 
1159 static void
1160 vect_peeling_hash_insert (hash_table<peel_info_hasher> *peeling_htab,
1161 			  loop_vec_info loop_vinfo, struct data_reference *dr,
1162                           int npeel)
1163 {
1164   struct _vect_peel_info elem, *slot;
1165   _vect_peel_info **new_slot;
1166   bool supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1167 
1168   elem.npeel = npeel;
1169   slot = peeling_htab->find (&elem);
1170   if (slot)
1171     slot->count++;
1172   else
1173     {
1174       slot = XNEW (struct _vect_peel_info);
1175       slot->npeel = npeel;
1176       slot->dr = dr;
1177       slot->count = 1;
1178       new_slot = peeling_htab->find_slot (slot, INSERT);
1179       *new_slot = slot;
1180     }
1181 
1182   if (!supportable_dr_alignment
1183       && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1184     slot->count += VECT_MAX_COST;
1185 }
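
/* For instance (hypothetical counts): if three data-refs become aligned by
   peeling 2 scalar iterations and one becomes aligned by peeling 5, the
   table ends up with entries {npeel 2, count 3} and {npeel 5, count 1};
   the traversals below then pick either the most frequent entry or the
   cheapest one, depending on the cost model.  */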
1186 
1187 
1188 /* Traverse the peeling hash table to find the peeling option that aligns
1189    the maximum number of data accesses.  */
1190 
1191 int
1192 vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
1193 				     _vect_peel_extended_info *max)
1194 {
1195   vect_peel_info elem = *slot;
1196 
1197   if (elem->count > max->peel_info.count
1198       || (elem->count == max->peel_info.count
1199           && max->peel_info.npeel > elem->npeel))
1200     {
1201       max->peel_info.npeel = elem->npeel;
1202       max->peel_info.count = elem->count;
1203       max->peel_info.dr = elem->dr;
1204     }
1205 
1206   return 1;
1207 }
1208 
1209 
1210 /* Traverse peeling hash table and calculate cost for each peeling option.
1211    Find the one with the lowest cost.  */
1212 
1213 int
1214 vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
1215 				   _vect_peel_extended_info *min)
1216 {
1217   vect_peel_info elem = *slot;
1218   int save_misalignment, dummy;
1219   unsigned int inside_cost = 0, outside_cost = 0, i;
1220   gimple *stmt = DR_STMT (elem->dr);
1221   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1222   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1223   vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1224   struct data_reference *dr;
1225   stmt_vector_for_cost prologue_cost_vec, body_cost_vec, epilogue_cost_vec;
1226 
1227   prologue_cost_vec.create (2);
1228   body_cost_vec.create (2);
1229   epilogue_cost_vec.create (2);
1230 
1231   FOR_EACH_VEC_ELT (datarefs, i, dr)
1232     {
1233       stmt = DR_STMT (dr);
1234       stmt_info = vinfo_for_stmt (stmt);
1235       /* For interleaving, only the alignment of the first access
1236          matters.  */
1237       if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1238           && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1239         continue;
1240 
1241       /* Strided accesses perform only component accesses, alignment is
1242          irrelevant for them.  */
1243       if (STMT_VINFO_STRIDED_P (stmt_info)
1244 	  && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1245 	continue;
1246 
1247       save_misalignment = DR_MISALIGNMENT (dr);
1248       vect_update_misalignment_for_peel (dr, elem->dr, elem->npeel);
1249       vect_get_data_access_cost (dr, &inside_cost, &outside_cost,
1250 				 &body_cost_vec);
1251       SET_DR_MISALIGNMENT (dr, save_misalignment);
1252     }
1253 
1254   outside_cost += vect_get_known_peeling_cost
1255     (loop_vinfo, elem->npeel, &dummy,
1256      &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1257      &prologue_cost_vec, &epilogue_cost_vec);
1258 
1259   /* Prologue and epilogue costs are added to the target model later.
1260      These costs depend only on the scalar iteration cost, the
1261      number of peeling iterations finally chosen, and the number of
1262      misaligned statements.  So discard the information found here.  */
1263   prologue_cost_vec.release ();
1264   epilogue_cost_vec.release ();
1265 
1266   if (inside_cost < min->inside_cost
1267       || (inside_cost == min->inside_cost && outside_cost < min->outside_cost))
1268     {
1269       min->inside_cost = inside_cost;
1270       min->outside_cost = outside_cost;
1271       min->body_cost_vec.release ();
1272       min->body_cost_vec = body_cost_vec;
1273       min->peel_info.dr = elem->dr;
1274       min->peel_info.npeel = elem->npeel;
1275     }
1276   else
1277     body_cost_vec.release ();
1278 
1279   return 1;
1280 }
1281 
1282 
1283 /* Choose best peeling option by traversing peeling hash table and either
1284    choosing an option with the lowest cost (if cost model is enabled) or the
1285    option that aligns as many accesses as possible.  */
1286 
1287 static struct data_reference *
1288 vect_peeling_hash_choose_best_peeling (hash_table<peel_info_hasher> *peeling_htab,
1289 				       loop_vec_info loop_vinfo,
1290                                        unsigned int *npeel,
1291 				       stmt_vector_for_cost *body_cost_vec)
1292 {
1293    struct _vect_peel_extended_info res;
1294 
1295    res.peel_info.dr = NULL;
1296    res.body_cost_vec = stmt_vector_for_cost ();
1297 
1298    if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1299      {
1300        res.inside_cost = INT_MAX;
1301        res.outside_cost = INT_MAX;
1302        peeling_htab->traverse <_vect_peel_extended_info *,
1303 	   		       vect_peeling_hash_get_lowest_cost> (&res);
1304      }
1305    else
1306      {
1307        res.peel_info.count = 0;
1308        peeling_htab->traverse <_vect_peel_extended_info *,
1309 	   		       vect_peeling_hash_get_most_frequent> (&res);
1310      }
1311 
1312    *npeel = res.peel_info.npeel;
1313    *body_cost_vec = res.body_cost_vec;
1314    return res.peel_info.dr;
1315 }
1316 
1317 
1318 /* Function vect_enhance_data_refs_alignment
1319 
1320    This pass will use loop versioning and loop peeling in order to enhance
1321    the alignment of data references in the loop.
1322 
1323    FOR NOW: we assume that whatever versioning/peeling takes place, only the
1324    original loop is to be vectorized.  Any other loops that are created by
1325    the transformations performed in this pass are not supposed to be
1326    vectorized.  This restriction will be relaxed.
1327 
1328    This pass will require a cost model to guide it whether to apply peeling
1329    or versioning or a combination of the two.  For example, the scheme that
1330    Intel uses when given a loop with several memory accesses, is as follows:
1331    choose one memory access ('p') whose alignment you want to force by doing
1332    peeling.  Then, either (1) generate a loop in which 'p' is aligned and all
1333    other accesses are not necessarily aligned, or (2) use loop versioning to
1334    generate one loop in which all accesses are aligned, and another loop in
1335    which only 'p' is necessarily aligned.
1336 
1337    ("Automatic Intra-Register Vectorization for the Intel Architecture",
1338    Aart J.C. Bik, Milind Girkar, Paul M. Grey and Ximmin Tian, International
1339    Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
1340 
1341    Devising a cost model is the most critical aspect of this work.  It will
1342    guide us on which access to peel for, whether to use loop versioning, how
1343    many versions to create, etc.  The cost model will probably consist of
1344    generic considerations as well as target specific considerations (on
1345    powerpc for example, misaligned stores are more painful than misaligned
1346    loads).
1347 
1348    Here are the general steps involved in alignment enhancements:
1349 
1350      -- original loop, before alignment analysis:
1351 	for (i=0; i<N; i++){
1352 	  x = q[i];			# DR_MISALIGNMENT(q) = unknown
1353 	  p[i] = y;			# DR_MISALIGNMENT(p) = unknown
1354 	}
1355 
1356      -- After vect_compute_data_refs_alignment:
1357 	for (i=0; i<N; i++){
1358 	  x = q[i];			# DR_MISALIGNMENT(q) = 3
1359 	  p[i] = y;			# DR_MISALIGNMENT(p) = unknown
1360 	}
1361 
1362      -- Possibility 1: we do loop versioning:
1363      if (p is aligned) {
1364 	for (i=0; i<N; i++){	# loop 1A
1365 	  x = q[i];			# DR_MISALIGNMENT(q) = 3
1366 	  p[i] = y;			# DR_MISALIGNMENT(p) = 0
1367 	}
1368      }
1369      else {
1370 	for (i=0; i<N; i++){	# loop 1B
1371 	  x = q[i];			# DR_MISALIGNMENT(q) = 3
1372 	  p[i] = y;			# DR_MISALIGNMENT(p) = unaligned
1373 	}
1374      }
1375 
1376      -- Possibility 2: we do loop peeling:
1377      for (i = 0; i < 3; i++){	# (scalar loop, not to be vectorized).
1378 	x = q[i];
1379 	p[i] = y;
1380      }
1381      for (i = 3; i < N; i++){	# loop 2A
1382 	x = q[i];			# DR_MISALIGNMENT(q) = 0
1383 	p[i] = y;			# DR_MISALIGNMENT(p) = unknown
1384      }
1385 
1386      -- Possibility 3: combination of loop peeling and versioning:
1387      for (i = 0; i < 3; i++){	# (scalar loop, not to be vectorized).
1388 	x = q[i];
1389 	p[i] = y;
1390      }
1391      if (p is aligned) {
1392 	for (i = 3; i<N; i++){	# loop 3A
1393 	  x = q[i];			# DR_MISALIGNMENT(q) = 0
1394 	  p[i] = y;			# DR_MISALIGNMENT(p) = 0
1395 	}
1396      }
1397      else {
1398 	for (i = 3; i<N; i++){	# loop 3B
1399 	  x = q[i];			# DR_MISALIGNMENT(q) = 0
1400 	  p[i] = y;			# DR_MISALIGNMENT(p) = unaligned
1401 	}
1402      }
1403 
1404      These loops are later passed to loop_transform to be vectorized.  The
1405      vectorizer will use the alignment information to guide the transformation
1406      (whether to generate regular loads/stores, or with special handling for
1407      misalignment).  */
1408 
1409 bool
1410 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
1411 {
1412   vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1413   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1414   enum dr_alignment_support supportable_dr_alignment;
1415   struct data_reference *dr0 = NULL, *first_store = NULL;
1416   struct data_reference *dr;
1417   unsigned int i, j;
1418   bool do_peeling = false;
1419   bool do_versioning = false;
1420   bool stat;
1421   gimple *stmt;
1422   stmt_vec_info stmt_info;
1423   unsigned int npeel = 0;
1424   bool all_misalignments_unknown = true;
1425   unsigned int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1426   unsigned possible_npeel_number = 1;
1427   tree vectype;
1428   unsigned int nelements, mis, same_align_drs_max = 0;
1429   stmt_vector_for_cost body_cost_vec = stmt_vector_for_cost ();
1430   hash_table<peel_info_hasher> peeling_htab (1);
1431 
1432   if (dump_enabled_p ())
1433     dump_printf_loc (MSG_NOTE, vect_location,
1434                      "=== vect_enhance_data_refs_alignment ===\n");
1435 
1436   /* Reset data so we can safely be called multiple times.  */
1437   LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
1438   LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = 0;
1439 
1440   /* While cost model enhancements are expected in the future, the high level
1441      view of the code at this time is as follows:
1442 
1443      A) If there is a misaligned access then see if peeling to align
1444         this access can make all data references satisfy
1445         vect_supportable_dr_alignment.  If so, update data structures
1446         as needed and return true.
1447 
1448      B) If peeling wasn't possible and there is a data reference with an
1449         unknown misalignment that does not satisfy vect_supportable_dr_alignment
1450         then see if loop versioning checks can be used to make all data
1451         references satisfy vect_supportable_dr_alignment.  If so, update
1452         data structures as needed and return true.
1453 
1454      C) If neither peeling nor versioning were successful then return false if
1455         any data reference does not satisfy vect_supportable_dr_alignment.
1456 
1457      D) Return true (all data references satisfy vect_supportable_dr_alignment).
1458 
1459      Note, Possibility 3 above (which is peeling and versioning together) is not
1460      being done at this time.  */
1461 
1462   /* (1) Peeling to force alignment.  */
1463 
1464   /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
1465      Considerations:
1466      + How many accesses will become aligned due to the peeling
1467      - How many accesses will become unaligned due to the peeling,
1468        and the cost of misaligned accesses.
1469      - The cost of peeling (the extra runtime checks, the increase
1470        in code size).  */
1471 
1472   FOR_EACH_VEC_ELT (datarefs, i, dr)
1473     {
1474       stmt = DR_STMT (dr);
1475       stmt_info = vinfo_for_stmt (stmt);
1476 
1477       if (!STMT_VINFO_RELEVANT_P (stmt_info))
1478 	continue;
1479 
1480       /* For interleaving, only the alignment of the first access
1481          matters.  */
1482       if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1483           && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1484         continue;
1485 
1486       /* For invariant accesses there is nothing to enhance.  */
1487       if (integer_zerop (DR_STEP (dr)))
1488 	continue;
1489 
1490       /* Strided accesses perform only component accesses, alignment is
1491 	 irrelevant for them.  */
1492       if (STMT_VINFO_STRIDED_P (stmt_info)
1493 	  && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1494 	continue;
1495 
1496       supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1497       do_peeling = vector_alignment_reachable_p (dr);
1498       if (do_peeling)
1499         {
1500           if (known_alignment_for_access_p (dr))
1501             {
1502               unsigned int npeel_tmp;
1503 	      bool negative = tree_int_cst_compare (DR_STEP (dr),
1504 						    size_zero_node) < 0;
1505 
1506               /* Save info about DR in the hash table.  */
1507               vectype = STMT_VINFO_VECTYPE (stmt_info);
1508               nelements = TYPE_VECTOR_SUBPARTS (vectype);
1509               mis = DR_MISALIGNMENT (dr) / GET_MODE_SIZE (TYPE_MODE (
1510                                                 TREE_TYPE (DR_REF (dr))));
1511               npeel_tmp = (negative
1512 			   ? (mis - nelements) : (nelements - mis))
1513 		  & (nelements - 1);
1514 
1515               /* For multiple types, it is possible that the bigger type access
1516                  will have more than one peeling option.  E.g., a loop with two
1517                  types: one of size (vector size / 4), and the other one of
1518                  size (vector size / 8).  The vectorization factor will be 8.
1519                  If both accesses are misaligned by 3, the first one needs one
1520                  scalar iteration to be aligned, and the second one needs 5.
1521                  But the first one will also be aligned by peeling 5 scalar
1522                  iterations, and in that case both accesses will be aligned.
1523                  Hence, besides the immediate peeling amount, we also want to
1524                  try adding a full vector's worth of iterations, as long as we
1525                  do not exceed the vectorization factor.
1526                  We do this automatically for the cost model, since we
1527                  calculate the cost for every peeling option.  */
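              /* With the numbers from that example (vf == 8): the access
                 with four elements per vector records the peeling options
                 (4 - 3) & 3 == 1 and 1 + 4 == 5, while the access with
                 eight elements per vector records only (8 - 3) & 7 == 5,
                 so the cost model can pick 5 and align both.  */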
1528               if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1529 		{
1530 		  if (STMT_SLP_TYPE (stmt_info))
1531 		    possible_npeel_number
1532 		      = (vf * GROUP_SIZE (stmt_info)) / nelements;
1533 		  else
1534 		    possible_npeel_number = vf / nelements;
1535 		}
1536 
1537               /* Handle the aligned case. We may decide to align some other
1538                  access, making DR unaligned.  */
1539               if (DR_MISALIGNMENT (dr) == 0)
1540                 {
1541                   npeel_tmp = 0;
1542                   if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1543                     possible_npeel_number++;
1544                 }
1545 
1546               for (j = 0; j < possible_npeel_number; j++)
1547                 {
1548                   vect_peeling_hash_insert (&peeling_htab, loop_vinfo,
1549 					    dr, npeel_tmp);
1550                   npeel_tmp += nelements;
1551                 }
1552 
1553               all_misalignments_unknown = false;
1554               /* The data-ref that was chosen for the case where all the
1555                  misalignments are unknown is not relevant anymore, since we
1556                  have a data-ref with known alignment.  */
1557               dr0 = NULL;
1558             }
1559           else
1560             {
1561               /* If we don't know any misalignment values, we prefer
1562                  peeling for the data-ref that has the maximum number of
1563                  data-refs with the same alignment, unless the target prefers
1564                  to align stores over loads.  */
1565               if (all_misalignments_unknown)
1566                 {
1567 		  unsigned same_align_drs
1568 		    = STMT_VINFO_SAME_ALIGN_REFS (stmt_info).length ();
1569                   if (!dr0
1570 		      || same_align_drs_max < same_align_drs)
1571                     {
1572                       same_align_drs_max = same_align_drs;
1573                       dr0 = dr;
1574                     }
1575 		  /* For data-refs with the same number of related
1576 		     accesses prefer the one where the misalign
1577 		     computation will be invariant in the outermost loop.  */
1578 		  else if (same_align_drs_max == same_align_drs)
1579 		    {
1580 		      struct loop *ivloop0, *ivloop;
1581 		      ivloop0 = outermost_invariant_loop_for_expr
1582 			  (loop, DR_BASE_ADDRESS (dr0));
1583 		      ivloop = outermost_invariant_loop_for_expr
1584 			  (loop, DR_BASE_ADDRESS (dr));
1585 		      if ((ivloop && !ivloop0)
1586 			  || (ivloop && ivloop0
1587 			      && flow_loop_nested_p (ivloop, ivloop0)))
1588 			dr0 = dr;
1589 		    }
1590 
1591                   if (!first_store && DR_IS_WRITE (dr))
1592                     first_store = dr;
1593                 }
1594 
1595               /* If there are both known and unknown misaligned accesses in the
1596                  loop, we choose the peeling amount according to the known
1597                  accesses.  */
1598               if (!supportable_dr_alignment)
1599                 {
1600                   dr0 = dr;
1601                   if (!first_store && DR_IS_WRITE (dr))
1602                     first_store = dr;
1603                 }
1604             }
1605         }
1606       else
1607         {
1608           if (!aligned_access_p (dr))
1609             {
1610               if (dump_enabled_p ())
1611                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1612                                  "vector alignment may not be reachable\n");
1613               break;
1614             }
1615         }
1616     }
1617 
1618   /* Check if we can possibly peel the loop.  */
1619   if (!vect_can_advance_ivs_p (loop_vinfo)
1620       || !slpeel_can_duplicate_loop_p (loop, single_exit (loop))
1621       || loop->inner)
1622     do_peeling = false;
1623 
1624   if (do_peeling
1625       && all_misalignments_unknown
1626       && vect_supportable_dr_alignment (dr0, false))
1627     {
1628       /* Check whether the target prefers aligning stores over loads, i.e.,
1629          whether misaligned stores are more expensive than misaligned loads
1630          (taking drs with the same alignment into account).  */
1631       if (first_store && DR_IS_READ (dr0))
1632         {
1633           unsigned int load_inside_cost = 0, load_outside_cost = 0;
1634           unsigned int store_inside_cost = 0, store_outside_cost = 0;
1635           unsigned int load_inside_penalty = 0, load_outside_penalty = 0;
1636           unsigned int store_inside_penalty = 0, store_outside_penalty = 0;
1637 	  stmt_vector_for_cost dummy;
1638 	  dummy.create (2);
1639 
1640           vect_get_data_access_cost (dr0, &load_inside_cost, &load_outside_cost,
1641 				     &dummy);
1642           vect_get_data_access_cost (first_store, &store_inside_cost,
1643 				     &store_outside_cost, &dummy);
1644 
1645 	  dummy.release ();
1646 
1647           /* Calculate the penalty for leaving FIRST_STORE unaligned (by
1648              aligning the load DR0).  */
1649           load_inside_penalty = store_inside_cost;
1650           load_outside_penalty = store_outside_cost;
1651           for (i = 0;
1652 	       STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (
1653 			  DR_STMT (first_store))).iterate (i, &dr);
1654                i++)
1655             if (DR_IS_READ (dr))
1656               {
1657                 load_inside_penalty += load_inside_cost;
1658                 load_outside_penalty += load_outside_cost;
1659               }
1660             else
1661               {
1662                 load_inside_penalty += store_inside_cost;
1663                 load_outside_penalty += store_outside_cost;
1664               }
1665 
1666           /* Calculate the penalty for leaving DR0 unaligned (by
1667              aligning the FIRST_STORE).  */
1668           store_inside_penalty = load_inside_cost;
1669           store_outside_penalty = load_outside_cost;
1670           for (i = 0;
1671 	       STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (
1672 		      DR_STMT (dr0))).iterate (i, &dr);
1673                i++)
1674             if (DR_IS_READ (dr))
1675               {
1676                 store_inside_penalty += load_inside_cost;
1677                 store_outside_penalty += load_outside_cost;
1678               }
1679             else
1680               {
1681                 store_inside_penalty += store_inside_cost;
1682                 store_outside_penalty += store_outside_cost;
1683               }
1684 
1685           if (load_inside_penalty > store_inside_penalty
1686               || (load_inside_penalty == store_inside_penalty
1687                   && load_outside_penalty > store_outside_penalty))
1688             dr0 = first_store;
1689         }
1690 
1691       /* In case there are only loads with different unknown misalignments, use
1692          peeling only if it may help to align other accesses in the loop or
1693	 if it may help improve load bandwidth when we'd end up using
1694 	 unaligned loads.  */
1695       tree dr0_vt = STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr0)));
1696       if (!first_store
1697 	  && !STMT_VINFO_SAME_ALIGN_REFS (
1698 		  vinfo_for_stmt (DR_STMT (dr0))).length ()
1699 	  && (vect_supportable_dr_alignment (dr0, false)
1700 	      != dr_unaligned_supported
1701 	      || (builtin_vectorization_cost (vector_load, dr0_vt, 0)
1702 		  == builtin_vectorization_cost (unaligned_load, dr0_vt, -1))))
1703         do_peeling = false;
1704     }
1705 
1706   if (do_peeling && !dr0)
1707     {
1708       /* Peeling is possible, but no data access requires alignment in order
1709          to be supported.  So we try to choose the best possible peeling.  */
1710 
1711       /* We should get here only if there are drs with known misalignment.  */
1712       gcc_assert (!all_misalignments_unknown);
1713 
1714       /* Choose the best peeling from the hash table.  */
1715       dr0 = vect_peeling_hash_choose_best_peeling (&peeling_htab,
1716 						   loop_vinfo, &npeel,
1717 						   &body_cost_vec);
1718       if (!dr0 || !npeel)
1719         do_peeling = false;
1720     }
1721 
1722   if (do_peeling)
1723     {
1724       stmt = DR_STMT (dr0);
1725       stmt_info = vinfo_for_stmt (stmt);
1726       vectype = STMT_VINFO_VECTYPE (stmt_info);
1727       nelements = TYPE_VECTOR_SUBPARTS (vectype);
1728 
1729       if (known_alignment_for_access_p (dr0))
1730         {
1731 	  bool negative = tree_int_cst_compare (DR_STEP (dr0),
1732 						size_zero_node) < 0;
1733           if (!npeel)
1734             {
1735               /* Since it's known at compile time, compute the number of
1736                  iterations in the peeled loop (the peeling factor) for use in
1737                  updating DR_MISALIGNMENT values.  The peeling factor is the
1738                  number of vector elements minus the misalignment as an element
1739                  count.  */
1740               mis = DR_MISALIGNMENT (dr0);
1741               mis /= GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr0))));
1742               npeel = ((negative ? mis - nelements : nelements - mis)
1743 		       & (nelements - 1));
1744             }
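          /* E.g., with a positive step, 4-byte elements and a 16-byte vector
             (nelements == 4), a known DR_MISALIGNMENT of 8 bytes gives
             mis == 2 and npeel == 2: peeling two scalar iterations advances
             DR0 by 8 bytes and makes it vector-aligned.  */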
1745 
1746 	  /* For interleaved data access every iteration accesses all the
1747 	     members of the group, therefore we divide the number of iterations
1748 	     by the group size.  */
1749 	  stmt_info = vinfo_for_stmt (DR_STMT (dr0));
1750 	  if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1751 	    npeel /= GROUP_SIZE (stmt_info);
1752 
1753           if (dump_enabled_p ())
1754             dump_printf_loc (MSG_NOTE, vect_location,
1755                              "Try peeling by %d\n", npeel);
1756         }
1757 
1758       /* Ensure that all data refs can be vectorized after the peel.  */
1759       FOR_EACH_VEC_ELT (datarefs, i, dr)
1760         {
1761           int save_misalignment;
1762 
1763 	  if (dr == dr0)
1764 	    continue;
1765 
1766 	  stmt = DR_STMT (dr);
1767 	  stmt_info = vinfo_for_stmt (stmt);
1768 	  /* For interleaving, only the alignment of the first access
1769             matters.  */
1770 	  if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1771 	      && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1772 	    continue;
1773 
1774 	  /* Strided accesses perform only component accesses, alignment is
1775 	     irrelevant for them.  */
1776 	  if (STMT_VINFO_STRIDED_P (stmt_info)
1777 	      && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1778 	    continue;
1779 
1780 	  save_misalignment = DR_MISALIGNMENT (dr);
1781 	  vect_update_misalignment_for_peel (dr, dr0, npeel);
1782 	  supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
1783 	  SET_DR_MISALIGNMENT (dr, save_misalignment);
1784 
1785 	  if (!supportable_dr_alignment)
1786 	    {
1787 	      do_peeling = false;
1788 	      break;
1789 	    }
1790 	}
1791 
1792       if (do_peeling && known_alignment_for_access_p (dr0) && npeel == 0)
1793         {
1794           stat = vect_verify_datarefs_alignment (loop_vinfo);
1795           if (!stat)
1796             do_peeling = false;
1797           else
1798 	    {
1799 	      body_cost_vec.release ();
1800 	      return stat;
1801 	    }
1802         }
1803 
1804       /* Cost model #1 - honor --param vect-max-peeling-for-alignment.  */
1805       if (do_peeling)
1806         {
1807           unsigned max_allowed_peel
1808             = PARAM_VALUE (PARAM_VECT_MAX_PEELING_FOR_ALIGNMENT);
1809           if (max_allowed_peel != (unsigned)-1)
1810             {
1811               unsigned max_peel = npeel;
1812               if (max_peel == 0)
1813                 {
1814 		  gimple *dr_stmt = DR_STMT (dr0);
1815                   stmt_vec_info vinfo = vinfo_for_stmt (dr_stmt);
1816                   tree vtype = STMT_VINFO_VECTYPE (vinfo);
1817                   max_peel = TYPE_VECTOR_SUBPARTS (vtype) - 1;
1818                 }
1819               if (max_peel > max_allowed_peel)
1820                 {
1821                   do_peeling = false;
1822                   if (dump_enabled_p ())
1823                     dump_printf_loc (MSG_NOTE, vect_location,
1824                         "Disable peeling, max peels reached: %d\n", max_peel);
1825                 }
1826             }
1827         }
1828 
1829       /* Cost model #2 - if peeling may result in a remaining loop not
1830 	 iterating enough to be vectorized then do not peel.  */
1831       if (do_peeling
1832 	  && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1833 	{
1834 	  unsigned max_peel
1835 	    = npeel == 0 ? LOOP_VINFO_VECT_FACTOR (loop_vinfo) - 1 : npeel;
1836 	  if (LOOP_VINFO_INT_NITERS (loop_vinfo)
1837 	      < LOOP_VINFO_VECT_FACTOR (loop_vinfo) + max_peel)
1838 	    do_peeling = false;
1839 	}
1840 
1841       if (do_peeling)
1842         {
1843           /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
1844              If the misalignment of DR_i is identical to that of dr0 then set
1845              DR_MISALIGNMENT (DR_i) to zero.  If the misalignment of DR_i and
1846              dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
1847              by the peeling factor times the element size of DR_i (MOD the
1848              vectorization factor times the size).  Otherwise, the
1849              misalignment of DR_i must be set to unknown.  */
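          /* For example, with npeel == 2, vf == 4 and 4-byte elements, a
             DR_i whose known misalignment is 4 becomes misaligned by
             (4 + 2 * 4) % (4 * 4) == 12, while a DR_i whose misalignment
             is unknown stays unknown.  */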
1850 	  FOR_EACH_VEC_ELT (datarefs, i, dr)
1851 	    if (dr != dr0)
1852 	      {
1853 		/* Strided accesses perform only component accesses, alignment
1854 		   is irrelevant for them.  */
1855 		stmt_info = vinfo_for_stmt (DR_STMT (dr));
1856 		if (STMT_VINFO_STRIDED_P (stmt_info)
1857 		    && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1858 		  continue;
1859 
1860 		vect_update_misalignment_for_peel (dr, dr0, npeel);
1861 	      }
1862 
1863           LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0;
1864           if (npeel)
1865             LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
1866           else
1867             LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1868 	      = DR_MISALIGNMENT (dr0);
1869 	  SET_DR_MISALIGNMENT (dr0, 0);
1870 	  if (dump_enabled_p ())
1871             {
1872               dump_printf_loc (MSG_NOTE, vect_location,
1873                                "Alignment of access forced using peeling.\n");
1874               dump_printf_loc (MSG_NOTE, vect_location,
1875                                "Peeling for alignment will be applied.\n");
1876             }
1877 	  /* The inside-loop cost will be accounted for in vectorizable_load
1878 	     and vectorizable_store correctly with adjusted alignments.
1879	     Drop the body_cost_vec on the floor here.  */
1880 	  body_cost_vec.release ();
1881 
1882 	  stat = vect_verify_datarefs_alignment (loop_vinfo);
1883 	  gcc_assert (stat);
1884           return stat;
1885         }
1886     }
1887 
1888   body_cost_vec.release ();
1889 
1890   /* (2) Versioning to force alignment.  */
1891 
1892   /* Try versioning if:
1893      1) the loop nest is optimized for speed, and
1894      2) there is at least one unsupported misaligned data ref with an unknown
1895         misalignment, and
1896      3) all misaligned data refs with a known misalignment are supported, and
1897      4) the number of runtime alignment checks is within reason.  */
1898 
1899   do_versioning =
1900 	optimize_loop_nest_for_speed_p (loop)
1901 	&& (!loop->inner); /* FORNOW */
1902 
1903   if (do_versioning)
1904     {
1905       FOR_EACH_VEC_ELT (datarefs, i, dr)
1906         {
1907 	  stmt = DR_STMT (dr);
1908 	  stmt_info = vinfo_for_stmt (stmt);
1909 
1910 	  /* For interleaving, only the alignment of the first access
1911 	     matters.  */
1912 	  if (aligned_access_p (dr)
1913 	      || (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1914 		  && GROUP_FIRST_ELEMENT (stmt_info) != stmt))
1915 	    continue;
1916 
1917 	  if (STMT_VINFO_STRIDED_P (stmt_info))
1918 	    {
1919 	      /* Strided loads perform only component accesses, alignment is
1920 		 irrelevant for them.  */
1921 	      if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
1922 		continue;
1923 	      do_versioning = false;
1924 	      break;
1925 	    }
1926 
1927 	  supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
1928 
1929           if (!supportable_dr_alignment)
1930             {
1931 	      gimple *stmt;
1932               int mask;
1933               tree vectype;
1934 
1935               if (known_alignment_for_access_p (dr)
1936                   || LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
1937                      >= (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS))
1938                 {
1939                   do_versioning = false;
1940                   break;
1941                 }
1942 
1943               stmt = DR_STMT (dr);
1944               vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
1945               gcc_assert (vectype);
1946 
1947               /* The rightmost bits of an aligned address must be zeros.
1948                  Construct the mask needed for this test.  For example,
1949                  GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
1950                  mask must be 15 = 0xf. */
1951               mask = GET_MODE_SIZE (TYPE_MODE (vectype)) - 1;
1952 
1953               /* FORNOW: use the same mask to test all potentially unaligned
1954                  references in the loop.  The vectorizer currently supports
1955                  a single vector size, see the reference to
1956                  GET_MODE_NUNITS (TYPE_MODE (vectype)) where the
1957                  vectorization factor is computed.  */
1958               gcc_assert (!LOOP_VINFO_PTR_MASK (loop_vinfo)
1959                           || LOOP_VINFO_PTR_MASK (loop_vinfo) == mask);
1960               LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
1961               LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (
1962 		      DR_STMT (dr));
1963             }
1964         }
1965 
1966       /* Versioning requires at least one misaligned data reference.  */
1967       if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
1968         do_versioning = false;
1969       else if (!do_versioning)
1970         LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
1971     }
1972 
1973   if (do_versioning)
1974     {
1975       vec<gimple *> may_misalign_stmts
1976         = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
1977       gimple *stmt;
1978 
1979       /* It can now be assumed that the data references in the statements
1980          in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
1981          of the loop being vectorized.  */
1982       FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt)
1983         {
1984           stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1985           dr = STMT_VINFO_DATA_REF (stmt_info);
1986 	  SET_DR_MISALIGNMENT (dr, 0);
1987 	  if (dump_enabled_p ())
1988             dump_printf_loc (MSG_NOTE, vect_location,
1989                              "Alignment of access forced using versioning.\n");
1990         }
1991 
1992       if (dump_enabled_p ())
1993         dump_printf_loc (MSG_NOTE, vect_location,
1994                          "Versioning for alignment will be applied.\n");
1995 
1996       /* Peeling and versioning can't be done together at this time.  */
1997       gcc_assert (! (do_peeling && do_versioning));
1998 
1999       stat = vect_verify_datarefs_alignment (loop_vinfo);
2000       gcc_assert (stat);
2001       return stat;
2002     }
2003 
2004   /* This point is reached if neither peeling nor versioning is being done.  */
2005   gcc_assert (! (do_peeling || do_versioning));
2006 
2007   stat = vect_verify_datarefs_alignment (loop_vinfo);
2008   return stat;
2009 }
2010 
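/* A minimal standalone sketch, in plain C with invented names, of the two
   alignment strategies used above: computing the peel count for an access
   whose misalignment is known at compile time, and the single pointer-mask
   test that versioning for alignment relies on.  */

/* Number of scalar iterations to peel so that an access with a known
   BYTE_MISALIGNMENT becomes vector-aligned, assuming a positive step of one
   element per iteration and a power-of-two NELEMENTS; this mirrors the
   "(nelements - mis) & (nelements - 1)" computation above.  */

static unsigned
example_peel_count (unsigned byte_misalignment, unsigned elem_size,
		    unsigned nelements)
{
  /* Misalignment expressed in elements rather than bytes.  */
  unsigned mis = byte_misalignment / elem_size;
  return (nelements - mis) & (nelements - 1);
}

/* Runtime guard in the spirit of versioning for alignment: OR the addresses
   of all potentially misaligned references together and test the result
   against the vector-size mask, so that a single comparison covers all of
   them.  ADDRS holds the N addresses already converted to integers.  */

static int
example_all_aligned (const unsigned long *addrs, unsigned n,
		     unsigned long mask)
{
  unsigned long ior = 0;
  for (unsigned i = 0; i < n; i++)
    ior |= addrs[i];
  return (ior & mask) == 0;
}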
2011 
2012 /* Function vect_find_same_alignment_drs.
2013 
2014    Update group and alignment relations according to the chosen
2015    vectorization factor.  */
2016 
2017 static void
2018 vect_find_same_alignment_drs (struct data_dependence_relation *ddr,
2019 			      loop_vec_info loop_vinfo)
2020 {
2021   unsigned int i;
2022   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2023   int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2024   struct data_reference *dra = DDR_A (ddr);
2025   struct data_reference *drb = DDR_B (ddr);
2026   stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
2027   stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
2028   int dra_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dra))));
2029   int drb_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (drb))));
2030   lambda_vector dist_v;
2031   unsigned int loop_depth;
2032 
2033   if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
2034     return;
2035 
2036   if (dra == drb)
2037     return;
2038 
2039   if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
2040     return;
2041 
2042   /* Loop-based vectorization and known data dependence.  */
2043   if (DDR_NUM_DIST_VECTS (ddr) == 0)
2044     return;
2045 
2046   /* Data-dependence analysis reports a distance vector of zero
2047      for data-references that overlap only in the first iteration
2048      but have steps of different sign (see PR45764).
2049      So as a sanity check require equal DR_STEP.  */
2050   if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2051     return;
2052 
2053   loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
2054   FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
2055     {
2056       int dist = dist_v[loop_depth];
2057 
2058       if (dump_enabled_p ())
2059 	dump_printf_loc (MSG_NOTE, vect_location,
2060 	                 "dependence distance  = %d.\n", dist);
2061 
2062       /* Same loop iteration.  */
2063       if (dist == 0
2064 	  || (dist % vectorization_factor == 0 && dra_size == drb_size))
2065 	{
2066 	  /* Two references at such a distance have the same alignment.  */
2067 	  STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_a).safe_push (drb);
2068 	  STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_b).safe_push (dra);
2069 	  if (dump_enabled_p ())
2070 	    {
2071 	      dump_printf_loc (MSG_NOTE, vect_location,
2072 	                       "accesses have the same alignment.\n");
2073 	      dump_printf (MSG_NOTE,
2074 	                   "dependence distance modulo vf == 0 between ");
2075 	      dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
2076 	      dump_printf (MSG_NOTE,  " and ");
2077 	      dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
2078 	      dump_printf (MSG_NOTE, "\n");
2079 	    }
2080 	}
2081     }
2082 }
2083 
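/* For example, with a vectorization factor of 4 and 4-byte elements,
   accesses a[i] and a[i + 4] in the same loop have dependence distance 4;
   4 % 4 == 0 and the access sizes match, so the two references would be
   recorded above as having the same alignment.  */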
2084 
2085 /* Function vect_analyze_data_refs_alignment
2086 
2087    Analyze the alignment of the data-references in the loop.
2088    Return FALSE if a data reference is found that cannot be vectorized.  */
2089 
2090 bool
2091 vect_analyze_data_refs_alignment (loop_vec_info vinfo)
2092 {
2093   if (dump_enabled_p ())
2094     dump_printf_loc (MSG_NOTE, vect_location,
2095                      "=== vect_analyze_data_refs_alignment ===\n");
2096 
2097   /* Mark groups of data references with same alignment using
2098      data dependence information.  */
2099   vec<ddr_p> ddrs = vinfo->ddrs;
2100   struct data_dependence_relation *ddr;
2101   unsigned int i;
2102 
2103   FOR_EACH_VEC_ELT (ddrs, i, ddr)
2104     vect_find_same_alignment_drs (ddr, vinfo);
2105 
2106   vec<data_reference_p> datarefs = vinfo->datarefs;
2107   struct data_reference *dr;
2108 
2109   FOR_EACH_VEC_ELT (datarefs, i, dr)
2110     {
2111       stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
2112       if (STMT_VINFO_VECTORIZABLE (stmt_info)
2113 	  && !vect_compute_data_ref_alignment (dr))
2114 	{
2115 	  /* Strided accesses perform only component accesses, misalignment
2116 	     information is irrelevant for them.  */
2117 	  if (STMT_VINFO_STRIDED_P (stmt_info)
2118 	      && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
2119 	    continue;
2120 
2121 	  if (dump_enabled_p ())
2122 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2123 			     "not vectorized: can't calculate alignment "
2124 			     "for data ref.\n");
2125 
2126 	  return false;
2127 	}
2128     }
2129 
2130   return true;
2131 }
2132 
2133 
2134 /* Analyze alignment of DRs of stmts in NODE.  */
2135 
2136 static bool
2137 vect_slp_analyze_and_verify_node_alignment (slp_tree node)
2138 {
2139   /* We vectorize from the first scalar stmt in the node unless
2140      the node is permuted, in which case we start from the first
2141      element in the group.  */
2142   gimple *first_stmt = SLP_TREE_SCALAR_STMTS (node)[0];
2143   data_reference_p first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
2144   if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
2145     first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (first_stmt));
2146 
2147   data_reference_p dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
2148   if (! vect_compute_data_ref_alignment (dr)
2149       /* For creating the data-ref pointer we need alignment of the
2150 	 first element anyway.  */
2151       || (dr != first_dr
2152 	  && ! vect_compute_data_ref_alignment (first_dr))
2153       || ! verify_data_ref_alignment (dr))
2154     {
2155       if (dump_enabled_p ())
2156 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2157 			 "not vectorized: bad data alignment in basic "
2158 			 "block.\n");
2159       return false;
2160     }
2161 
2162   return true;
2163 }
2164 
2165 /* Function vect_slp_analyze_and_verify_instance_alignment
2166 
2167    Analyze the alignment of the data-references in the SLP instance.
2168    Return FALSE if a data reference is found that cannot be vectorized.  */
2169 
2170 bool
2171 vect_slp_analyze_and_verify_instance_alignment (slp_instance instance)
2172 {
2173   if (dump_enabled_p ())
2174     dump_printf_loc (MSG_NOTE, vect_location,
2175                      "=== vect_slp_analyze_and_verify_instance_alignment ===\n");
2176 
2177   slp_tree node;
2178   unsigned i;
2179   FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, node)
2180     if (! vect_slp_analyze_and_verify_node_alignment (node))
2181       return false;
2182 
2183   node = SLP_INSTANCE_TREE (instance);
2184   if (STMT_VINFO_DATA_REF (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (node)[0]))
2185       && ! vect_slp_analyze_and_verify_node_alignment
2186 	     (SLP_INSTANCE_TREE (instance)))
2187     return false;
2188 
2189   return true;
2190 }
2191 
2192 
2193 /* Analyze groups of accesses: check that DR belongs to a group of
2194    accesses of legal size, step, etc.  Detect gaps, single element
2195    interleaving, and other special cases. Set grouped access info.
2196    Collect groups of strided stores for further use in SLP analysis.
2197    Worker for vect_analyze_group_access.  */
2198 
2199 static bool
2200 vect_analyze_group_access_1 (struct data_reference *dr)
2201 {
2202   tree step = DR_STEP (dr);
2203   tree scalar_type = TREE_TYPE (DR_REF (dr));
2204   HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
2205   gimple *stmt = DR_STMT (dr);
2206   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2207   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2208   bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
2209   HOST_WIDE_INT dr_step = -1;
2210   HOST_WIDE_INT groupsize, last_accessed_element = 1;
2211   bool slp_impossible = false;
2212 
2213   /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2214      size of the interleaving group (including gaps).  */
2215   if (tree_fits_shwi_p (step))
2216     {
2217       dr_step = tree_to_shwi (step);
2218       /* Check that STEP is a multiple of type size.  Otherwise there is
2219          a non-element-sized gap at the end of the group which we
2220 	 cannot represent in GROUP_GAP or GROUP_SIZE.
2221 	 ???  As we can handle non-constant step fine here we should
2222 	 simply remove uses of GROUP_GAP between the last and first
2223 	 element and instead rely on DR_STEP.  GROUP_SIZE then would
2224 	 simply not include that gap.  */
2225       if ((dr_step % type_size) != 0)
2226 	{
2227 	  if (dump_enabled_p ())
2228 	    {
2229 	      dump_printf_loc (MSG_NOTE, vect_location,
2230 	                       "Step ");
2231 	      dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2232 	      dump_printf (MSG_NOTE,
2233 			   " is not a multiple of the element size for ");
2234 	      dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2235 	      dump_printf (MSG_NOTE, "\n");
2236 	    }
2237 	  return false;
2238 	}
2239       groupsize = absu_hwi (dr_step) / type_size;
2240     }
2241   else
2242     groupsize = 0;
2243 
2244   /* A non-consecutive access is possible only if it is part of interleaving.  */
2245   if (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
2246     {
2247       /* Check if this DR is a part of interleaving, and is a single
2248 	 element of the group that is accessed in the loop.  */
2249 
2250       /* Gaps are supported only for loads. STEP must be a multiple of the type
2251 	 size.  The size of the group must be a power of 2.  */
2252       if (DR_IS_READ (dr)
2253 	  && (dr_step % type_size) == 0
2254 	  && groupsize > 0
2255 	  && exact_log2 (groupsize) != -1)
2256 	{
2257 	  GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = stmt;
2258 	  GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2259 	  if (dump_enabled_p ())
2260 	    {
2261 	      dump_printf_loc (MSG_NOTE, vect_location,
2262 	                       "Detected single element interleaving ");
2263 	      dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2264 	      dump_printf (MSG_NOTE, " step ");
2265 	      dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2266 	      dump_printf (MSG_NOTE, "\n");
2267 	    }
2268 
2269 	  return true;
2270 	}
2271 
2272       if (dump_enabled_p ())
2273         {
2274  	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2275 	                   "not consecutive access ");
2276 	  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
2277         }
2278 
2279       if (bb_vinfo)
2280         {
2281           /* Mark the statement as unvectorizable.  */
2282           STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2283           return true;
2284         }
2285 
2286       dump_printf_loc (MSG_NOTE, vect_location, "using strided accesses\n");
2287       STMT_VINFO_STRIDED_P (stmt_info) = true;
2288       return true;
2289     }
2290 
2291   if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) == stmt)
2292     {
2293       /* First stmt in the interleaving chain. Check the chain.  */
2294       gimple *next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
2295       struct data_reference *data_ref = dr;
2296       unsigned int count = 1;
2297       tree prev_init = DR_INIT (data_ref);
2298       gimple *prev = stmt;
2299       HOST_WIDE_INT diff, gaps = 0;
2300 
2301       while (next)
2302         {
2303           /* Skip same data-refs.  In case two or more stmts share a
2304              data-ref (supported only for loads), we vectorize only the first
2305              stmt, and the rest get their vectorized loads from the first
2306              one.  */
2307           if (!tree_int_cst_compare (DR_INIT (data_ref),
2308                                      DR_INIT (STMT_VINFO_DATA_REF (
2309 						   vinfo_for_stmt (next)))))
2310             {
2311               if (DR_IS_WRITE (data_ref))
2312                 {
2313                   if (dump_enabled_p ())
2314                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2315                                      "Two store stmts share the same dr.\n");
2316                   return false;
2317                 }
2318 
2319 	      if (dump_enabled_p ())
2320 		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2321 				 "Two or more load stmts share the same dr.\n");
2322 
2323               /* For loads, use the same data-ref load.  */
2324               GROUP_SAME_DR_STMT (vinfo_for_stmt (next)) = prev;
2325 
2326               prev = next;
2327               next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2328               continue;
2329             }
2330 
2331           prev = next;
2332           data_ref = STMT_VINFO_DATA_REF (vinfo_for_stmt (next));
2333 
2334 	  /* All group members have the same STEP by construction.  */
2335 	  gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0));
2336 
2337           /* Check that the distance between two accesses is equal to the type
2338              size. Otherwise, we have gaps.  */
2339           diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
2340                   - TREE_INT_CST_LOW (prev_init)) / type_size;
2341 	  if (diff != 1)
2342 	    {
2343 	      /* FORNOW: SLP of accesses with gaps is not supported.  */
2344 	      slp_impossible = true;
2345 	      if (DR_IS_WRITE (data_ref))
2346 		{
2347                   if (dump_enabled_p ())
2348                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2349                                      "interleaved store with gaps\n");
2350 		  return false;
2351 		}
2352 
2353               gaps += diff - 1;
2354 	    }
2355 
2356 	  last_accessed_element += diff;
2357 
2358           /* Store the gap from the previous member of the group. If there is no
2359              gap in the access, GROUP_GAP is always 1.  */
2360           GROUP_GAP (vinfo_for_stmt (next)) = diff;
2361 
2362           prev_init = DR_INIT (data_ref);
2363           next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2364           /* Count the number of data-refs in the chain.  */
2365           count++;
2366         }
2367 
2368       if (groupsize == 0)
2369         groupsize = count + gaps;
2370 
2371       if (groupsize > UINT_MAX)
2372 	{
2373 	  if (dump_enabled_p ())
2374 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2375 			     "group is too large\n");
2376 	  return false;
2377 	}
2378 
2379       /* Check that the size of the interleaving is equal to count for stores,
2380          i.e., that there are no gaps.  */
2381       if (groupsize != count
2382 	  && !DR_IS_READ (dr))
2383         {
2384 	  if (dump_enabled_p ())
2385 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2386 			     "interleaved store with gaps\n");
2387 	  return false;
2388 	}
2389 
2390       /* If there is a gap after the last load in the group it is the
2391 	 difference between the groupsize and the last accessed
2392 	 element.
2393 	 When there is no gap, this difference should be 0.  */
2394       GROUP_GAP (vinfo_for_stmt (stmt)) = groupsize - last_accessed_element;
2395 
2396       GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2397       if (dump_enabled_p ())
2398 	{
2399 	  dump_printf_loc (MSG_NOTE, vect_location,
2400 			   "Detected interleaving ");
2401 	  if (DR_IS_READ (dr))
2402 	    dump_printf (MSG_NOTE, "load ");
2403 	  else
2404 	    dump_printf (MSG_NOTE, "store ");
2405 	  dump_printf (MSG_NOTE, "of size %u starting with ",
2406 		       (unsigned)groupsize);
2407 	  dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
2408 	  if (GROUP_GAP (vinfo_for_stmt (stmt)) != 0)
2409 	    dump_printf_loc (MSG_NOTE, vect_location,
2410 			     "There is a gap of %u elements after the group\n",
2411 			     GROUP_GAP (vinfo_for_stmt (stmt)));
2412 	}
2413 
2414       /* SLP: create an SLP data structure for every interleaving group of
2415	 stores for further analysis in vect_analyze_slp.  */
2416       if (DR_IS_WRITE (dr) && !slp_impossible)
2417         {
2418           if (loop_vinfo)
2419             LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt);
2420           if (bb_vinfo)
2421             BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt);
2422         }
2423     }
2424 
2425   return true;
2426 }
2427 
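/* A minimal sketch, in plain C with invented names, of the gap bookkeeping
   performed by vect_analyze_group_access_1 above: ELEM_OFFSETS holds the
   member offsets of one interleaving chain expressed in elements (DR_INIT
   divided by the type size), sorted increasingly, and GROUPSIZE is the step
   in elements.  GAPS[i] receives the distance from member i-1 for i > 0
   (GROUP_GAP of that member), and GAPS[0] receives the gap left after the
   last member, matching "groupsize - last_accessed_element" above.  */

static void
example_group_gaps (const long *elem_offsets, unsigned count,
		    long groupsize, long *gaps)
{
  long last_accessed_element = 1;
  for (unsigned i = 1; i < count; i++)
    {
      long diff = elem_offsets[i] - elem_offsets[i - 1];
      gaps[i] = diff;
      last_accessed_element += diff;
    }
  gaps[0] = groupsize - last_accessed_element;
}
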
2428 /* Analyze groups of accesses: check that DR belongs to a group of
2429    accesses of legal size, step, etc.  Detect gaps, single element
2430    interleaving, and other special cases. Set grouped access info.
2431    Collect groups of strided stores for further use in SLP analysis.  */
2432 
2433 static bool
2434 vect_analyze_group_access (struct data_reference *dr)
2435 {
2436   if (!vect_analyze_group_access_1 (dr))
2437     {
2438       /* Dissolve the group if present.  */
2439       gimple *next;
2440       gimple *stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dr)));
2441       while (stmt)
2442 	{
2443 	  stmt_vec_info vinfo = vinfo_for_stmt (stmt);
2444 	  next = GROUP_NEXT_ELEMENT (vinfo);
2445 	  GROUP_FIRST_ELEMENT (vinfo) = NULL;
2446 	  GROUP_NEXT_ELEMENT (vinfo) = NULL;
2447 	  stmt = next;
2448 	}
2449       return false;
2450     }
2451   return true;
2452 }
2453 
2454 /* Analyze the access pattern of the data-reference DR.
2455    In case of non-consecutive accesses call vect_analyze_group_access() to
2456    analyze groups of accesses.  */
2457 
2458 static bool
2459 vect_analyze_data_ref_access (struct data_reference *dr)
2460 {
2461   tree step = DR_STEP (dr);
2462   tree scalar_type = TREE_TYPE (DR_REF (dr));
2463   gimple *stmt = DR_STMT (dr);
2464   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2465   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2466   struct loop *loop = NULL;
2467 
2468   if (loop_vinfo)
2469     loop = LOOP_VINFO_LOOP (loop_vinfo);
2470 
2471   if (loop_vinfo && !step)
2472     {
2473       if (dump_enabled_p ())
2474 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2475 	                 "bad data-ref access in loop\n");
2476       return false;
2477     }
2478 
2479   /* Allow loads with zero step in inner-loop vectorization.  */
2480   if (loop_vinfo && integer_zerop (step))
2481     {
2482       GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2483       if (!nested_in_vect_loop_p (loop, stmt))
2484 	return DR_IS_READ (dr);
2485       /* Allow references with zero step for outer loops marked
2486 	 with pragma omp simd only - it guarantees absence of
2487 	 loop-carried dependencies between inner loop iterations.  */
2488       if (!loop->force_vectorize)
2489 	{
2490 	  if (dump_enabled_p ())
2491 	    dump_printf_loc (MSG_NOTE, vect_location,
2492 			     "zero step in inner loop of nest\n");
2493 	  return false;
2494 	}
2495     }
2496 
2497   if (loop && nested_in_vect_loop_p (loop, stmt))
2498     {
2499       /* Interleaved accesses are not yet supported within outer-loop
2500         vectorization for references in the inner-loop.  */
2501       GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2502 
2503       /* For the rest of the analysis we use the outer-loop step.  */
2504       step = STMT_VINFO_DR_STEP (stmt_info);
2505       if (integer_zerop (step))
2506 	{
2507 	  if (dump_enabled_p ())
2508 	    dump_printf_loc (MSG_NOTE, vect_location,
2509 	                     "zero step in outer loop.\n");
2510 	  return DR_IS_READ (dr);
2511 	}
2512     }
2513 
2514   /* Consecutive?  */
2515   if (TREE_CODE (step) == INTEGER_CST)
2516     {
2517       HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
2518       if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))
2519 	  || (dr_step < 0
2520 	      && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
2521 	{
2522 	  /* Mark that it is not interleaving.  */
2523 	  GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2524 	  return true;
2525 	}
2526     }
2527 
2528   if (loop && nested_in_vect_loop_p (loop, stmt))
2529     {
2530       if (dump_enabled_p ())
2531 	dump_printf_loc (MSG_NOTE, vect_location,
2532 	                 "grouped access in outer loop.\n");
2533       return false;
2534     }
2535 
2536 
2537   /* Assume this is a DR handled by the non-constant strided load case.  */
2538   if (TREE_CODE (step) != INTEGER_CST)
2539     return (STMT_VINFO_STRIDED_P (stmt_info)
2540 	    && (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
2541 		|| vect_analyze_group_access (dr)));
2542 
2543   /* Not a consecutive access - check if it's part of an interleaving group.  */
2544   return vect_analyze_group_access (dr);
2545 }
2546 
2547 
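/* A minimal sketch, in plain C with invented names, of the "Consecutive?"
   test in vect_analyze_data_ref_access above, for a constant nonzero STEP
   given in bytes: a step of plus or minus one element is a simple
   consecutive access; anything else is analyzed as a (possibly strided)
   interleaving group.  The zero-step and nested-loop cases are ignored.  */

static int
example_consecutive_access_p (long step, long type_size)
{
  return step == type_size || step == -type_size;
}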
2548 
2549 /*  A helper function used in the comparator function to sort data
2550     references.  T1 and T2 are tree expressions taken from the data
2551     references to be compared.  The function returns -1, 0, or 1.  */
2552 
2553 static int
2554 compare_tree (tree t1, tree t2)
2555 {
2556   int i, cmp;
2557   enum tree_code code;
2558   char tclass;
2559 
2560   if (t1 == t2)
2561     return 0;
2562   if (t1 == NULL)
2563     return -1;
2564   if (t2 == NULL)
2565     return 1;
2566 
2567   STRIP_NOPS (t1);
2568   STRIP_NOPS (t2);
2569 
2570   if (TREE_CODE (t1) != TREE_CODE (t2))
2571     return TREE_CODE (t1) < TREE_CODE (t2) ? -1 : 1;
2572 
2573   code = TREE_CODE (t1);
2574   switch (code)
2575     {
2576     /* For const values, we can just use hash values for comparisons.  */
2577     case INTEGER_CST:
2578     case REAL_CST:
2579     case FIXED_CST:
2580     case STRING_CST:
2581     case COMPLEX_CST:
2582     case VECTOR_CST:
2583       {
2584 	hashval_t h1 = iterative_hash_expr (t1, 0);
2585 	hashval_t h2 = iterative_hash_expr (t2, 0);
2586 	if (h1 != h2)
2587 	  return h1 < h2 ? -1 : 1;
2588 	break;
2589       }
2590 
2591     case SSA_NAME:
2592       cmp = compare_tree (SSA_NAME_VAR (t1), SSA_NAME_VAR (t2));
2593       if (cmp != 0)
2594 	return cmp;
2595 
2596       if (SSA_NAME_VERSION (t1) != SSA_NAME_VERSION (t2))
2597 	return SSA_NAME_VERSION (t1) < SSA_NAME_VERSION (t2) ? -1 : 1;
2598       break;
2599 
2600     default:
2601       tclass = TREE_CODE_CLASS (code);
2602 
2603       /* For var-decl, we could compare their UIDs.  */
2604       if (tclass == tcc_declaration)
2605 	{
2606 	  if (DECL_UID (t1) != DECL_UID (t2))
2607 	    return DECL_UID (t1) < DECL_UID (t2) ? -1 : 1;
2608 	  break;
2609 	}
2610 
2611       /* For expressions with operands, compare their operands recursively.  */
2612       for (i = TREE_OPERAND_LENGTH (t1) - 1; i >= 0; --i)
2613 	{
2614 	  cmp = compare_tree (TREE_OPERAND (t1, i), TREE_OPERAND (t2, i));
2615 	  if (cmp != 0)
2616 	    return cmp;
2617 	}
2618     }
2619 
2620   return 0;
2621 }
2622 
2623 
2624 /* Compare two data-references DRA and DRB to sort them into chunks
2625    suitable for building interleaving groups.  */
2626 
2627 static int
2628 dr_group_sort_cmp (const void *dra_, const void *drb_)
2629 {
2630   data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_);
2631   data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_);
2632   int cmp;
2633 
2634   /* Stabilize sort.  */
2635   if (dra == drb)
2636     return 0;
2637 
2638   /* DRs in different loops never belong to the same group.  */
2639   loop_p loopa = gimple_bb (DR_STMT (dra))->loop_father;
2640   loop_p loopb = gimple_bb (DR_STMT (drb))->loop_father;
2641   if (loopa != loopb)
2642     return loopa->num < loopb->num ? -1 : 1;
2643 
2644   /* Ordering of DRs according to base.  */
2645   if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0))
2646     {
2647       cmp = compare_tree (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb));
2648       if (cmp != 0)
2649         return cmp;
2650     }
2651 
2652   /* And according to DR_OFFSET.  */
2653   if (!dr_equal_offsets_p (dra, drb))
2654     {
2655       cmp = compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
2656       if (cmp != 0)
2657         return cmp;
2658     }
2659 
2660   /* Put reads before writes.  */
2661   if (DR_IS_READ (dra) != DR_IS_READ (drb))
2662     return DR_IS_READ (dra) ? -1 : 1;
2663 
2664   /* Then sort by access size.  */
2665   if (!operand_equal_p (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2666 			TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))), 0))
2667     {
2668       cmp = compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2669                           TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
2670       if (cmp != 0)
2671         return cmp;
2672     }
2673 
2674   /* And by step.  */
2675   if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2676     {
2677       cmp = compare_tree (DR_STEP (dra), DR_STEP (drb));
2678       if (cmp != 0)
2679         return cmp;
2680     }
2681 
2682   /* Then sort by DR_INIT.  In case of identical DRs sort by stmt UID.  */
2683   cmp = tree_int_cst_compare (DR_INIT (dra), DR_INIT (drb));
2684   if (cmp == 0)
2685     return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
2686   return cmp;
2687 }
2688 
2689 /* Function vect_analyze_data_ref_accesses.
2690 
2691    Analyze the access pattern of all the data references in the loop.
2692 
2693    FORNOW: the only access pattern that is considered vectorizable is a
2694 	   simple step 1 (consecutive) access.
2695 
2696    FORNOW: handle only arrays and pointer accesses.  */
2697 
2698 bool
2699 vect_analyze_data_ref_accesses (vec_info *vinfo)
2700 {
2701   unsigned int i;
2702   vec<data_reference_p> datarefs = vinfo->datarefs;
2703   struct data_reference *dr;
2704 
2705   if (dump_enabled_p ())
2706     dump_printf_loc (MSG_NOTE, vect_location,
2707                      "=== vect_analyze_data_ref_accesses ===\n");
2708 
2709   if (datarefs.is_empty ())
2710     return true;
2711 
2712   /* Sort the array of datarefs to make building the interleaving chains
2713      linear.  Don't modify the original vector's order; it is needed for
2714      determining what dependencies are reversed.  */
2715   vec<data_reference_p> datarefs_copy = datarefs.copy ();
2716   datarefs_copy.qsort (dr_group_sort_cmp);
2717 
2718   /* Build the interleaving chains.  */
2719   for (i = 0; i < datarefs_copy.length () - 1;)
2720     {
2721       data_reference_p dra = datarefs_copy[i];
2722       stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
2723       stmt_vec_info lastinfo = NULL;
2724       if (! STMT_VINFO_VECTORIZABLE (stmtinfo_a))
2725 	{
2726 	  ++i;
2727 	  continue;
2728 	}
2729       for (i = i + 1; i < datarefs_copy.length (); ++i)
2730 	{
2731 	  data_reference_p drb = datarefs_copy[i];
2732 	  stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
2733 	  if (! STMT_VINFO_VECTORIZABLE (stmtinfo_b))
2734 	    break;
2735 
2736 	  /* ???  Imperfect sorting (non-compatible types, non-modulo
2737 	     accesses, same accesses) can lead to a group to be artificially
2738 	     accesses, same accesses) can lead to a group being artificially
2739 	     split here as we don't just skip over those.  If it really
2740 	     matters we can push those to a worklist and re-iterate
2741 	     over them.  Then we can just skip ahead to the next DR here.  */
2742 	  /* DRs in a different loop should not be put into the same
2743 	  /* DRs in different loops should not be put into the same
2744 	  if (gimple_bb (DR_STMT (dra))->loop_father
2745 	      != gimple_bb (DR_STMT (drb))->loop_father)
2746 	    break;
2747 
2748 	  /* Check that the data-refs have the same first location (except init)
2749 	     and they are both either store or load (not load and store,
2750 	     not masked loads or stores).  */
2751 	  if (DR_IS_READ (dra) != DR_IS_READ (drb)
2752 	      || !operand_equal_p (DR_BASE_ADDRESS (dra),
2753 				   DR_BASE_ADDRESS (drb), 0)
2754 	      || !dr_equal_offsets_p (dra, drb)
2755 	      || !gimple_assign_single_p (DR_STMT (dra))
2756 	      || !gimple_assign_single_p (DR_STMT (drb)))
2757 	    break;
2758 
2759 	  /* Check that the data-refs have the same constant size.  */
2760 	  tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
2761 	  tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
2762 	  if (!tree_fits_uhwi_p (sza)
2763 	      || !tree_fits_uhwi_p (szb)
2764 	      || !tree_int_cst_equal (sza, szb))
2765 	    break;
2766 
2767 	  /* Check that the data-refs have the same step.  */
2768 	  if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2769 	    break;
2770 
2771 	  /* Do not place the same access in the interleaving chain twice.  */
2772 	  if (tree_int_cst_compare (DR_INIT (dra), DR_INIT (drb)) == 0)
2773 	    break;
2774 
2775 	  /* Check the types are compatible.
2776 	     ???  We don't distinguish this during sorting.  */
2777 	  if (!types_compatible_p (TREE_TYPE (DR_REF (dra)),
2778 				   TREE_TYPE (DR_REF (drb))))
2779 	    break;
2780 
2781 	  /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb).  */
2782 	  HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
2783 	  HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
2784 	  gcc_assert (init_a <= init_b);
2785 
2786 	  /* If init_b == init_a + the size of the type * k, we have an
2787 	     interleaving, and DRA is accessed before DRB.  */
2788 	  HOST_WIDE_INT type_size_a = tree_to_uhwi (sza);
2789 	  if (type_size_a == 0
2790 	      || (init_b - init_a) % type_size_a != 0)
2791 	    break;
2792 
2793 	  /* If we have a store, the accesses are adjacent.  This splits
2794 	     groups into chunks we support (we don't support vectorization
2795 	     of stores with gaps).  */
2796 	  if (!DR_IS_READ (dra)
2797 	      && (init_b - (HOST_WIDE_INT) TREE_INT_CST_LOW
2798 					     (DR_INIT (datarefs_copy[i-1]))
2799 		  != type_size_a))
2800 	    break;
2801 
2802 	  /* If the step (when constant and nonzero) is greater than the
2803 	     difference between data-refs' inits this splits groups into
2804 	     suitable sizes.  */
2805 	  if (tree_fits_shwi_p (DR_STEP (dra)))
2806 	    {
2807 	      HOST_WIDE_INT step = tree_to_shwi (DR_STEP (dra));
2808 	      if (step != 0 && step <= (init_b - init_a))
2809 		break;
2810 	    }
2811 
2812 	  if (dump_enabled_p ())
2813 	    {
2814 	      dump_printf_loc (MSG_NOTE, vect_location,
2815 			       "Detected interleaving ");
2816 	      if (DR_IS_READ (dra))
2817 		dump_printf (MSG_NOTE, "load ");
2818 	      else
2819 		dump_printf (MSG_NOTE, "store ");
2820 	      dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
2821 	      dump_printf (MSG_NOTE,  " and ");
2822 	      dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
2823 	      dump_printf (MSG_NOTE, "\n");
2824 	    }
2825 
2826 	  /* Link the found element into the group list.  */
2827 	  if (!GROUP_FIRST_ELEMENT (stmtinfo_a))
2828 	    {
2829 	      GROUP_FIRST_ELEMENT (stmtinfo_a) = DR_STMT (dra);
2830 	      lastinfo = stmtinfo_a;
2831 	    }
2832 	  GROUP_FIRST_ELEMENT (stmtinfo_b) = DR_STMT (dra);
2833 	  GROUP_NEXT_ELEMENT (lastinfo) = DR_STMT (drb);
2834 	  lastinfo = stmtinfo_b;
2835 	}
2836     }
2837 
2838   FOR_EACH_VEC_ELT (datarefs_copy, i, dr)
2839     if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr)))
2840         && !vect_analyze_data_ref_access (dr))
2841       {
2842 	if (dump_enabled_p ())
2843 	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2844 	                   "not vectorized: complicated access pattern.\n");
2845 
2846         if (is_a <bb_vec_info> (vinfo))
2847           {
2848             /* Mark the statement as not vectorizable.  */
2849             STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2850             continue;
2851           }
2852         else
2853 	  {
2854 	    datarefs_copy.release ();
2855 	    return false;
2856 	  }
2857       }
2858 
2859   datarefs_copy.release ();
2860   return true;
2861 }
2862 
2863 
2864 /* Operator == between two dr_with_seg_len objects.
2865 
2866    This equality operator is used to make sure two data refs
2867    are the same, so that we consider combining the aliasing
2868    checks of the two pairs of data-dependent data refs that
2869    contain them.  */
2870 
2871 static bool
2872 operator == (const dr_with_seg_len& d1,
2873 	     const dr_with_seg_len& d2)
2874 {
2875   return operand_equal_p (DR_BASE_ADDRESS (d1.dr),
2876 			  DR_BASE_ADDRESS (d2.dr), 0)
2877 	   && compare_tree (d1.offset, d2.offset) == 0
2878 	   && compare_tree (d1.seg_len, d2.seg_len) == 0;
2879 }
2880 
2881 /* Function comp_dr_with_seg_len_pair.
2882 
2883    Comparison function for sorting objects of dr_with_seg_len_pair_t
2884    so that we can combine aliasing checks in one scan.  */
2885 
2886 static int
2887 comp_dr_with_seg_len_pair (const void *p1_, const void *p2_)
2888 {
2889   const dr_with_seg_len_pair_t* p1 = (const dr_with_seg_len_pair_t *) p1_;
2890   const dr_with_seg_len_pair_t* p2 = (const dr_with_seg_len_pair_t *) p2_;
2891 
2892   const dr_with_seg_len &p11 = p1->first,
2893 			&p12 = p1->second,
2894 			&p21 = p2->first,
2895 			&p22 = p2->second;
2896 
2897   /* For DR pairs (a, b) and (c, d), we only consider merging the alias
2898      checks if a and c have the same base address and step, and b and d have
2899      the same address and step.  Therefore, if either a&c or b&d don't have the
2900      same address and step, we don't care about their order after sorting.  */
2901   int comp_res;
2902 
2903   if ((comp_res = compare_tree (DR_BASE_ADDRESS (p11.dr),
2904 				DR_BASE_ADDRESS (p21.dr))) != 0)
2905     return comp_res;
2906   if ((comp_res = compare_tree (DR_BASE_ADDRESS (p12.dr),
2907 				DR_BASE_ADDRESS (p22.dr))) != 0)
2908     return comp_res;
2909   if ((comp_res = compare_tree (DR_STEP (p11.dr), DR_STEP (p21.dr))) != 0)
2910     return comp_res;
2911   if ((comp_res = compare_tree (DR_STEP (p12.dr), DR_STEP (p22.dr))) != 0)
2912     return comp_res;
2913   if ((comp_res = compare_tree (p11.offset, p21.offset)) != 0)
2914     return comp_res;
2915   if ((comp_res = compare_tree (p12.offset, p22.offset)) != 0)
2916     return comp_res;
2917 
2918   return 0;
2919 }
2920 
2921 /* Function vect_vfa_segment_size.
2922 
2923    Create an expression that computes the size of segment
2924    that will be accessed for a data reference.  The function takes into
2925    account that realignment loads may access one more vector.
2926 
2927    Input:
2928      DR: The data reference.
2929      LENGTH_FACTOR: segment length to consider.
2930 
2931    Return an expression whose value is the size of segment which will be
2932    accessed by DR.  */
2933 
2934 static tree
2935 vect_vfa_segment_size (struct data_reference *dr, tree length_factor)
2936 {
2937   tree segment_length;
2938 
2939   if (integer_zerop (DR_STEP (dr)))
2940     segment_length = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
2941   else
2942     segment_length = size_binop (MULT_EXPR,
2943 				 fold_convert (sizetype, DR_STEP (dr)),
2944 				 fold_convert (sizetype, length_factor));
2945 
2946   if (vect_supportable_dr_alignment (dr, false)
2947 	== dr_explicit_realign_optimized)
2948     {
2949       tree vector_size = TYPE_SIZE_UNIT
2950 			  (STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr))));
2951 
2952       segment_length = size_binop (PLUS_EXPR, segment_length, vector_size);
2953     }
2954   return segment_length;
2955 }
2956 
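/* A minimal sketch, in plain C with invented names, of the basic runtime
   alias test described in vect_prune_runtime_alias_test_list below: the two
   accessed segments [A, A + SEG_A) and [B, B + SEG_B) are independent iff
   one of them ends before the other begins.  Addresses are passed as
   integers.  */

static int
example_segments_do_not_alias (unsigned long a, unsigned long seg_a,
			       unsigned long b, unsigned long seg_b)
{
  return a + seg_a <= b || b + seg_b <= a;
}
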
2957 /* Function vect_prune_runtime_alias_test_list.
2958 
2959    Prune a list of ddrs to be tested at run-time by versioning for alias.
2960    Merge several alias checks into one if possible.
2961    Return FALSE if the resulting list of ddrs is longer than allowed by
2962    PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE.  */
2963 
2964 bool
2965 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
2966 {
2967   vec<ddr_p> may_alias_ddrs =
2968     LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
2969   vec<dr_with_seg_len_pair_t>& comp_alias_ddrs =
2970     LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
2971   int vect_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2972   tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
2973 
2974   ddr_p ddr;
2975   unsigned int i;
2976   tree length_factor;
2977 
2978   if (dump_enabled_p ())
2979     dump_printf_loc (MSG_NOTE, vect_location,
2980                      "=== vect_prune_runtime_alias_test_list ===\n");
2981 
2982   if (may_alias_ddrs.is_empty ())
2983     return true;
2984 
2985   /* Basically, for each pair of dependent data refs store_ptr_0
2986      and load_ptr_0, we create an expression:
2987 
2988      ((store_ptr_0 + store_segment_length_0) <= load_ptr_0)
2989      || ((load_ptr_0 + load_segment_length_0) <= store_ptr_0)
2990 
2991      for aliasing checks.  However, in some cases we can decrease
2992      the number of checks by combining two checks into one.  For
2993      example, suppose we have another pair of data refs store_ptr_0
2994      and load_ptr_1, and if the following condition is satisfied:
2995 
2996      load_ptr_0 < load_ptr_1  &&
2997      load_ptr_1 - load_ptr_0 - load_segment_length_0 < store_segment_length_0
2998 
2999      (this condition means that, in each iteration of the vectorized loop,
3000      the memory accessed through store_ptr_0 cannot lie between the memory
3001      of load_ptr_0 and load_ptr_1.)
3002 
3003      we can then use only the following expression to finish the
3004      aliasing checks between store_ptr_0 & load_ptr_0 and
3005      store_ptr_0 & load_ptr_1:
3006 
3007      ((store_ptr_0 + store_segment_length_0) <= load_ptr_0)
3008      || ((load_ptr_1 + load_segment_length_1) <= store_ptr_0)
3009 
3010      Note that we only consider this merging when load_ptr_0 and load_ptr_1
3011      have the same base address.  */
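
  /* Illustrative numbers (editorial sketch, not from the original source):
     with load_ptr_1 == load_ptr_0 + 16, load_segment_length_0 == 16 and
     store_segment_length_0 == 16, the condition above holds because
     16 - 16 == 0 < 16, so the two checks against store_ptr_0 collapse into
     the single combined check shown above, which covers the whole range
     [load_ptr_0, load_ptr_1 + load_segment_length_1).  */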
3012 
3013   comp_alias_ddrs.create (may_alias_ddrs.length ());
3014 
3015   /* First, we collect all data ref pairs for aliasing checks.  */
3016   FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr)
3017     {
3018       struct data_reference *dr_a, *dr_b;
3019       gimple *dr_group_first_a, *dr_group_first_b;
3020       tree segment_length_a, segment_length_b;
3021       gimple *stmt_a, *stmt_b;
3022 
3023       dr_a = DDR_A (ddr);
3024       stmt_a = DR_STMT (DDR_A (ddr));
3025       dr_group_first_a = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_a));
3026       if (dr_group_first_a)
3027 	{
3028 	  stmt_a = dr_group_first_a;
3029 	  dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_a));
3030 	}
3031 
3032       dr_b = DDR_B (ddr);
3033       stmt_b = DR_STMT (DDR_B (ddr));
3034       dr_group_first_b = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_b));
3035       if (dr_group_first_b)
3036 	{
3037 	  stmt_b = dr_group_first_b;
3038 	  dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_b));
3039 	}
3040 
3041       if (!operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0))
3042 	length_factor = scalar_loop_iters;
3043       else
3044 	length_factor = size_int (vect_factor);
3045       segment_length_a = vect_vfa_segment_size (dr_a, length_factor);
3046       segment_length_b = vect_vfa_segment_size (dr_b, length_factor);
3047 
3048       dr_with_seg_len_pair_t dr_with_seg_len_pair
3049 	  (dr_with_seg_len (dr_a, segment_length_a),
3050 	   dr_with_seg_len (dr_b, segment_length_b));
3051 
3052       if (compare_tree (DR_BASE_ADDRESS (dr_a), DR_BASE_ADDRESS (dr_b)) > 0)
3053 	std::swap (dr_with_seg_len_pair.first, dr_with_seg_len_pair.second);
3054 
3055       comp_alias_ddrs.safe_push (dr_with_seg_len_pair);
3056     }
3057 
3058   /* Second, we sort the collected data ref pairs so that we can scan
3059      them once to combine all possible aliasing checks.  */
3060   comp_alias_ddrs.qsort (comp_dr_with_seg_len_pair);
3061 
3062   /* Third, we scan the sorted dr pairs and check if we can combine
3063      alias checks of two neighboring dr pairs.  */
3064   for (size_t i = 1; i < comp_alias_ddrs.length (); ++i)
3065     {
3066       /* Deal with two ddrs (dr_a1, dr_b1) and (dr_a2, dr_b2).  */
3067       dr_with_seg_len *dr_a1 = &comp_alias_ddrs[i-1].first,
3068 		      *dr_b1 = &comp_alias_ddrs[i-1].second,
3069 		      *dr_a2 = &comp_alias_ddrs[i].first,
3070 		      *dr_b2 = &comp_alias_ddrs[i].second;
3071 
3072       /* Remove duplicate data ref pairs.  */
3073       if (*dr_a1 == *dr_a2 && *dr_b1 == *dr_b2)
3074 	{
3075 	  if (dump_enabled_p ())
3076 	    {
3077 	      dump_printf_loc (MSG_NOTE, vect_location,
3078 			       "found equal ranges ");
3079 	      dump_generic_expr (MSG_NOTE, TDF_SLIM,
3080 				 DR_REF (dr_a1->dr));
3081 	      dump_printf (MSG_NOTE,  ", ");
3082 	      dump_generic_expr (MSG_NOTE, TDF_SLIM,
3083 				 DR_REF (dr_b1->dr));
3084 	      dump_printf (MSG_NOTE,  " and ");
3085 	      dump_generic_expr (MSG_NOTE, TDF_SLIM,
3086 				 DR_REF (dr_a2->dr));
3087 	      dump_printf (MSG_NOTE,  ", ");
3088 	      dump_generic_expr (MSG_NOTE, TDF_SLIM,
3089 				 DR_REF (dr_b2->dr));
3090 	      dump_printf (MSG_NOTE, "\n");
3091 	    }
3092 
3093 	  comp_alias_ddrs.ordered_remove (i--);
3094 	  continue;
3095 	}
3096 
3097       if (*dr_a1 == *dr_a2 || *dr_b1 == *dr_b2)
3098 	{
3099 	  /* We consider the case where DR_B1 and DR_B2 are the same memref,
3100 	     and DR_A1 and DR_A2 are two consecutive memrefs.  */
3101 	  if (*dr_a1 == *dr_a2)
3102 	    {
3103 	      std::swap (dr_a1, dr_b1);
3104 	      std::swap (dr_a2, dr_b2);
3105 	    }
3106 
3107 	  if (!operand_equal_p (DR_BASE_ADDRESS (dr_a1->dr),
3108 				DR_BASE_ADDRESS (dr_a2->dr),
3109 				0)
3110 	      || !tree_fits_shwi_p (dr_a1->offset)
3111 	      || !tree_fits_shwi_p (dr_a2->offset))
3112 	    continue;
3113 
3114 	  /* Make sure dr_a1 starts left of dr_a2.  */
3115 	  if (tree_int_cst_lt (dr_a2->offset, dr_a1->offset))
3116 	    std::swap (*dr_a1, *dr_a2);
3117 
3118 	  unsigned HOST_WIDE_INT diff
3119 	    = tree_to_shwi (dr_a2->offset) - tree_to_shwi (dr_a1->offset);
3120 
3121 
3122 	  bool do_remove = false;
3123 
3124 	  /* If the left segment does not extend beyond the start of the
3125 	     right segment the new segment length is that of the right
3126 	     plus the segment distance.  */
3127 	  if (tree_fits_uhwi_p (dr_a1->seg_len)
3128 	      && compare_tree_int (dr_a1->seg_len, diff) <= 0)
3129 	    {
3130 	      dr_a1->seg_len = size_binop (PLUS_EXPR, dr_a2->seg_len,
3131 					   size_int (diff));
3132 	      do_remove = true;
3133 	    }
3134 	  /* Generally the new segment length is the maximum of the
3135 	     left segment size and the right segment size plus the distance.
3136 	     ???  We can also build tree MAX_EXPR here but it's not clear this
3137 	     is profitable.  */
3138 	  else if (tree_fits_uhwi_p (dr_a1->seg_len)
3139 		   && tree_fits_uhwi_p (dr_a2->seg_len))
3140 	    {
3141 	      unsigned HOST_WIDE_INT seg_len_a1 = tree_to_uhwi (dr_a1->seg_len);
3142 	      unsigned HOST_WIDE_INT seg_len_a2 = tree_to_uhwi (dr_a2->seg_len);
3143 	      dr_a1->seg_len = size_int (MAX (seg_len_a1, diff + seg_len_a2));
3144 	      do_remove = true;
3145 	    }
3146 	  /* Now we check if the following condition is satisfied:
3147 
3148 	     DIFF - SEGMENT_LENGTH_A < SEGMENT_LENGTH_B
3149 
3150 	     where DIFF = DR_A2->OFFSET - DR_A1->OFFSET.  However,
3151 	     SEGMENT_LENGTH_A or SEGMENT_LENGTH_B may not be constant, so we
3152 	     have to make a best estimate.  We can get the minimum value
3153 	     of SEGMENT_LENGTH_B as a constant, represented by MIN_SEG_LEN_B,
3154 	     then either of the following two conditions can guarantee the
3155 	     one above:
3156 
3157 	     1: DIFF <= MIN_SEG_LEN_B
3158 	     2: DIFF - SEGMENT_LENGTH_A < MIN_SEG_LEN_B  */
3159 	  else
3160 	    {
3161 	      unsigned HOST_WIDE_INT min_seg_len_b
3162 		= (tree_fits_uhwi_p (dr_b1->seg_len)
3163 		   ? tree_to_uhwi (dr_b1->seg_len)
3164 		   : vect_factor);
3165 
3166 	      if (diff <= min_seg_len_b
3167 		  || (tree_fits_uhwi_p (dr_a1->seg_len)
3168 		      && diff - tree_to_uhwi (dr_a1->seg_len) < min_seg_len_b))
3169 		{
3170 		  dr_a1->seg_len = size_binop (PLUS_EXPR,
3171 					       dr_a2->seg_len, size_int (diff));
3172 		  do_remove = true;
3173 		}
3174 	    }
3175 
3176 	  if (do_remove)
3177 	    {
3178 	      if (dump_enabled_p ())
3179 		{
3180 		  dump_printf_loc (MSG_NOTE, vect_location,
3181 				   "merging ranges for ");
3182 		  dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a1->dr));
3183 		  dump_printf (MSG_NOTE,  ", ");
3184 		  dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b1->dr));
3185 		  dump_printf (MSG_NOTE,  " and ");
3186 		  dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a2->dr));
3187 		  dump_printf (MSG_NOTE,  ", ");
3188 		  dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b2->dr));
3189 		  dump_printf (MSG_NOTE, "\n");
3190 		}
3191 	      comp_alias_ddrs.ordered_remove (i--);
3192 	    }
3193 	}
3194     }
3195 
3196   dump_printf_loc (MSG_NOTE, vect_location,
3197 		   "improved number of alias checks from %d to %d\n",
3198 		   may_alias_ddrs.length (), comp_alias_ddrs.length ());
3199   if ((int) comp_alias_ddrs.length () >
3200       PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS))
3201     return false;
3202 
3203   return true;
3204 }
3205 
3206 /* Check whether a non-affine read or write in STMT is suitable for a gather
3207    load or scatter store and if so, return a builtin decl for that operation.  */
3208 
3209 tree
3210 vect_check_gather_scatter (gimple *stmt, loop_vec_info loop_vinfo, tree *basep,
3211 			   tree *offp, int *scalep)
3212 {
3213   HOST_WIDE_INT scale = 1, pbitpos, pbitsize;
3214   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3215   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3216   struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3217   tree offtype = NULL_TREE;
3218   tree decl, base, off;
3219   machine_mode pmode;
3220   int punsignedp, reversep, pvolatilep = 0;
3221 
3222   base = DR_REF (dr);
3223   /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
3224      see if we can use the def stmt of the address.  */
3225   if (is_gimple_call (stmt)
3226       && gimple_call_internal_p (stmt)
3227       && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
3228 	  || gimple_call_internal_fn (stmt) == IFN_MASK_STORE)
3229       && TREE_CODE (base) == MEM_REF
3230       && TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME
3231       && integer_zerop (TREE_OPERAND (base, 1))
3232       && !expr_invariant_in_loop_p (loop, TREE_OPERAND (base, 0)))
3233     {
3234       gimple *def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (base, 0));
3235       if (is_gimple_assign (def_stmt)
3236 	  && gimple_assign_rhs_code (def_stmt) == ADDR_EXPR)
3237 	base = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
3238     }
3239 
3240   /* The gather and scatter builtins need address of the form
3241      loop_invariant + vector * {1, 2, 4, 8}
3242      or
3243      loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
3244      Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
3245      of loop invariants/SSA_NAMEs defined in the loop, with casts,
3246      multiplications and additions in it.  To get a vector, we need
3247      a single SSA_NAME that will be defined in the loop and will
3248      contain everything that is not loop invariant and that can be
3249      vectorized.  The following code attempts to find such a preexisting
3250      SSA_NAME OFF and put the loop invariants into a tree BASE
3251      that can be gimplified before the loop.  */
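  /* Editorial sketch of the intended decomposition, using an assumed scalar
     loop

       for (i = 0; i < n; i++)
	 sum += data[idx[i]];

     here BASE would end up as the loop-invariant start address of data, OFF
     as the SSA_NAME holding the sign-extended idx[i] value, and SCALE as the
     element size (e.g. 8 for double), matching the
     loop_invariant + sign_extend (vector) * {1, 2, 4, 8} form above.  */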
3252   base = get_inner_reference (base, &pbitsize, &pbitpos, &off, &pmode,
3253 			      &punsignedp, &reversep, &pvolatilep, false);
3254   gcc_assert (base && (pbitpos % BITS_PER_UNIT) == 0 && !reversep);
3255 
3256   if (TREE_CODE (base) == MEM_REF)
3257     {
3258       if (!integer_zerop (TREE_OPERAND (base, 1)))
3259 	{
3260 	  if (off == NULL_TREE)
3261 	    {
3262 	      offset_int moff = mem_ref_offset (base);
3263 	      off = wide_int_to_tree (sizetype, moff);
3264 	    }
3265 	  else
3266 	    off = size_binop (PLUS_EXPR, off,
3267 			      fold_convert (sizetype, TREE_OPERAND (base, 1)));
3268 	}
3269       base = TREE_OPERAND (base, 0);
3270     }
3271   else
3272     base = build_fold_addr_expr (base);
3273 
3274   if (off == NULL_TREE)
3275     off = size_zero_node;
3276 
3277   /* If BASE is not loop invariant, then if OFF is 0 we start with just the
3278      constant offset in the loop invariant BASE and continue with BASE as
3279      OFF; otherwise we give up.
3280      We could handle the latter case by gimplifying the addition of base + off
3281      into some SSA_NAME and using that as off, but for now punt.  */
3282   if (!expr_invariant_in_loop_p (loop, base))
3283     {
3284       if (!integer_zerop (off))
3285 	return NULL_TREE;
3286       off = base;
3287       base = size_int (pbitpos / BITS_PER_UNIT);
3288     }
3289   /* Otherwise put base + constant offset into the loop invariant BASE
3290      and continue with OFF.  */
3291   else
3292     {
3293       base = fold_convert (sizetype, base);
3294       base = size_binop (PLUS_EXPR, base, size_int (pbitpos / BITS_PER_UNIT));
3295     }
3296 
3297   /* OFF at this point may be either an SSA_NAME or some tree expression
3298      from get_inner_reference.  Try to peel off loop invariants from it
3299      into BASE as long as possible.  */
3300   STRIP_NOPS (off);
3301   while (offtype == NULL_TREE)
3302     {
3303       enum tree_code code;
3304       tree op0, op1, add = NULL_TREE;
3305 
3306       if (TREE_CODE (off) == SSA_NAME)
3307 	{
3308 	  gimple *def_stmt = SSA_NAME_DEF_STMT (off);
3309 
3310 	  if (expr_invariant_in_loop_p (loop, off))
3311 	    return NULL_TREE;
3312 
3313 	  if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
3314 	    break;
3315 
3316 	  op0 = gimple_assign_rhs1 (def_stmt);
3317 	  code = gimple_assign_rhs_code (def_stmt);
3318 	  op1 = gimple_assign_rhs2 (def_stmt);
3319 	}
3320       else
3321 	{
3322 	  if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
3323 	    return NULL_TREE;
3324 	  code = TREE_CODE (off);
3325 	  extract_ops_from_tree (off, &code, &op0, &op1);
3326 	}
3327       switch (code)
3328 	{
3329 	case POINTER_PLUS_EXPR:
3330 	case PLUS_EXPR:
3331 	  if (expr_invariant_in_loop_p (loop, op0))
3332 	    {
3333 	      add = op0;
3334 	      off = op1;
3335 	    do_add:
3336 	      add = fold_convert (sizetype, add);
3337 	      if (scale != 1)
3338 		add = size_binop (MULT_EXPR, add, size_int (scale));
3339 	      base = size_binop (PLUS_EXPR, base, add);
3340 	      continue;
3341 	    }
3342 	  if (expr_invariant_in_loop_p (loop, op1))
3343 	    {
3344 	      add = op1;
3345 	      off = op0;
3346 	      goto do_add;
3347 	    }
3348 	  break;
3349 	case MINUS_EXPR:
3350 	  if (expr_invariant_in_loop_p (loop, op1))
3351 	    {
3352 	      add = fold_convert (sizetype, op1);
3353 	      add = size_binop (MINUS_EXPR, size_zero_node, add);
3354 	      off = op0;
3355 	      goto do_add;
3356 	    }
3357 	  break;
3358 	case MULT_EXPR:
3359 	  if (scale == 1 && tree_fits_shwi_p (op1))
3360 	    {
3361 	      scale = tree_to_shwi (op1);
3362 	      off = op0;
3363 	      continue;
3364 	    }
3365 	  break;
3366 	case SSA_NAME:
3367 	  off = op0;
3368 	  continue;
3369 	CASE_CONVERT:
3370 	  if (!POINTER_TYPE_P (TREE_TYPE (op0))
3371 	      && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
3372 	    break;
3373 	  if (TYPE_PRECISION (TREE_TYPE (op0))
3374 	      == TYPE_PRECISION (TREE_TYPE (off)))
3375 	    {
3376 	      off = op0;
3377 	      continue;
3378 	    }
3379 	  if (TYPE_PRECISION (TREE_TYPE (op0))
3380 	      < TYPE_PRECISION (TREE_TYPE (off)))
3381 	    {
3382 	      off = op0;
3383 	      offtype = TREE_TYPE (off);
3384 	      STRIP_NOPS (off);
3385 	      continue;
3386 	    }
3387 	  break;
3388 	default:
3389 	  break;
3390 	}
3391       break;
3392     }
3393 
3394   /* If at the end OFF still isn't an SSA_NAME or isn't
3395      defined in the loop, punt.  */
3396   if (TREE_CODE (off) != SSA_NAME
3397       || expr_invariant_in_loop_p (loop, off))
3398     return NULL_TREE;
3399 
3400   if (offtype == NULL_TREE)
3401     offtype = TREE_TYPE (off);
3402 
3403   if (DR_IS_READ (dr))
3404     decl = targetm.vectorize.builtin_gather (STMT_VINFO_VECTYPE (stmt_info),
3405 					     offtype, scale);
3406   else
3407     decl = targetm.vectorize.builtin_scatter (STMT_VINFO_VECTYPE (stmt_info),
3408 					      offtype, scale);
3409 
3410   if (decl == NULL_TREE)
3411     return NULL_TREE;
3412 
3413   if (basep)
3414     *basep = base;
3415   if (offp)
3416     *offp = off;
3417   if (scalep)
3418     *scalep = scale;
3419   return decl;
3420 }
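
/* Editorial note: the BASEP, OFFP and SCALEP outputs above are optional; a
   caller that only needs to know whether a gather/scatter form exists may
   pass NULL for the values it does not use, as in (sketch of the call made
   from vect_analyze_data_refs below, with vinfo cast to a loop_vec_info):

     vect_check_gather_scatter (stmt, loop_vinfo, NULL, &off, NULL)

   where only the offset is then inspected to verify it has a vector type.  */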
3421 
3422 /* Function vect_analyze_data_refs.
3423 
3424    Find all the data references in the loop or basic block.
3425 
3426    The general structure of the analysis of data refs in the vectorizer is as
3427    follows:
3428    1- vect_analyze_data_refs(loop/bb): call
3429       compute_data_dependences_for_loop/bb to find and analyze all data-refs
3430       in the loop/bb and their dependences.
3431    2- vect_analyze_dependences(): apply dependence testing using ddrs.
3432    3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
3433    4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
3434 
3435 */
3436 
3437 bool
3438 vect_analyze_data_refs (vec_info *vinfo, int *min_vf)
3439 {
3440   struct loop *loop = NULL;
3441   unsigned int i;
3442   struct data_reference *dr;
3443   tree scalar_type;
3444 
3445   if (dump_enabled_p ())
3446     dump_printf_loc (MSG_NOTE, vect_location,
3447 		     "=== vect_analyze_data_refs ===\n");
3448 
3449   if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
3450     loop = LOOP_VINFO_LOOP (loop_vinfo);
3451 
3452   /* Go through the data-refs, check that the analysis succeeded.  Update
3453      pointer from stmt_vec_info struct to DR and vectype.  */
3454 
3455   vec<data_reference_p> datarefs = vinfo->datarefs;
3456   FOR_EACH_VEC_ELT (datarefs, i, dr)
3457     {
3458       gimple *stmt;
3459       stmt_vec_info stmt_info;
3460       tree base, offset, init;
3461       enum { SG_NONE, GATHER, SCATTER } gatherscatter = SG_NONE;
3462       bool simd_lane_access = false;
3463       int vf;
3464 
3465 again:
3466       if (!dr || !DR_REF (dr))
3467         {
3468           if (dump_enabled_p ())
3469 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3470 	                     "not vectorized: unhandled data-ref\n");
3471           return false;
3472         }
3473 
3474       stmt = DR_STMT (dr);
3475       stmt_info = vinfo_for_stmt (stmt);
3476 
3477       /* Discard clobbers from the dataref vector.  We will remove
3478          clobber stmts during vectorization.  */
3479       if (gimple_clobber_p (stmt))
3480 	{
3481 	  free_data_ref (dr);
3482 	  if (i == datarefs.length () - 1)
3483 	    {
3484 	      datarefs.pop ();
3485 	      break;
3486 	    }
3487 	  datarefs.ordered_remove (i);
3488 	  dr = datarefs[i];
3489 	  goto again;
3490 	}
3491 
3492       /* Check that analysis of the data-ref succeeded.  */
3493       if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
3494 	  || !DR_STEP (dr))
3495         {
3496 	  bool maybe_gather
3497 	    = DR_IS_READ (dr)
3498 	      && !TREE_THIS_VOLATILE (DR_REF (dr))
3499 	      && targetm.vectorize.builtin_gather != NULL;
3500 	  bool maybe_scatter
3501 	    = DR_IS_WRITE (dr)
3502 	      && !TREE_THIS_VOLATILE (DR_REF (dr))
3503 	      && targetm.vectorize.builtin_scatter != NULL;
3504 	  bool maybe_simd_lane_access
3505 	    = is_a <loop_vec_info> (vinfo) && loop->simduid;
3506 
3507 	  /* If the target supports vector gather loads or scatter stores, or if
3508 	     this might be a SIMD lane access, see if one of them can be used.  */
3509 	  if (is_a <loop_vec_info> (vinfo)
3510 	      && (maybe_gather || maybe_scatter || maybe_simd_lane_access)
3511 	      && !nested_in_vect_loop_p (loop, stmt))
3512 	    {
3513 	      struct data_reference *newdr
3514 		= create_data_ref (NULL, loop_containing_stmt (stmt),
3515 				   DR_REF (dr), stmt, !maybe_scatter);
3516 	      gcc_assert (newdr != NULL && DR_REF (newdr));
3517 	      if (DR_BASE_ADDRESS (newdr)
3518 		  && DR_OFFSET (newdr)
3519 		  && DR_INIT (newdr)
3520 		  && DR_STEP (newdr)
3521 		  && integer_zerop (DR_STEP (newdr)))
3522 		{
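		  /* Editorial note: the code below recognizes a DR_OFFSET of
		     the form (possibly widened) simd_lane * element_size,
		     where simd_lane is the result of an IFN_GOMP_SIMD_LANE
		     call on the loop's simduid; such a ref is then treated
		     as a SIMD lane access with the element size as its
		     DR_STEP.  */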
3523 		  if (maybe_simd_lane_access)
3524 		    {
3525 		      tree off = DR_OFFSET (newdr);
3526 		      STRIP_NOPS (off);
3527 		      if (TREE_CODE (DR_INIT (newdr)) == INTEGER_CST
3528 			  && TREE_CODE (off) == MULT_EXPR
3529 			  && tree_fits_uhwi_p (TREE_OPERAND (off, 1)))
3530 			{
3531 			  tree step = TREE_OPERAND (off, 1);
3532 			  off = TREE_OPERAND (off, 0);
3533 			  STRIP_NOPS (off);
3534 			  if (CONVERT_EXPR_P (off)
3535 			      && TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off,
3536 									  0)))
3537 				 < TYPE_PRECISION (TREE_TYPE (off)))
3538 			    off = TREE_OPERAND (off, 0);
3539 			  if (TREE_CODE (off) == SSA_NAME)
3540 			    {
3541 			      gimple *def = SSA_NAME_DEF_STMT (off);
3542 			      tree reft = TREE_TYPE (DR_REF (newdr));
3543 			      if (is_gimple_call (def)
3544 				  && gimple_call_internal_p (def)
3545 				  && (gimple_call_internal_fn (def)
3546 				      == IFN_GOMP_SIMD_LANE))
3547 				{
3548 				  tree arg = gimple_call_arg (def, 0);
3549 				  gcc_assert (TREE_CODE (arg) == SSA_NAME);
3550 				  arg = SSA_NAME_VAR (arg);
3551 				  if (arg == loop->simduid
3552 				      /* For now.  */
3553 				      && tree_int_cst_equal
3554 					   (TYPE_SIZE_UNIT (reft),
3555 					    step))
3556 				    {
3557 				      DR_OFFSET (newdr) = ssize_int (0);
3558 				      DR_STEP (newdr) = step;
3559 				      DR_ALIGNED_TO (newdr)
3560 					= size_int (BIGGEST_ALIGNMENT);
3561 				      dr = newdr;
3562 				      simd_lane_access = true;
3563 				    }
3564 				}
3565 			    }
3566 			}
3567 		    }
3568 		  if (!simd_lane_access && (maybe_gather || maybe_scatter))
3569 		    {
3570 		      dr = newdr;
3571 		      if (maybe_gather)
3572 			gatherscatter = GATHER;
3573 		      else
3574 			gatherscatter = SCATTER;
3575 		    }
3576 		}
3577 	      if (gatherscatter == SG_NONE && !simd_lane_access)
3578 		free_data_ref (newdr);
3579 	    }
3580 
3581 	  if (gatherscatter == SG_NONE && !simd_lane_access)
3582 	    {
3583 	      if (dump_enabled_p ())
3584 		{
3585 		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3586                                    "not vectorized: data ref analysis "
3587                                    "failed ");
3588 		  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3589                   dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3590 		}
3591 
3592 	      if (is_a <bb_vec_info> (vinfo))
3593 		break;
3594 
3595 	      return false;
3596 	    }
3597         }
3598 
3599       if (TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST)
3600         {
3601           if (dump_enabled_p ())
3602             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3603                              "not vectorized: base addr of dr is a "
3604                              "constant\n");
3605 
3606           if (is_a <bb_vec_info> (vinfo))
3607 	    break;
3608 
3609 	  if (gatherscatter != SG_NONE || simd_lane_access)
3610 	    free_data_ref (dr);
3611 	  return false;
3612         }
3613 
3614       if (TREE_THIS_VOLATILE (DR_REF (dr)))
3615         {
3616           if (dump_enabled_p ())
3617             {
3618               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3619                                "not vectorized: volatile type ");
3620               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3621               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3622             }
3623 
3624           if (is_a <bb_vec_info> (vinfo))
3625 	    break;
3626 
3627           return false;
3628         }
3629 
3630       if (stmt_can_throw_internal (stmt))
3631         {
3632           if (dump_enabled_p ())
3633             {
3634               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3635                                "not vectorized: statement can throw an "
3636                                "exception ");
3637               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3638               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3639             }
3640 
3641           if (is_a <bb_vec_info> (vinfo))
3642 	    break;
3643 
3644 	  if (gatherscatter != SG_NONE || simd_lane_access)
3645 	    free_data_ref (dr);
3646           return false;
3647         }
3648 
3649       if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
3650 	  && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
3651 	{
3652           if (dump_enabled_p ())
3653             {
3654               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3655                                "not vectorized: statement is bitfield "
3656                                "access ");
3657               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3658               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3659             }
3660 
3661           if (is_a <bb_vec_info> (vinfo))
3662 	    break;
3663 
3664 	  if (gatherscatter != SG_NONE || simd_lane_access)
3665 	    free_data_ref (dr);
3666           return false;
3667 	}
3668 
3669       base = unshare_expr (DR_BASE_ADDRESS (dr));
3670       offset = unshare_expr (DR_OFFSET (dr));
3671       init = unshare_expr (DR_INIT (dr));
3672 
3673       if (is_gimple_call (stmt)
3674 	  && (!gimple_call_internal_p (stmt)
3675 	      || (gimple_call_internal_fn (stmt) != IFN_MASK_LOAD
3676 		  && gimple_call_internal_fn (stmt) != IFN_MASK_STORE)))
3677 	{
3678 	  if (dump_enabled_p ())
3679 	    {
3680 	      dump_printf_loc (MSG_MISSED_OPTIMIZATION,  vect_location,
3681 	                       "not vectorized: dr in a call ");
3682 	      dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3683 	      dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3684 	    }
3685 
3686 	  if (is_a <bb_vec_info> (vinfo))
3687 	    break;
3688 
3689 	  if (gatherscatter != SG_NONE || simd_lane_access)
3690 	    free_data_ref (dr);
3691 	  return false;
3692 	}
3693 
3694       /* Update DR field in stmt_vec_info struct.  */
3695 
3696       /* If the dataref is in an inner-loop of the loop that is considered
3697 	 for vectorization, we also want to analyze the access relative to
3698 	 the outer-loop (DR contains information only relative to the
3699 	 inner-most enclosing loop).  We do that by building a reference to the
3700 	 first location accessed by the inner-loop, and analyze it relative to
3701 	 the outer-loop.  */
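      /* Editorial sketch: for an access in[i+j] in the inner j-loop of a
	 vectorized outer i-loop, with 'in' pointing to short (the same
	 example used for vect_create_addr_base_for_vector_ref below), the
	 first inner-loop location advances by one short per outer iteration,
	 so the analysis below would record an outer step of 2 bytes.  */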
3702       if (loop && nested_in_vect_loop_p (loop, stmt))
3703 	{
3704 	  tree outer_step, outer_base, outer_init;
3705 	  HOST_WIDE_INT pbitsize, pbitpos;
3706 	  tree poffset;
3707 	  machine_mode pmode;
3708 	  int punsignedp, preversep, pvolatilep;
3709 	  affine_iv base_iv, offset_iv;
3710 	  tree dinit;
3711 
3712 	  /* Build a reference to the first location accessed by the
3713 	     inner-loop: *(BASE+INIT).  (The first location is actually
3714 	     BASE+INIT+OFFSET, but we add OFFSET separately later).  */
3715           tree inner_base = build_fold_indirect_ref
3716                                 (fold_build_pointer_plus (base, init));
3717 
3718 	  if (dump_enabled_p ())
3719 	    {
3720 	      dump_printf_loc (MSG_NOTE, vect_location,
3721                                "analyze in outer-loop: ");
3722 	      dump_generic_expr (MSG_NOTE, TDF_SLIM, inner_base);
3723 	      dump_printf (MSG_NOTE, "\n");
3724 	    }
3725 
3726 	  outer_base = get_inner_reference (inner_base, &pbitsize, &pbitpos,
3727 					    &poffset, &pmode, &punsignedp,
3728 					    &preversep, &pvolatilep, false);
3729 	  gcc_assert (outer_base != NULL_TREE);
3730 
3731 	  if (pbitpos % BITS_PER_UNIT != 0)
3732 	    {
3733 	      if (dump_enabled_p ())
3734 		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3735                                  "failed: bit offset alignment.\n");
3736 	      return false;
3737 	    }
3738 
3739 	  if (preversep)
3740 	    {
3741 	      if (dump_enabled_p ())
3742 		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3743 				 "failed: reverse storage order.\n");
3744 	      return false;
3745 	    }
3746 
3747 	  outer_base = build_fold_addr_expr (outer_base);
3748 	  if (!simple_iv (loop, loop_containing_stmt (stmt), outer_base,
3749                           &base_iv, false))
3750 	    {
3751 	      if (dump_enabled_p ())
3752 		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3753                                  "failed: evolution of base is not affine.\n");
3754 	      return false;
3755 	    }
3756 
3757 	  if (offset)
3758 	    {
3759 	      if (poffset)
3760 		poffset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset), offset,
3761                                        poffset);
3762 	      else
3763 		poffset = offset;
3764 	    }
3765 
3766 	  if (!poffset)
3767 	    {
3768 	      offset_iv.base = ssize_int (0);
3769 	      offset_iv.step = ssize_int (0);
3770 	    }
3771 	  else if (!simple_iv (loop, loop_containing_stmt (stmt), poffset,
3772                                &offset_iv, false))
3773 	    {
3774 	      if (dump_enabled_p ())
3775 	        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3776                                  "evolution of offset is not affine.\n");
3777 	      return false;
3778 	    }
3779 
3780 	  outer_init = ssize_int (pbitpos / BITS_PER_UNIT);
3781 	  split_constant_offset (base_iv.base, &base_iv.base, &dinit);
3782 	  outer_init =  size_binop (PLUS_EXPR, outer_init, dinit);
3783 	  split_constant_offset (offset_iv.base, &offset_iv.base, &dinit);
3784 	  outer_init =  size_binop (PLUS_EXPR, outer_init, dinit);
3785 
3786 	  outer_step = size_binop (PLUS_EXPR,
3787 				fold_convert (ssizetype, base_iv.step),
3788 				fold_convert (ssizetype, offset_iv.step));
3789 
3790 	  STMT_VINFO_DR_STEP (stmt_info) = outer_step;
3791 	  /* FIXME: Use canonicalize_base_object_address (base_iv.base); */
3792 	  STMT_VINFO_DR_BASE_ADDRESS (stmt_info) = base_iv.base;
3793 	  STMT_VINFO_DR_INIT (stmt_info) = outer_init;
3794 	  STMT_VINFO_DR_OFFSET (stmt_info) =
3795 				fold_convert (ssizetype, offset_iv.base);
3796 	  STMT_VINFO_DR_ALIGNED_TO (stmt_info) =
3797 				size_int (highest_pow2_factor (offset_iv.base));
3798 
3799           if (dump_enabled_p ())
3800 	    {
3801 	      dump_printf_loc (MSG_NOTE, vect_location,
3802                                "\touter base_address: ");
3803 	      dump_generic_expr (MSG_NOTE, TDF_SLIM,
3804                                  STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
3805 	      dump_printf (MSG_NOTE, "\n\touter offset from base address: ");
3806 	      dump_generic_expr (MSG_NOTE, TDF_SLIM,
3807                                  STMT_VINFO_DR_OFFSET (stmt_info));
3808 	      dump_printf (MSG_NOTE,
3809                            "\n\touter constant offset from base address: ");
3810 	      dump_generic_expr (MSG_NOTE, TDF_SLIM,
3811                                  STMT_VINFO_DR_INIT (stmt_info));
3812 	      dump_printf (MSG_NOTE, "\n\touter step: ");
3813 	      dump_generic_expr (MSG_NOTE, TDF_SLIM,
3814                                  STMT_VINFO_DR_STEP (stmt_info));
3815 	      dump_printf (MSG_NOTE, "\n\touter aligned to: ");
3816 	      dump_generic_expr (MSG_NOTE, TDF_SLIM,
3817                                  STMT_VINFO_DR_ALIGNED_TO (stmt_info));
3818 	      dump_printf (MSG_NOTE, "\n");
3819 	    }
3820 	}
3821 
3822       if (STMT_VINFO_DATA_REF (stmt_info))
3823         {
3824           if (dump_enabled_p ())
3825             {
3826               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3827                                "not vectorized: more than one data ref "
3828                                "in stmt: ");
3829               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3830               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3831             }
3832 
3833           if (is_a <bb_vec_info> (vinfo))
3834 	    break;
3835 
3836 	  if (gatherscatter != SG_NONE || simd_lane_access)
3837 	    free_data_ref (dr);
3838           return false;
3839         }
3840 
3841       STMT_VINFO_DATA_REF (stmt_info) = dr;
3842       if (simd_lane_access)
3843 	{
3844 	  STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) = true;
3845 	  free_data_ref (datarefs[i]);
3846 	  datarefs[i] = dr;
3847 	}
3848 
3849       /* Set vectype for STMT.  */
3850       scalar_type = TREE_TYPE (DR_REF (dr));
3851       STMT_VINFO_VECTYPE (stmt_info)
3852 	= get_vectype_for_scalar_type (scalar_type);
3853       if (!STMT_VINFO_VECTYPE (stmt_info))
3854         {
3855           if (dump_enabled_p ())
3856             {
3857               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3858                                "not vectorized: no vectype for stmt: ");
3859               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3860               dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: ");
3861               dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS,
3862                                  scalar_type);
3863               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3864             }
3865 
3866           if (is_a <bb_vec_info> (vinfo))
3867 	    {
3868 	      /* Having no vector type is fine; the ref can still participate
3869 	         in dependence analysis, we just can't vectorize it.  */
3870 	      STMT_VINFO_VECTORIZABLE (stmt_info) = false;
3871 	      continue;
3872 	    }
3873 
3874 	  if (gatherscatter != SG_NONE || simd_lane_access)
3875 	    {
3876 	      STMT_VINFO_DATA_REF (stmt_info) = NULL;
3877 	      if (gatherscatter != SG_NONE)
3878 		free_data_ref (dr);
3879 	    }
3880 	  return false;
3881         }
3882       else
3883 	{
3884 	  if (dump_enabled_p ())
3885 	    {
3886 	      dump_printf_loc (MSG_NOTE, vect_location,
3887 			       "got vectype for stmt: ");
3888 	      dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
3889 	      dump_generic_expr (MSG_NOTE, TDF_SLIM,
3890 				 STMT_VINFO_VECTYPE (stmt_info));
3891 	      dump_printf (MSG_NOTE, "\n");
3892 	    }
3893 	}
3894 
3895       /* Adjust the minimal vectorization factor according to the
3896 	 vector type.  */
3897       vf = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
3898       if (vf > *min_vf)
3899 	*min_vf = vf;
3900 
3901       if (gatherscatter != SG_NONE)
3902 	{
3903 	  tree off;
3904 	  if (!vect_check_gather_scatter (stmt, as_a <loop_vec_info> (vinfo),
3905 					  NULL, &off, NULL)
3906 	      || get_vectype_for_scalar_type (TREE_TYPE (off)) == NULL_TREE)
3907 	    {
3908 	      STMT_VINFO_DATA_REF (stmt_info) = NULL;
3909 	      free_data_ref (dr);
3910 	      if (dump_enabled_p ())
3911 		{
3912 		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3913 				   (gatherscatter == GATHER) ?
3914 				   "not vectorized: not suitable for gather "
3915 				   "load " :
3916 				   "not vectorized: not suitable for scatter "
3917 				   "store ");
3918 		  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3919                   dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3920 		}
3921 	      return false;
3922 	    }
3923 
3924 	  free_data_ref (datarefs[i]);
3925 	  datarefs[i] = dr;
3926 	  STMT_VINFO_GATHER_SCATTER_P (stmt_info) = gatherscatter;
3927 	}
3928 
3929       else if (is_a <loop_vec_info> (vinfo)
3930 	       && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
3931 	{
3932 	  if (nested_in_vect_loop_p (loop, stmt))
3933 	    {
3934 	      if (dump_enabled_p ())
3935 		{
3936 		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3937                                    "not vectorized: not suitable for strided "
3938                                    "load ");
3939 		  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3940                   dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3941 		}
3942 	      return false;
3943 	    }
3944 	  STMT_VINFO_STRIDED_P (stmt_info) = true;
3945 	}
3946     }
3947 
3948   /* If we stopped analysis at the first dataref we could not analyze
3949      while trying to vectorize a basic block, mark the rest of the datarefs
3950      as not vectorizable and truncate the vector of datarefs.  That
3951      avoids spending useless time analyzing their dependences.  */
3952   if (i != datarefs.length ())
3953     {
3954       gcc_assert (is_a <bb_vec_info> (vinfo));
3955       for (unsigned j = i; j < datarefs.length (); ++j)
3956 	{
3957 	  data_reference_p dr = datarefs[j];
3958           STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
3959 	  free_data_ref (dr);
3960 	}
3961       datarefs.truncate (i);
3962     }
3963 
3964   return true;
3965 }
3966 
3967 
3968 /* Function vect_get_new_vect_var.
3969 
3970    Returns a name for a new variable.  The current naming scheme prepends a
3971    prefix such as "vect", "stmp", "mask" or "vectp" (depending on the value
3972    of VAR_KIND) to the name of vectorizer generated variables, joining it to
3973    NAME with an underscore if NAME is provided.  */
3974 
3975 tree
3976 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
3977 {
3978   const char *prefix;
3979   tree new_vect_var;
3980 
3981   switch (var_kind)
3982   {
3983   case vect_simple_var:
3984     prefix = "vect";
3985     break;
3986   case vect_scalar_var:
3987     prefix = "stmp";
3988     break;
3989   case vect_mask_var:
3990     prefix = "mask";
3991     break;
3992   case vect_pointer_var:
3993     prefix = "vectp";
3994     break;
3995   default:
3996     gcc_unreachable ();
3997   }
3998 
3999   if (name)
4000     {
4001       char* tmp = concat (prefix, "_", name, NULL);
4002       new_vect_var = create_tmp_reg (type, tmp);
4003       free (tmp);
4004     }
4005   else
4006     new_vect_var = create_tmp_reg (type, prefix);
4007 
4008   return new_vect_var;
4009 }
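
/* Usage sketch (editorial, hypothetical argument names): calling
   vect_get_new_vect_var (ptr_type, vect_pointer_var, "a") creates a temporary
   whose name is based on "vectp_a", while passing a NULL name yields just the
   "vectp" prefix, mirroring the switch above.  */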
4010 
4011 /* Like vect_get_new_vect_var but return an SSA name.  */
4012 
4013 tree
4014 vect_get_new_ssa_name (tree type, enum vect_var_kind var_kind, const char *name)
4015 {
4016   const char *prefix;
4017   tree new_vect_var;
4018 
4019   switch (var_kind)
4020   {
4021   case vect_simple_var:
4022     prefix = "vect";
4023     break;
4024   case vect_scalar_var:
4025     prefix = "stmp";
4026     break;
4027   case vect_pointer_var:
4028     prefix = "vectp";
4029     break;
4030   default:
4031     gcc_unreachable ();
4032   }
4033 
4034   if (name)
4035     {
4036       char* tmp = concat (prefix, "_", name, NULL);
4037       new_vect_var = make_temp_ssa_name (type, NULL, tmp);
4038       free (tmp);
4039     }
4040   else
4041     new_vect_var = make_temp_ssa_name (type, NULL, prefix);
4042 
4043   return new_vect_var;
4044 }
4045 
4046 /* Duplicate ptr info and set alignment/misalignment on NAME from DR.  */
4047 
4048 static void
4049 vect_duplicate_ssa_name_ptr_info (tree name, data_reference *dr,
4050 				  stmt_vec_info stmt_info)
4051 {
4052   duplicate_ssa_name_ptr_info (name, DR_PTR_INFO (dr));
4053   unsigned int align = TYPE_ALIGN_UNIT (STMT_VINFO_VECTYPE (stmt_info));
4054   int misalign = DR_MISALIGNMENT (dr);
4055   if (misalign == -1)
4056     mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name));
4057   else
4058     set_ptr_info_alignment (SSA_NAME_PTR_INFO (name), align, misalign);
4059 }
4060 
4061 /* Function vect_create_addr_base_for_vector_ref.
4062 
4063    Create an expression that computes the address of the first memory location
4064    that will be accessed for a data reference.
4065 
4066    Input:
4067    STMT: The statement containing the data reference.
4068    NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
4069    OFFSET: Optional.  If supplied, it is added to the initial address.
4070    LOOP:    Specify relative to which loop-nest should the address be computed.
4071             For example, when the dataref is in an inner-loop nested in an
4072 	    outer-loop that is now being vectorized, LOOP can be either the
4073 	    outer-loop, or the inner-loop.  The first memory location accessed
4074 	    by the following dataref ('in' points to short):
4075 
4076 		for (i=0; i<N; i++)
4077 		   for (j=0; j<M; j++)
4078 		     s += in[i+j]
4079 
4080 	    is as follows:
4081 	    if LOOP=i_loop:	&in		(relative to i_loop)
4082 	    if LOOP=j_loop: 	&in+i*2B	(relative to j_loop)
4083    BYTE_OFFSET: Optional, defaulted to NULL.  If supplied, it is added to the
4084 	    initial address.  Unlike OFFSET, which is a number of elements to
4085 	    be added, BYTE_OFFSET is measured in bytes.
4086 
4087    Output:
4088    1. Return an SSA_NAME whose value is the address of the memory location of
4089       the first vector of the data reference.
4090    2. If new_stmt_list is not NULL_TREE after return then the caller must insert
4091       these statement(s) which define the returned SSA_NAME.
4092 
4093    FORNOW: We are only handling array accesses with step 1.  */
4094 
4095 tree
4096 vect_create_addr_base_for_vector_ref (gimple *stmt,
4097 				      gimple_seq *new_stmt_list,
4098 				      tree offset,
4099 				      struct loop *loop,
4100 				      tree byte_offset)
4101 {
4102   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4103   struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4104   tree data_ref_base;
4105   const char *base_name;
4106   tree addr_base;
4107   tree dest;
4108   gimple_seq seq = NULL;
4109   tree base_offset;
4110   tree init;
4111   tree vect_ptr_type;
4112   tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
4113   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4114 
4115   if (loop_vinfo && loop && loop != (gimple_bb (stmt))->loop_father)
4116     {
4117       struct loop *outer_loop = LOOP_VINFO_LOOP (loop_vinfo);
4118 
4119       gcc_assert (nested_in_vect_loop_p (outer_loop, stmt));
4120 
4121       data_ref_base = unshare_expr (STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
4122       base_offset = unshare_expr (STMT_VINFO_DR_OFFSET (stmt_info));
4123       init = unshare_expr (STMT_VINFO_DR_INIT (stmt_info));
4124     }
4125   else
4126     {
4127       data_ref_base = unshare_expr (DR_BASE_ADDRESS (dr));
4128       base_offset = unshare_expr (DR_OFFSET (dr));
4129       init = unshare_expr (DR_INIT (dr));
4130     }
4131 
4132   if (loop_vinfo)
4133     base_name = get_name (data_ref_base);
4134   else
4135     {
4136       base_offset = ssize_int (0);
4137       init = ssize_int (0);
4138       base_name = get_name (DR_REF (dr));
4139     }
4140 
4141   /* Create base_offset */
4142   base_offset = size_binop (PLUS_EXPR,
4143 			    fold_convert (sizetype, base_offset),
4144 			    fold_convert (sizetype, init));
4145 
4146   if (offset)
4147     {
4148       offset = fold_build2 (MULT_EXPR, sizetype,
4149 			    fold_convert (sizetype, offset), step);
4150       base_offset = fold_build2 (PLUS_EXPR, sizetype,
4151 				 base_offset, offset);
4152     }
4153   if (byte_offset)
4154     {
4155       byte_offset = fold_convert (sizetype, byte_offset);
4156       base_offset = fold_build2 (PLUS_EXPR, sizetype,
4157 				 base_offset, byte_offset);
4158     }
4159 
4160   /* base + base_offset */
4161   if (loop_vinfo)
4162     addr_base = fold_build_pointer_plus (data_ref_base, base_offset);
4163   else
4164     {
4165       addr_base = build1 (ADDR_EXPR,
4166 			  build_pointer_type (TREE_TYPE (DR_REF (dr))),
4167 			  unshare_expr (DR_REF (dr)));
4168     }
4169 
4170   vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));
4171   dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name);
4172   addr_base = force_gimple_operand (addr_base, &seq, true, dest);
4173   gimple_seq_add_seq (new_stmt_list, seq);
4174 
4175   if (DR_PTR_INFO (dr)
4176       && TREE_CODE (addr_base) == SSA_NAME
4177       && !SSA_NAME_PTR_INFO (addr_base))
4178     {
4179       vect_duplicate_ssa_name_ptr_info (addr_base, dr, stmt_info);
4180       if (offset || byte_offset)
4181 	mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr_base));
4182     }
4183 
4184   if (dump_enabled_p ())
4185     {
4186       dump_printf_loc (MSG_NOTE, vect_location, "created ");
4187       dump_generic_expr (MSG_NOTE, TDF_SLIM, addr_base);
4188       dump_printf (MSG_NOTE, "\n");
4189     }
4190 
4191   return addr_base;
4192 }
4193 
4194 
4195 /* Function vect_create_data_ref_ptr.
4196 
4197    Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
4198    location accessed in the loop by STMT, along with the def-use update
4199    chain to appropriately advance the pointer through the loop iterations.
4200    Also set aliasing information for the pointer.  This pointer is used by
4201    the callers to this function to create a memory reference expression for
4202    vector load/store access.
4203 
4204    Input:
4205    1. STMT: a stmt that references memory. Expected to be of the form
4206          GIMPLE_ASSIGN <name, data-ref> or
4207 	 GIMPLE_ASSIGN <data-ref, name>.
4208    2. AGGR_TYPE: the type of the reference, which should be either a vector
4209         or an array.
4210    3. AT_LOOP: the loop where the vector memref is to be created.
4211    4. OFFSET (optional): an offset to be added to the initial address accessed
4212         by the data-ref in STMT.
4213    5. BSI: location where the new stmts are to be placed if there is no loop
4214    6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
4215         pointing to the initial address.
4216    7. BYTE_OFFSET (optional, defaults to NULL): a byte offset to be added
4217 	to the initial address accessed by the data-ref in STMT.  This is
4218 	similar to OFFSET, but OFFSET is counted in elements, while BYTE_OFFSET
4219 	in bytes.
4220 
4221    Output:
4222    1. Declare a new ptr to vector_type, and have it point to the base of the
4223       data reference (initial addressed accessed by the data reference).
4224       For example, for vector of type V8HI, the following code is generated:
4225 
4226       v8hi *ap;
4227       ap = (v8hi *)initial_address;
4228 
4229       if OFFSET is not supplied:
4230          initial_address = &a[init];
4231       if OFFSET is supplied:
4232          initial_address = &a[init + OFFSET];
4233       if BYTE_OFFSET is supplied:
4234 	 initial_address = &a[init] + BYTE_OFFSET;
4235 
4236       Return the initial_address in INITIAL_ADDRESS.
4237 
4238    2. If ONLY_INIT is true, just return the initial pointer.  Otherwise, also
4239       update the pointer in each iteration of the loop.
4240 
4241       Return the increment stmt that updates the pointer in PTR_INCR.
4242 
4243    3. Set INV_P to true if the access pattern of the data reference in the
4244       vectorized loop is invariant.  Set it to false otherwise.
4245 
4246    4. Return the pointer.  */
4247 
4248 tree
4249 vect_create_data_ref_ptr (gimple *stmt, tree aggr_type, struct loop *at_loop,
4250 			  tree offset, tree *initial_address,
4251 			  gimple_stmt_iterator *gsi, gimple **ptr_incr,
4252 			  bool only_init, bool *inv_p, tree byte_offset)
4253 {
4254   const char *base_name;
4255   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4256   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4257   struct loop *loop = NULL;
4258   bool nested_in_vect_loop = false;
4259   struct loop *containing_loop = NULL;
4260   tree aggr_ptr_type;
4261   tree aggr_ptr;
4262   tree new_temp;
4263   gimple_seq new_stmt_list = NULL;
4264   edge pe = NULL;
4265   basic_block new_bb;
4266   tree aggr_ptr_init;
4267   struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4268   tree aptr;
4269   gimple_stmt_iterator incr_gsi;
4270   bool insert_after;
4271   tree indx_before_incr, indx_after_incr;
4272   gimple *incr;
4273   tree step;
4274   bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
4275 
4276   gcc_assert (TREE_CODE (aggr_type) == ARRAY_TYPE
4277 	      || TREE_CODE (aggr_type) == VECTOR_TYPE);
4278 
4279   if (loop_vinfo)
4280     {
4281       loop = LOOP_VINFO_LOOP (loop_vinfo);
4282       nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
4283       containing_loop = (gimple_bb (stmt))->loop_father;
4284       pe = loop_preheader_edge (loop);
4285     }
4286   else
4287     {
4288       gcc_assert (bb_vinfo);
4289       only_init = true;
4290       *ptr_incr = NULL;
4291     }
4292 
4293   /* Check the step (evolution) of the load in LOOP, and record
4294      whether it's invariant.  */
4295   if (nested_in_vect_loop)
4296     step = STMT_VINFO_DR_STEP (stmt_info);
4297   else
4298     step = DR_STEP (STMT_VINFO_DATA_REF (stmt_info));
4299 
4300   if (integer_zerop (step))
4301     *inv_p = true;
4302   else
4303     *inv_p = false;
4304 
4305   /* Create an expression for the first address accessed by this load
4306      in LOOP.  */
4307   base_name = get_name (DR_BASE_ADDRESS (dr));
4308 
4309   if (dump_enabled_p ())
4310     {
4311       tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
4312       dump_printf_loc (MSG_NOTE, vect_location,
4313                        "create %s-pointer variable to type: ",
4314 		       get_tree_code_name (TREE_CODE (aggr_type)));
4315       dump_generic_expr (MSG_NOTE, TDF_SLIM, aggr_type);
4316       if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
4317         dump_printf (MSG_NOTE, "  vectorizing an array ref: ");
4318       else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)
4319         dump_printf (MSG_NOTE, "  vectorizing a vector ref: ");
4320       else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
4321         dump_printf (MSG_NOTE, "  vectorizing a record based array ref: ");
4322       else
4323         dump_printf (MSG_NOTE, "  vectorizing a pointer ref: ");
4324       dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_BASE_OBJECT (dr));
4325       dump_printf (MSG_NOTE, "\n");
4326     }
4327 
4328   /* (1) Create the new aggregate-pointer variable.
4329      Vector and array types inherit the alias set of their component
4330      type by default so we need to use a ref-all pointer if the data
4331      reference does not conflict with the created aggregated data
4332      reference because it is not addressable.  */
4333   bool need_ref_all = false;
4334   if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4335 			      get_alias_set (DR_REF (dr))))
4336     need_ref_all = true;
4337   /* Likewise for any of the data references in the stmt group.  */
4338   else if (STMT_VINFO_GROUP_SIZE (stmt_info) > 1)
4339     {
4340       gimple *orig_stmt = STMT_VINFO_GROUP_FIRST_ELEMENT (stmt_info);
4341       do
4342 	{
4343 	  stmt_vec_info sinfo = vinfo_for_stmt (orig_stmt);
4344 	  struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo);
4345 	  if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4346 				      get_alias_set (DR_REF (sdr))))
4347 	    {
4348 	      need_ref_all = true;
4349 	      break;
4350 	    }
4351 	  orig_stmt = STMT_VINFO_GROUP_NEXT_ELEMENT (sinfo);
4352 	}
4353       while (orig_stmt);
4354     }
4355   aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode,
4356 					       need_ref_all);
4357   aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name);
4358 
4359 
4360   /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
4361      vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
4362      def-use update cycles for the pointer: one relative to the outer-loop
4363      (LOOP), which is what steps (3) and (4) below do.  The other is relative
4364      to the inner-loop (which is the inner-most loop containing the dataref),
4365      and this is done by step (5) below.
4366 
4367      When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
4368      inner-most loop, and so steps (3),(4) work the same, and step (5) is
4369      redundant.  Steps (3),(4) create the following:
4370 
4371 	vp0 = &base_addr;
4372 	LOOP:	vp1 = phi(vp0,vp2)
4373 		...
4374 		...
4375 		vp2 = vp1 + step
4376 		goto LOOP
4377 
4378      If there is an inner-loop nested in loop, then step (5) will also be
4379      applied, and an additional update in the inner-loop will be created:
4380 
4381 	vp0 = &base_addr;
4382 	LOOP:   vp1 = phi(vp0,vp2)
4383 		...
4384         inner:     vp3 = phi(vp1,vp4)
4385 	           vp4 = vp3 + inner_step
4386 	           if () goto inner
4387 		...
4388 		vp2 = vp1 + step
4389 		if () goto LOOP   */
4390 
4391   /* (2) Calculate the initial address of the aggregate-pointer, and set
4392      the aggregate-pointer to point to it before the loop.  */
4393 
4394   /* Create: (&(base[init_val+offset]+byte_offset) in the loop preheader.  */
4395 
4396   new_temp = vect_create_addr_base_for_vector_ref (stmt, &new_stmt_list,
4397 						   offset, loop, byte_offset);
4398   if (new_stmt_list)
4399     {
4400       if (pe)
4401         {
4402           new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
4403           gcc_assert (!new_bb);
4404         }
4405       else
4406         gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);
4407     }
4408 
4409   *initial_address = new_temp;
4410   aggr_ptr_init = new_temp;
4411 
4412   /* (3) Handle the updating of the aggregate-pointer inside the loop.
4413      This is needed when ONLY_INIT is false, and also when AT_LOOP is the
4414      inner-loop nested in LOOP (during outer-loop vectorization).  */
4415 
4416   /* No update in loop is required.  */
4417   if (only_init && (!loop_vinfo || at_loop == loop))
4418     aptr = aggr_ptr_init;
4419   else
4420     {
4421       /* The step of the aggregate pointer is the type size.  */
4422       tree iv_step = TYPE_SIZE_UNIT (aggr_type);
4423       /* One exception to the above is when the scalar step of the load in
4424 	 LOOP is zero. In this case the step here is also zero.  */
4425       if (*inv_p)
4426 	iv_step = size_zero_node;
4427       else if (tree_int_cst_sgn (step) == -1)
4428 	iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
4429 
4430       standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4431 
4432       create_iv (aggr_ptr_init,
4433 		 fold_convert (aggr_ptr_type, iv_step),
4434 		 aggr_ptr, loop, &incr_gsi, insert_after,
4435 		 &indx_before_incr, &indx_after_incr);
4436       incr = gsi_stmt (incr_gsi);
4437       set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo));
4438 
4439       /* Copy the points-to information if it exists. */
4440       if (DR_PTR_INFO (dr))
4441 	{
4442 	  vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr, stmt_info);
4443 	  vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr, stmt_info);
4444 	}
4445       if (ptr_incr)
4446 	*ptr_incr = incr;
4447 
4448       aptr = indx_before_incr;
4449     }
4450 
4451   if (!nested_in_vect_loop || only_init)
4452     return aptr;
4453 
4454 
4455   /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
4456      nested in LOOP, if one exists.  */
4457 
4458   gcc_assert (nested_in_vect_loop);
4459   if (!only_init)
4460     {
4461       standard_iv_increment_position (containing_loop, &incr_gsi,
4462 				      &insert_after);
4463       create_iv (aptr, fold_convert (aggr_ptr_type, DR_STEP (dr)), aggr_ptr,
4464 		 containing_loop, &incr_gsi, insert_after, &indx_before_incr,
4465 		 &indx_after_incr);
4466       incr = gsi_stmt (incr_gsi);
4467       set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo));
4468 
4469       /* Copy the points-to information if it exists. */
4470       if (DR_PTR_INFO (dr))
4471 	{
4472 	  vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr, stmt_info);
4473 	  vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr, stmt_info);
4474 	}
4475       if (ptr_incr)
4476 	*ptr_incr = incr;
4477 
4478       return indx_before_incr;
4479     }
4480   else
4481     gcc_unreachable ();
4482 }
4483 
4484 
4485 /* Function bump_vector_ptr
4486 
4487    Increment a pointer (to a vector type) by vector-size. If requested,
4488    i.e. if PTR-INCR is given, then also connect the new increment stmt
4489    to the existing def-use update-chain of the pointer, by modifying
4490    the PTR_INCR as illustrated below:
4491 
4492    The pointer def-use update-chain before this function:
4493                         DATAREF_PTR = phi (p_0, p_2)
4494                         ....
4495         PTR_INCR:       p_2 = DATAREF_PTR + step
4496 
4497    The pointer def-use update-chain after this function:
4498                         DATAREF_PTR = phi (p_0, p_2)
4499                         ....
4500                         NEW_DATAREF_PTR = DATAREF_PTR + BUMP
4501                         ....
4502         PTR_INCR:       p_2 = NEW_DATAREF_PTR + step
4503 
4504    Input:
4505    DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
4506                  in the loop.
4507    PTR_INCR - optional. The stmt that updates the pointer in each iteration of
4508 	      the loop.  The increment amount across iterations is expected
4509 	      to be vector_size.
4510    BSI - location where the new update stmt is to be placed.
4511    STMT - the original scalar memory-access stmt that is being vectorized.
4512    BUMP - optional. The offset by which to bump the pointer. If not given,
4513 	  the offset is assumed to be vector_size.
4514 
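   For instance, for a V4SI data reference and no BUMP argument, the update
   generated by this function is a plain POINTER_PLUS_EXPR such as

       vectp.7_10 = vectp.7_8 + 16;

   where the 16-byte offset is TYPE_SIZE_UNIT of the vector type and the SSA
   names are purely illustrative.
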
4515    Output: Return NEW_DATAREF_PTR as illustrated above.
4516 
4517 */
4518 
4519 tree
4520 bump_vector_ptr (tree dataref_ptr, gimple *ptr_incr, gimple_stmt_iterator *gsi,
4521 		 gimple *stmt, tree bump)
4522 {
4523   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4524   struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4525   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4526   tree update = TYPE_SIZE_UNIT (vectype);
4527   gassign *incr_stmt;
4528   ssa_op_iter iter;
4529   use_operand_p use_p;
4530   tree new_dataref_ptr;
4531 
4532   if (bump)
4533     update = bump;
4534 
4535   if (TREE_CODE (dataref_ptr) == SSA_NAME)
4536     new_dataref_ptr = copy_ssa_name (dataref_ptr);
4537   else
4538     new_dataref_ptr = make_ssa_name (TREE_TYPE (dataref_ptr));
4539   incr_stmt = gimple_build_assign (new_dataref_ptr, POINTER_PLUS_EXPR,
4540 				   dataref_ptr, update);
4541   vect_finish_stmt_generation (stmt, incr_stmt, gsi);
4542 
4543   /* Copy the points-to information if it exists. */
4544   if (DR_PTR_INFO (dr))
4545     {
4546       duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
4547       mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
4548     }
4549 
4550   if (!ptr_incr)
4551     return new_dataref_ptr;
4552 
4553   /* Update the vector-pointer's cross-iteration increment.  */
4554   FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
4555     {
4556       tree use = USE_FROM_PTR (use_p);
4557 
4558       if (use == dataref_ptr)
4559         SET_USE (use_p, new_dataref_ptr);
4560       else
4561         gcc_assert (tree_int_cst_compare (use, update) == 0);
4562     }
4563 
4564   return new_dataref_ptr;
4565 }
4566 
4567 
4568 /* Function vect_create_destination_var.
4569 
4570    Create a new temporary of type VECTYPE.  */
4571 
4572 tree
4573 vect_create_destination_var (tree scalar_dest, tree vectype)
4574 {
4575   tree vec_dest;
4576   const char *name;
4577   char *new_name;
4578   tree type;
4579   enum vect_var_kind kind;
4580 
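  /* Pick the kind of temporary: a mask variable for boolean vector types,
     a plain vector variable for other vector types, and a scalar variable
     when no VECTYPE is given.  */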
4581   kind = vectype
4582     ? VECTOR_BOOLEAN_TYPE_P (vectype)
4583     ? vect_mask_var
4584     : vect_simple_var
4585     : vect_scalar_var;
4586   type = vectype ? vectype : TREE_TYPE (scalar_dest);
4587 
4588   gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
4589 
4590   name = get_name (scalar_dest);
4591   if (name)
4592     new_name = xasprintf ("%s_%u", name, SSA_NAME_VERSION (scalar_dest));
4593   else
4594     new_name = xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest));
4595   vec_dest = vect_get_new_vect_var (type, kind, new_name);
4596   free (new_name);
4597 
4598   return vec_dest;
4599 }
4600 
4601 /* Function vect_grouped_store_supported.
4602 
4603    Returns TRUE if interleave high and interleave low permutations
4604    are supported, and FALSE otherwise.  */
4605 
4606 bool
4607 vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
4608 {
4609   machine_mode mode = TYPE_MODE (vectype);
4610 
4611   /* vect_permute_store_chain requires the group size to be equal to 3 or
4612      be a power of two.  */
4613   if (count != 3 && exact_log2 (count) == -1)
4614     {
4615       if (dump_enabled_p ())
4616 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4617 			 "the size of the group of accesses"
4618 			 " is not a power of 2 or not equal to 3\n");
4619       return false;
4620     }
4621 
4622   /* Check that the permutation is supported.  */
4623   if (VECTOR_MODE_P (mode))
4624     {
4625       unsigned int i, nelt = GET_MODE_NUNITS (mode);
4626       unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
4627 
4628       if (count == 3)
4629 	{
4630 	  unsigned int j0 = 0, j1 = 0, j2 = 0;
4631 	  unsigned int i, j;
4632 
4633 	  for (j = 0; j < 3; j++)
4634 	    {
4635 	      int nelt0 = ((3 - j) * nelt) % 3;
4636 	      int nelt1 = ((3 - j) * nelt + 1) % 3;
4637 	      int nelt2 = ((3 - j) * nelt + 2) % 3;
4638 	      for (i = 0; i < nelt; i++)
4639 		{
4640 		  if (3 * i + nelt0 < nelt)
4641 		    sel[3 * i + nelt0] = j0++;
4642 		  if (3 * i + nelt1 < nelt)
4643 		    sel[3 * i + nelt1] = nelt + j1++;
4644 		  if (3 * i + nelt2 < nelt)
4645 		    sel[3 * i + nelt2] = 0;
4646 		}
4647 	      if (!can_vec_perm_p (mode, false, sel))
4648 		{
4649 		  if (dump_enabled_p ())
4650 		    dump_printf (MSG_MISSED_OPTIMIZATION,
4651 				 "permutation op not supported by target.\n");
4652 		  return false;
4653 		}
4654 
4655 	      for (i = 0; i < nelt; i++)
4656 		{
4657 		  if (3 * i + nelt0 < nelt)
4658 		    sel[3 * i + nelt0] = 3 * i + nelt0;
4659 		  if (3 * i + nelt1 < nelt)
4660 		    sel[3 * i + nelt1] = 3 * i + nelt1;
4661 		  if (3 * i + nelt2 < nelt)
4662 		    sel[3 * i + nelt2] = nelt + j2++;
4663 		}
4664 	      if (!can_vec_perm_p (mode, false, sel))
4665 		{
4666 		  if (dump_enabled_p ())
4667 		    dump_printf (MSG_MISSED_OPTIMIZATION,
4668 				 "permutation op not supported by target.\n");
4669 		  return false;
4670 		}
4671 	    }
4672 	  return true;
4673 	}
4674       else
4675 	{
4676 	  /* If length is not equal to 3 then only power of 2 is supported.  */
4677 	  gcc_assert (exact_log2 (count) != -1);
4678 
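	  /* Build the interleave-high selector {0, nelt, 1, nelt + 1, ...};
	     adding nelt/2 to each index below then gives the interleave-low
	     selector.  E.g. for nelt == 8 these are {0, 8, 1, 9, 2, 10, 3, 11}
	     and {4, 12, 5, 13, 6, 14, 7, 15}.  */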
4679 	  for (i = 0; i < nelt / 2; i++)
4680 	    {
4681 	      sel[i * 2] = i;
4682 	      sel[i * 2 + 1] = i + nelt;
4683 	    }
4684 	    if (can_vec_perm_p (mode, false, sel))
4685 	      {
4686 		for (i = 0; i < nelt; i++)
4687 		  sel[i] += nelt / 2;
4688 		if (can_vec_perm_p (mode, false, sel))
4689 		  return true;
4690 	      }
4691 	}
4692     }
4693 
4694   if (dump_enabled_p ())
4695     dump_printf (MSG_MISSED_OPTIMIZATION,
4696 		 "permutation op not supported by target.\n");
4697   return false;
4698 }
4699 
4700 
4701 /* Return TRUE if vec_store_lanes is available for COUNT vectors of
4702    type VECTYPE.  */
4703 
4704 bool
4705 vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
4706 {
4707   return vect_lanes_optab_supported_p ("vec_store_lanes",
4708 				       vec_store_lanes_optab,
4709 				       vectype, count);
4710 }
4711 
4712 
4713 /* Function vect_permute_store_chain.
4714 
4715    Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
4716    a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
4717    the data correctly for the stores.  Return the final references for stores
4718    in RESULT_CHAIN.
4719 
4720    E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
4721    The input is 4 vectors each containing 8 elements.  We assign a number to
4722    each element, the input sequence is:
4723 
4724    1st vec:   0  1  2  3  4  5  6  7
4725    2nd vec:   8  9 10 11 12 13 14 15
4726    3rd vec:  16 17 18 19 20 21 22 23
4727    4th vec:  24 25 26 27 28 29 30 31
4728 
4729    The output sequence should be:
4730 
4731    1st vec:  0  8 16 24  1  9 17 25
4732    2nd vec:  2 10 18 26  3 11 19 27
4733    3rd vec:  4 12 20 28  5 13 21 29
4734    4th vec:  6 14 22 30  7 15 23 31
4735 
4736    i.e., we interleave the contents of the four vectors in their order.
4737 
4738    We use interleave_high/low instructions to create such output.  The input of
4739    each interleave_high/low operation is two vectors:
4740    1st vec    2nd vec
4741    0 1 2 3    4 5 6 7
4742    the even elements of the result vector are obtained left-to-right from the
4743    high/low elements of the first vector.  The odd elements of the result are
4744    obtained left-to-right from the high/low elements of the second vector.
4745    The output of interleave_high will be:   0 4 1 5
4746    and of interleave_low:                   2 6 3 7
4747 
4748 
4749    The permutation is done in log LENGTH stages.  In each stage interleave_high
4750    and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
4751    where the first argument is taken from the first half of DR_CHAIN and the
4752    second argument from its second half.
4753    In our example,
4754 
4755    I1: interleave_high (1st vec, 3rd vec)
4756    I2: interleave_low (1st vec, 3rd vec)
4757    I3: interleave_high (2nd vec, 4th vec)
4758    I4: interleave_low (2nd vec, 4th vec)
4759 
4760    The output for the first stage is:
4761 
4762    I1:  0 16  1 17  2 18  3 19
4763    I2:  4 20  5 21  6 22  7 23
4764    I3:  8 24  9 25 10 26 11 27
4765    I4: 12 28 13 29 14 30 15 31
4766 
4767    The output of the second stage, i.e. the final result is:
4768 
4769    I1:  0  8 16 24  1  9 17 25
4770    I2:  2 10 18 26  3 11 19 27
4771    I3:  4 12 20 28  5 13 21 29
4772    I4:  6 14 22 30  7 15 23 31.  */
4773 
4774 void
4775 vect_permute_store_chain (vec<tree> dr_chain,
4776 			  unsigned int length,
4777 			  gimple *stmt,
4778 			  gimple_stmt_iterator *gsi,
4779 			  vec<tree> *result_chain)
4780 {
4781   tree vect1, vect2, high, low;
4782   gimple *perm_stmt;
4783   tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
4784   tree perm_mask_low, perm_mask_high;
4785   tree data_ref;
4786   tree perm3_mask_low, perm3_mask_high;
4787   unsigned int i, n, log_length = exact_log2 (length);
4788   unsigned int j, nelt = TYPE_VECTOR_SUBPARTS (vectype);
4789   unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
4790 
4791   result_chain->quick_grow (length);
4792   memcpy (result_chain->address (), dr_chain.address (),
4793 	  length * sizeof (tree));
4794 
4795   if (length == 3)
4796     {
4797       unsigned int j0 = 0, j1 = 0, j2 = 0;
4798 
4799       for (j = 0; j < 3; j++)
4800         {
4801 	  int nelt0 = ((3 - j) * nelt) % 3;
4802 	  int nelt1 = ((3 - j) * nelt + 1) % 3;
4803 	  int nelt2 = ((3 - j) * nelt + 2) % 3;
4804 
4805 	  for (i = 0; i < nelt; i++)
4806 	    {
4807 	      if (3 * i + nelt0 < nelt)
4808 		sel[3 * i + nelt0] = j0++;
4809 	      if (3 * i + nelt1 < nelt)
4810 		sel[3 * i + nelt1] = nelt + j1++;
4811 	      if (3 * i + nelt2 < nelt)
4812 		sel[3 * i + nelt2] = 0;
4813 	    }
4814 	  perm3_mask_low = vect_gen_perm_mask_checked (vectype, sel);
4815 
4816 	  for (i = 0; i < nelt; i++)
4817 	    {
4818 	      if (3 * i + nelt0 < nelt)
4819 		sel[3 * i + nelt0] = 3 * i + nelt0;
4820 	      if (3 * i + nelt1 < nelt)
4821 		sel[3 * i + nelt1] = 3 * i + nelt1;
4822 	      if (3 * i + nelt2 < nelt)
4823 		sel[3 * i + nelt2] = nelt + j2++;
4824 	    }
4825 	  perm3_mask_high = vect_gen_perm_mask_checked (vectype, sel);
4826 
4827 	  vect1 = dr_chain[0];
4828 	  vect2 = dr_chain[1];
4829 
4830 	  /* Create interleaving stmt:
4831 	     low = VEC_PERM_EXPR <vect1, vect2,
4832 				  {j, nelt, *, j + 1, nelt + j + 1, *,
4833 				   j + 2, nelt + j + 2, *, ...}>  */
4834 	  data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
4835 	  perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
4836 					   vect2, perm3_mask_low);
4837 	  vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4838 
4839 	  vect1 = data_ref;
4840 	  vect2 = dr_chain[2];
4841 	  /* Create interleaving stmt:
4842 	     low = VEC_PERM_EXPR <vect1, vect2,
4843 				  {0, 1, nelt + j, 3, 4, nelt + j + 1,
4844 				   6, 7, nelt + j + 2, ...}>  */
4845 	  data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
4846 	  perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
4847 					   vect2, perm3_mask_high);
4848 	  vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4849 	  (*result_chain)[j] = data_ref;
4850 	}
4851     }
4852   else
4853     {
4854       /* If length is not equal to 3 then only power of 2 is supported.  */
4855       gcc_assert (exact_log2 (length) != -1);
4856 
4857       for (i = 0, n = nelt / 2; i < n; i++)
4858 	{
4859 	  sel[i * 2] = i;
4860 	  sel[i * 2 + 1] = i + nelt;
4861 	}
4862 	perm_mask_high = vect_gen_perm_mask_checked (vectype, sel);
4863 
4864 	for (i = 0; i < nelt; i++)
4865 	  sel[i] += nelt / 2;
4866 	perm_mask_low = vect_gen_perm_mask_checked (vectype, sel);
4867 
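	/* E.g. for the 8-element vectors in the example above, the
	   interleave-high selector is {0, 8, 1, 9, 2, 10, 3, 11} and the
	   interleave-low selector is {4, 12, 5, 13, 6, 14, 7, 15}.  */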
4868 	for (i = 0, n = log_length; i < n; i++)
4869 	  {
4870 	    for (j = 0; j < length/2; j++)
4871 	      {
4872 		vect1 = dr_chain[j];
4873 		vect2 = dr_chain[j+length/2];
4874 
4875 		/* Create interleaving stmt:
4876 		   high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
4877 							...}>  */
4878 		high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
4879 		perm_stmt = gimple_build_assign (high, VEC_PERM_EXPR, vect1,
4880 						 vect2, perm_mask_high);
4881 		vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4882 		(*result_chain)[2*j] = high;
4883 
4884 		/* Create interleaving stmt:
4885 		   low = VEC_PERM_EXPR <vect1, vect2,
4886 					{nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
4887 					 ...}>  */
4888 		low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
4889 		perm_stmt = gimple_build_assign (low, VEC_PERM_EXPR, vect1,
4890 						 vect2, perm_mask_low);
4891 		vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4892 		(*result_chain)[2*j+1] = low;
4893 	      }
4894 	    memcpy (dr_chain.address (), result_chain->address (),
4895 		    length * sizeof (tree));
4896 	  }
4897     }
4898 }
4899 
4900 /* Function vect_setup_realignment
4901 
4902    This function is called when vectorizing an unaligned load using
4903    the dr_explicit_realign[_optimized] scheme.
4904    This function generates the following code at the loop prolog:
4905 
4906       p = initial_addr;
4907    x  msq_init = *(floor(p));   # prolog load
4908       realignment_token = call target_builtin;
4909     loop:
4910    x  msq = phi (msq_init, ---)
4911 
4912    The stmts marked with x are generated only for the case of
4913    dr_explicit_realign_optimized.
4914 
4915    The code above sets up a new (vector) pointer, pointing to the first
4916    location accessed by STMT, and a "floor-aligned" load using that pointer.
4917    It also generates code to compute the "realignment-token" (if the relevant
4918    target hook was defined), and creates a phi-node at the loop-header bb
4919    whose arguments are the result of the prolog-load (created by this
4920    function) and the result of a load that takes place in the loop (to be
4921    created by the caller to this function).
4922 
4923    For the case of dr_explicit_realign_optimized:
4924    The caller to this function uses the phi-result (msq) to create the
4925    realignment code inside the loop, and sets up the missing phi argument,
4926    as follows:
4927     loop:
4928       msq = phi (msq_init, lsq)
4929       lsq = *(floor(p'));        # load in loop
4930       result = realign_load (msq, lsq, realignment_token);
4931 
4932    For the case of dr_explicit_realign:
4933     loop:
4934       msq = *(floor(p)); 	# load in loop
4935       p' = p + (VS-1);
4936       lsq = *(floor(p'));	# load in loop
4937       result = realign_load (msq, lsq, realignment_token);
4938 
4939    Input:
4940    STMT - (scalar) load stmt to be vectorized. This load accesses
4941           a memory location that may be unaligned.
4942    BSI - place where new code is to be inserted.
4943    ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
4944 			      is used.
4945 
4946    Output:
4947    REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
4948                        target hook, if defined.
4949    Return value - the result of the loop-header phi node.  */
4950 
4951 tree
4952 vect_setup_realignment (gimple *stmt, gimple_stmt_iterator *gsi,
4953                         tree *realignment_token,
4954 			enum dr_alignment_support alignment_support_scheme,
4955 			tree init_addr,
4956 			struct loop **at_loop)
4957 {
4958   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4959   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4960   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4961   struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4962   struct loop *loop = NULL;
4963   edge pe = NULL;
4964   tree scalar_dest = gimple_assign_lhs (stmt);
4965   tree vec_dest;
4966   gimple *inc;
4967   tree ptr;
4968   tree data_ref;
4969   basic_block new_bb;
4970   tree msq_init = NULL_TREE;
4971   tree new_temp;
4972   gphi *phi_stmt;
4973   tree msq = NULL_TREE;
4974   gimple_seq stmts = NULL;
4975   bool inv_p;
4976   bool compute_in_loop = false;
4977   bool nested_in_vect_loop = false;
4978   struct loop *containing_loop = (gimple_bb (stmt))->loop_father;
4979   struct loop *loop_for_initial_load = NULL;
4980 
4981   if (loop_vinfo)
4982     {
4983       loop = LOOP_VINFO_LOOP (loop_vinfo);
4984       nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
4985     }
4986 
4987   gcc_assert (alignment_support_scheme == dr_explicit_realign
4988 	      || alignment_support_scheme == dr_explicit_realign_optimized);
4989 
4990   /* We need to generate three things:
4991      1. the misalignment computation
4992      2. the extra vector load (for the optimized realignment scheme).
4993      3. the phi node for the two vectors from which the realignment is
4994       done (for the optimized realignment scheme).  */
4995 
4996   /* 1. Determine where to generate the misalignment computation.
4997 
4998      If INIT_ADDR is NULL_TREE, this indicates that the misalignment
4999      calculation will be generated by this function, outside the loop (in the
5000      preheader).  Otherwise, INIT_ADDR has already been computed for us by the
5001      caller, inside the loop.
5002 
5003      Background: If the misalignment remains fixed throughout the iterations of
5004      the loop, then both realignment schemes are applicable, and also the
5005      misalignment computation can be done outside LOOP.  This is because we are
5006      vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
5007      are a multiple of VS (the Vector Size), and therefore the misalignment in
5008      different vectorized LOOP iterations is always the same.
5009      The problem arises only if the memory access is in an inner-loop nested
5010      inside LOOP, which is now being vectorized using outer-loop vectorization.
5011      This is the only case when the misalignment of the memory access may not
5012      remain fixed throughout the iterations of the inner-loop (as explained in
5013      detail in vect_supportable_dr_alignment).  In this case, not only is the
5014      optimized realignment scheme not applicable, but also the misalignment
5015      computation (and generation of the realignment token that is passed to
5016      REALIGN_LOAD) have to be done inside the loop.
5017 
5018      In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
5019      or not, which in turn determines if the misalignment is computed inside
5020      the inner-loop, or outside LOOP.  */
5021 
5022   if (init_addr != NULL_TREE || !loop_vinfo)
5023     {
5024       compute_in_loop = true;
5025       gcc_assert (alignment_support_scheme == dr_explicit_realign);
5026     }
5027 
5028 
5029   /* 2. Determine where to generate the extra vector load.
5030 
5031      For the optimized realignment scheme, instead of generating two vector
5032      loads in each iteration, we generate a single extra vector load in the
5033      preheader of the loop, and in each iteration reuse the result of the
5034      vector load from the previous iteration.  In case the memory access is in
5035      an inner-loop nested inside LOOP, which is now being vectorized using
5036      outer-loop vectorization, we need to determine whether this initial vector
5037      load should be generated at the preheader of the inner-loop, or can be
5038      generated at the preheader of LOOP.  If the memory access has no evolution
5039      in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
5040      to be generated inside LOOP (in the preheader of the inner-loop).  */
5041 
5042   if (nested_in_vect_loop)
5043     {
5044       tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
5045       bool invariant_in_outerloop =
5046             (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
5047       loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
5048     }
5049   else
5050     loop_for_initial_load = loop;
5051   if (at_loop)
5052     *at_loop = loop_for_initial_load;
5053 
5054   if (loop_for_initial_load)
5055     pe = loop_preheader_edge (loop_for_initial_load);
5056 
5057   /* 3. For the case of the optimized realignment, create the first vector
5058       load at the loop preheader.  */
5059 
5060   if (alignment_support_scheme == dr_explicit_realign_optimized)
5061     {
5062       /* Create msq_init = *(floor(p1)) in the loop preheader  */
5063       gassign *new_stmt;
5064 
5065       gcc_assert (!compute_in_loop);
5066       vec_dest = vect_create_destination_var (scalar_dest, vectype);
5067       ptr = vect_create_data_ref_ptr (stmt, vectype, loop_for_initial_load,
5068 				      NULL_TREE, &init_addr, NULL, &inc,
5069 				      true, &inv_p);
5070       if (TREE_CODE (ptr) == SSA_NAME)
5071 	new_temp = copy_ssa_name (ptr);
5072       else
5073 	new_temp = make_ssa_name (TREE_TYPE (ptr));
5074       new_stmt = gimple_build_assign
5075 		   (new_temp, BIT_AND_EXPR, ptr,
5076 		    build_int_cst (TREE_TYPE (ptr),
5077 				   -(HOST_WIDE_INT)TYPE_ALIGN_UNIT (vectype)));
5078       new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5079       gcc_assert (!new_bb);
5080       data_ref
5081 	= build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
5082 		  build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
5083       new_stmt = gimple_build_assign (vec_dest, data_ref);
5084       new_temp = make_ssa_name (vec_dest, new_stmt);
5085       gimple_assign_set_lhs (new_stmt, new_temp);
5086       if (pe)
5087         {
5088           new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5089           gcc_assert (!new_bb);
5090         }
5091       else
5092          gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5093 
5094       msq_init = gimple_assign_lhs (new_stmt);
5095     }
5096 
5097   /* 4. Create realignment token using a target builtin, if available.
5098       It is done either inside the containing loop, or before LOOP (as
5099       determined above).  */
5100 
5101   if (targetm.vectorize.builtin_mask_for_load)
5102     {
5103       gcall *new_stmt;
5104       tree builtin_decl;
5105 
5106       /* Compute INIT_ADDR - the initial address accessed by this memref.  */
5107       if (!init_addr)
5108 	{
5109 	  /* Generate the INIT_ADDR computation outside LOOP.  */
5110 	  init_addr = vect_create_addr_base_for_vector_ref (stmt, &stmts,
5111 							NULL_TREE, loop);
5112           if (loop)
5113             {
5114    	      pe = loop_preheader_edge (loop);
5115 	      new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
5116 	      gcc_assert (!new_bb);
5117             }
5118           else
5119              gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
5120 	}
5121 
5122       builtin_decl = targetm.vectorize.builtin_mask_for_load ();
5123       new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
5124       vec_dest =
5125 	vect_create_destination_var (scalar_dest,
5126 				     gimple_call_return_type (new_stmt));
5127       new_temp = make_ssa_name (vec_dest, new_stmt);
5128       gimple_call_set_lhs (new_stmt, new_temp);
5129 
5130       if (compute_in_loop)
5131 	gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5132       else
5133 	{
5134 	  /* Generate the misalignment computation outside LOOP.  */
5135 	  pe = loop_preheader_edge (loop);
5136 	  new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5137 	  gcc_assert (!new_bb);
5138 	}
5139 
5140       *realignment_token = gimple_call_lhs (new_stmt);
5141 
5142       /* The result of the CALL_EXPR to this builtin is determined from
5143          the value of the parameter and no global variables are touched
5144          which makes the builtin a "const" function.  Requiring the
5145          builtin to have the "const" attribute makes it unnecessary
5146          to call mark_call_clobbered.  */
5147       gcc_assert (TREE_READONLY (builtin_decl));
5148     }
5149 
5150   if (alignment_support_scheme == dr_explicit_realign)
5151     return msq;
5152 
5153   gcc_assert (!compute_in_loop);
5154   gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
5155 
5156 
5157   /* 5. Create msq = phi <msq_init, lsq> in loop  */
5158 
5159   pe = loop_preheader_edge (containing_loop);
5160   vec_dest = vect_create_destination_var (scalar_dest, vectype);
5161   msq = make_ssa_name (vec_dest);
5162   phi_stmt = create_phi_node (msq, containing_loop->header);
5163   add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);
5164 
5165   return msq;
5166 }
5167 
5168 
5169 /* Function vect_grouped_load_supported.
5170 
5171    Returns TRUE if even and odd permutations are supported,
5172    and FALSE otherwise.  */
5173 
5174 bool
5175 vect_grouped_load_supported (tree vectype, unsigned HOST_WIDE_INT count)
5176 {
5177   machine_mode mode = TYPE_MODE (vectype);
5178 
5179   /* vect_permute_load_chain requires the group size to be equal to 3 or
5180      be a power of two.  */
5181   if (count != 3 && exact_log2 (count) == -1)
5182     {
5183       if (dump_enabled_p ())
5184 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5185 			 "the size of the group of accesses"
5186 			 " is not a power of 2 or not equal to 3\n");
5187       return false;
5188     }
5189 
5190   /* Check that the permutation is supported.  */
5191   if (VECTOR_MODE_P (mode))
5192     {
5193       unsigned int i, j, nelt = GET_MODE_NUNITS (mode);
5194       unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
5195 
5196       if (count == 3)
5197 	{
5198 	  unsigned int k;
5199 	  for (k = 0; k < 3; k++)
5200 	    {
5201 	      for (i = 0; i < nelt; i++)
5202 		if (3 * i + k < 2 * nelt)
5203 		  sel[i] = 3 * i + k;
5204 		else
5205 		  sel[i] = 0;
5206 	      if (!can_vec_perm_p (mode, false, sel))
5207 		{
5208 		  if (dump_enabled_p ())
5209 		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5210 				     "shuffle of 3 loads is not supported by"
5211 				     " target\n");
5212 		  return false;
5213 		}
5214 	      for (i = 0, j = 0; i < nelt; i++)
5215 		if (3 * i + k < 2 * nelt)
5216 		  sel[i] = i;
5217 		else
5218 		  sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5219 	      if (!can_vec_perm_p (mode, false, sel))
5220 		{
5221 		  if (dump_enabled_p ())
5222 		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5223 				     "shuffle of 3 loads is not supported by"
5224 				     " target\n");
5225 		  return false;
5226 		}
5227 	    }
5228 	  return true;
5229 	}
5230       else
5231 	{
5232 	  /* If length is not equal to 3 then only power of 2 is supported.  */
5233 	  gcc_assert (exact_log2 (count) != -1);
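	  /* Check the extract-even selector {0, 2, 4, ...} and then the
	     extract-odd selector {1, 3, 5, ...}; e.g. for nelt == 8 these are
	     {0, 2, 4, 6, 8, 10, 12, 14} and {1, 3, 5, 7, 9, 11, 13, 15}.  */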
5234 	  for (i = 0; i < nelt; i++)
5235 	    sel[i] = i * 2;
5236 	  if (can_vec_perm_p (mode, false, sel))
5237 	    {
5238 	      for (i = 0; i < nelt; i++)
5239 		sel[i] = i * 2 + 1;
5240 	      if (can_vec_perm_p (mode, false, sel))
5241 		return true;
5242 	    }
5243         }
5244     }
5245 
5246   if (dump_enabled_p ())
5247     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5248 		     "extract even/odd not supported by target\n");
5249   return false;
5250 }
5251 
5252 /* Return TRUE if vec_load_lanes is available for COUNT vectors of
5253    type VECTYPE.  */
5254 
5255 bool
5256 vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
5257 {
5258   return vect_lanes_optab_supported_p ("vec_load_lanes",
5259 				       vec_load_lanes_optab,
5260 				       vectype, count);
5261 }
5262 
5263 /* Function vect_permute_load_chain.
5264 
5265    Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
5266    a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
5267    the input data correctly.  Return the final references for loads in
5268    RESULT_CHAIN.
5269 
5270    E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5271    The input is 4 vectors each containing 8 elements. We assign a number to each
5272    element, the input sequence is:
5273 
5274    1st vec:   0  1  2  3  4  5  6  7
5275    2nd vec:   8  9 10 11 12 13 14 15
5276    3rd vec:  16 17 18 19 20 21 22 23
5277    4th vec:  24 25 26 27 28 29 30 31
5278 
5279    The output sequence should be:
5280 
5281    1st vec:  0 4  8 12 16 20 24 28
5282    2nd vec:  1 5  9 13 17 21 25 29
5283    3rd vec:  2 6 10 14 18 22 26 30
5284    4th vec:  3 7 11 15 19 23 27 31
5285 
5286    i.e., the first output vector should contain the first elements of each
5287    interleaving group, etc.
5288 
5289    We use extract_even/odd instructions to create such output.  The input of
5290    each extract_even/odd operation is two vectors
5291    1st vec    2nd vec
5292    0 1 2 3    4 5 6 7
5293 
5294    and the output is the vector of extracted even/odd elements.  The output of
5295    extract_even will be:   0 2 4 6
5296    and of extract_odd:     1 3 5 7
5297 
5298 
5299    The permutation is done in log LENGTH stages.  In each stage extract_even
5300    and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
5301    their order.  In our example,
5302 
5303    E1: extract_even (1st vec, 2nd vec)
5304    E2: extract_odd (1st vec, 2nd vec)
5305    E3: extract_even (3rd vec, 4th vec)
5306    E4: extract_odd (3rd vec, 4th vec)
5307 
5308    The output for the first stage will be:
5309 
5310    E1:  0  2  4  6  8 10 12 14
5311    E2:  1  3  5  7  9 11 13 15
5312    E3: 16 18 20 22 24 26 28 30
5313    E4: 17 19 21 23 25 27 29 31
5314 
5315    In order to proceed and create the correct sequence for the next stage (or
5316    for the correct output, if the second stage is the last one, as in our
5317    example), we first put the output of extract_even operation and then the
5318    output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
5319    The input for the second stage is:
5320 
5321    1st vec (E1):  0  2  4  6  8 10 12 14
5322    2nd vec (E3): 16 18 20 22 24 26 28 30
5323    3rd vec (E2):  1  3  5  7  9 11 13 15
5324    4th vec (E4): 17 19 21 23 25 27 29 31
5325 
5326    The output of the second stage:
5327 
5328    E1: 0 4  8 12 16 20 24 28
5329    E2: 2 6 10 14 18 22 26 30
5330    E3: 1 5  9 13 17 21 25 29
5331    E4: 3 7 11 15 19 23 27 31
5332 
5333    And RESULT_CHAIN after reordering:
5334 
5335    1st vec (E1):  0 4  8 12 16 20 24 28
5336    2nd vec (E3):  1 5  9 13 17 21 25 29
5337    3rd vec (E2):  2 6 10 14 18 22 26 30
5338    4th vec (E4):  3 7 11 15 19 23 27 31.  */
5339 
5340 static void
5341 vect_permute_load_chain (vec<tree> dr_chain,
5342 			 unsigned int length,
5343 			 gimple *stmt,
5344 			 gimple_stmt_iterator *gsi,
5345 			 vec<tree> *result_chain)
5346 {
5347   tree data_ref, first_vect, second_vect;
5348   tree perm_mask_even, perm_mask_odd;
5349   tree perm3_mask_low, perm3_mask_high;
5350   gimple *perm_stmt;
5351   tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5352   unsigned int i, j, log_length = exact_log2 (length);
5353   unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype);
5354   unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
5355 
5356   result_chain->quick_grow (length);
5357   memcpy (result_chain->address (), dr_chain.address (),
5358 	  length * sizeof (tree));
5359 
5360   if (length == 3)
5361     {
5362       unsigned int k;
5363 
5364       for (k = 0; k < 3; k++)
5365 	{
5366 	  for (i = 0; i < nelt; i++)
5367 	    if (3 * i + k < 2 * nelt)
5368 	      sel[i] = 3 * i + k;
5369 	    else
5370 	      sel[i] = 0;
5371 	  perm3_mask_low = vect_gen_perm_mask_checked (vectype, sel);
5372 
5373 	  for (i = 0, j = 0; i < nelt; i++)
5374 	    if (3 * i + k < 2 * nelt)
5375 	      sel[i] = i;
5376 	    else
5377 	      sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5378 
5379 	  perm3_mask_high = vect_gen_perm_mask_checked (vectype, sel);
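	  /* E.g. for nelt == 8 and k == 0, the first selector built above is
	     {0, 3, 6, 9, 12, 15, _, _} (the don't-care slots are written as 0)
	     and the second is {0, 1, 2, 3, 4, 5, 10, 13}, which keeps the six
	     elements already gathered and pulls the two remaining ones
	     (indices 2 and 5 of dr_chain[2]) from the third input vector.  */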
5380 
5381 	  first_vect = dr_chain[0];
5382 	  second_vect = dr_chain[1];
5383 
5384 	  /* Create interleaving stmt (low part of):
5385 	     low = VEC_PERM_EXPR <first_vect, second_vect, {k, 3 + k, 6 + k,
5386 							     ...}>  */
5387 	  data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5388 	  perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5389 					   second_vect, perm3_mask_low);
5390 	  vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5391 
5392 	  /* Create interleaving stmt (high part of):
5393 	     high = VEC_PERM_EXPR <first_vect, second_vect,
5394 				   {0, 1, 2, ..., nelt + (nelt + k) % 3, ...}>  */
5395 	  first_vect = data_ref;
5396 	  second_vect = dr_chain[2];
5397 	  data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5398 	  perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5399 					   second_vect, perm3_mask_high);
5400 	  vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5401 	  (*result_chain)[k] = data_ref;
5402 	}
5403     }
5404   else
5405     {
5406       /* If length is not equal to 3 then only power of 2 is supported.  */
5407       gcc_assert (exact_log2 (length) != -1);
5408 
5409       for (i = 0; i < nelt; ++i)
5410 	sel[i] = i * 2;
5411       perm_mask_even = vect_gen_perm_mask_checked (vectype, sel);
5412 
5413       for (i = 0; i < nelt; ++i)
5414 	sel[i] = i * 2 + 1;
5415       perm_mask_odd = vect_gen_perm_mask_checked (vectype, sel);
5416 
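      /* E.g. for the 8-element vectors in the example above, the extract-even
	 selector is {0, 2, 4, 6, 8, 10, 12, 14} and the extract-odd selector
	 is {1, 3, 5, 7, 9, 11, 13, 15}.  */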
5417       for (i = 0; i < log_length; i++)
5418 	{
5419 	  for (j = 0; j < length; j += 2)
5420 	    {
5421 	      first_vect = dr_chain[j];
5422 	      second_vect = dr_chain[j+1];
5423 
5424 	      /* data_ref = permute_even (first_data_ref, second_data_ref);  */
5425 	      data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
5426 	      perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5427 					       first_vect, second_vect,
5428 					       perm_mask_even);
5429 	      vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5430 	      (*result_chain)[j/2] = data_ref;
5431 
5432 	      /* data_ref = permute_odd (first_data_ref, second_data_ref);  */
5433 	      data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
5434 	      perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5435 					       first_vect, second_vect,
5436 					       perm_mask_odd);
5437 	      vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5438 	      (*result_chain)[j/2+length/2] = data_ref;
5439 	    }
5440 	  memcpy (dr_chain.address (), result_chain->address (),
5441 		  length * sizeof (tree));
5442 	}
5443     }
5444 }
5445 
5446 /* Function vect_shift_permute_load_chain.
5447 
5448    Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate
5449    sequence of stmts to reorder the input data accordingly.
5450    Return the final references for loads in RESULT_CHAIN.
5451    Return true if successful, false otherwise.
5452 
5453    E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
5454    The input is 3 vectors each containing 8 elements.  We assign a
5455    number to each element, the input sequence is:
5456 
5457    1st vec:   0  1  2  3  4  5  6  7
5458    2nd vec:   8  9 10 11 12 13 14 15
5459    3rd vec:  16 17 18 19 20 21 22 23
5460 
5461    The output sequence should be:
5462 
5463    1st vec:  0 3 6  9 12 15 18 21
5464    2nd vec:  1 4 7 10 13 16 19 22
5465    3rd vec:  2 5 8 11 14 17 20 23
5466 
5467    We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.
5468 
5469    First we shuffle all 3 vectors to get the correct element order:
5470 
5471    1st vec:  ( 0  3  6) ( 1  4  7) ( 2  5)
5472    2nd vec:  ( 8 11 14) ( 9 12 15) (10 13)
5473    3rd vec:  (16 19 22) (17 20 23) (18 21)
5474 
5475    Next we unite and shift the vectors 3 times:
5476 
5477    1st step:
5478      shift right by 6 the concatenation of:
5479      "1st vec" and  "2nd vec"
5480        ( 0  3  6) ( 1  4  7) |( 2  5) _ ( 8 11 14) ( 9 12 15)| (10 13)
5481      "2nd vec" and  "3rd vec"
5482        ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
5483      "3rd vec" and  "1st vec"
5484        (16 19 22) (17 20 23) |(18 21) _ ( 0  3  6) ( 1  4  7)| ( 2  5)
5485 			     | New vectors                   |
5486 
5487      So that now new vectors are:
5488 
5489      1st vec:  ( 2  5) ( 8 11 14) ( 9 12 15)
5490      2nd vec:  (10 13) (16 19 22) (17 20 23)
5491      3rd vec:  (18 21) ( 0  3  6) ( 1  4  7)
5492 
5493    2nd step:
5494      shift right by 5 the concatenation of:
5495      "1st vec" and  "3rd vec"
5496        ( 2  5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0  3  6)| ( 1  4  7)
5497      "2nd vec" and  "1st vec"
5498        (10 13) (16 19 22) |(17 20 23) _ ( 2  5) ( 8 11 14)| ( 9 12 15)
5499      "3rd vec" and  "2nd vec"
5500        (18 21) ( 0  3  6) |( 1  4  7) _ (10 13) (16 19 22)| (17 20 23)
5501 			  | New vectors                   |
5502 
5503      So that now new vectors are:
5504 
5505      1st vec:  ( 9 12 15) (18 21) ( 0  3  6)
5506      2nd vec:  (17 20 23) ( 2  5) ( 8 11 14)
5507      3rd vec:  ( 1  4  7) (10 13) (16 19 22) READY
5508 
5509    3rd step:
5510      shift right by 5 the concatenation of:
5511      "1st vec" and  "1st vec"
5512        ( 9 12 15) (18 21) |( 0  3  6) _ ( 9 12 15) (18 21)| ( 0  3  6)
5513      shift right by 3 the concatenation of:
5514      "2nd vec" and  "2nd vec"
5515                (17 20 23) |( 2  5) ( 8 11 14) _ (17 20 23)| ( 2  5) ( 8 11 14)
5516 			  | New vectors                   |
5517 
5518      So that now all vectors are READY:
5519      1st vec:  ( 0  3  6) ( 9 12 15) (18 21)
5520      2nd vec:  ( 2  5) ( 8 11 14) (17 20 23)
5521      3rd vec:  ( 1  4  7) (10 13) (16 19 22)
5522 
5523    This algorithm is faster than the one in vect_permute_load_chain if:
5524      1.  "shift of a concatenation" is faster than general permutation.
5525 	 This is usually so.
5526      2.  The TARGET machine can't execute vector instructions in parallel.
5527 	 This is because each step of the algorithm depends on the previous one.
5528 	 The algorithm in vect_permute_load_chain is much more parallel.
5529 
5530    The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.
5531 */
5532 
5533 static bool
5534 vect_shift_permute_load_chain (vec<tree> dr_chain,
5535 			       unsigned int length,
5536 			       gimple *stmt,
5537 			       gimple_stmt_iterator *gsi,
5538 			       vec<tree> *result_chain)
5539 {
5540   tree vect[3], vect_shift[3], data_ref, first_vect, second_vect;
5541   tree perm2_mask1, perm2_mask2, perm3_mask;
5542   tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask;
5543   gimple *perm_stmt;
5544 
5545   tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5546   unsigned int i;
5547   unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype);
5548   unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
5549   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5550   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5551 
5552   result_chain->quick_grow (length);
5553   memcpy (result_chain->address (), dr_chain.address (),
5554 	  length * sizeof (tree));
5555 
5556   if (exact_log2 (length) != -1 && LOOP_VINFO_VECT_FACTOR (loop_vinfo) > 4)
5557     {
5558       unsigned int j, log_length = exact_log2 (length);
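      /* Generating permutation constant to gather the even-indexed elements
	 into the low half and the odd-indexed ones into the high half.
	 For vector length 8 it is {0 2 4 6 1 3 5 7}.  */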
5559       for (i = 0; i < nelt / 2; ++i)
5560 	sel[i] = i * 2;
5561       for (i = 0; i < nelt / 2; ++i)
5562 	sel[nelt / 2 + i] = i * 2 + 1;
5563       if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5564 	{
5565 	  if (dump_enabled_p ())
5566 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5567 			     "shuffle of 2 fields structure is not "
5568 			     "supported by target\n");
5569 	  return false;
5570 	}
5571       perm2_mask1 = vect_gen_perm_mask_checked (vectype, sel);
5572 
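      /* The mirrored shuffle: odd-indexed elements in the low half and the
	 even-indexed ones in the high half.  For vector length 8 it is
	 {1 3 5 7 0 2 4 6}.  */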
5573       for (i = 0; i < nelt / 2; ++i)
5574 	sel[i] = i * 2 + 1;
5575       for (i = 0; i < nelt / 2; ++i)
5576 	sel[nelt / 2 + i] = i * 2;
5577       if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5578 	{
5579 	  if (dump_enabled_p ())
5580 			     "shuffle of 2 fields structure is not "
5581 			     "supported by target\n");
5582 			      supported by target\n");
5583 	  return false;
5584 	}
5585       perm2_mask2 = vect_gen_perm_mask_checked (vectype, sel);
5586 
5587       /* Generating permutation constant to shift all elements.
5588 	 For vector length 8 it is {4 5 6 7 8 9 10 11}.  */
5589       for (i = 0; i < nelt; i++)
5590 	sel[i] = nelt / 2 + i;
5591       if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5592 	{
5593 	  if (dump_enabled_p ())
5594 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5595 			     "shift permutation is not supported by target\n");
5596 	  return false;
5597 	}
5598       shift1_mask = vect_gen_perm_mask_checked (vectype, sel);
5599 
5600       /* Generating permutation constant to select vector from 2.
5601 	 For vector length 8 it is {0 1 2 3 12 13 14 15}.  */
5602       for (i = 0; i < nelt / 2; i++)
5603 	sel[i] = i;
5604       for (i = nelt / 2; i < nelt; i++)
5605 	sel[i] = nelt + i;
5606       if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5607 	{
5608 	  if (dump_enabled_p ())
5609 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5610 			     "select is not supported by target\n");
5611 	  return false;
5612 	}
5613       select_mask = vect_gen_perm_mask_checked (vectype, sel);
5614 
5615       for (i = 0; i < log_length; i++)
5616 	{
5617 	  for (j = 0; j < length; j += 2)
5618 	    {
5619 	      first_vect = dr_chain[j];
5620 	      second_vect = dr_chain[j + 1];
5621 
5622 	      data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
5623 	      perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5624 					       first_vect, first_vect,
5625 					       perm2_mask1);
5626 	      vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5627 	      vect[0] = data_ref;
5628 
5629 	      data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
5630 	      perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5631 					       second_vect, second_vect,
5632 					       perm2_mask2);
5633 	      vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5634 	      vect[1] = data_ref;
5635 
5636 	      data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift");
5637 	      perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5638 					       vect[0], vect[1], shift1_mask);
5639 	      vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5640 	      (*result_chain)[j/2 + length/2] = data_ref;
5641 
5642 	      data_ref = make_temp_ssa_name (vectype, NULL, "vect_select");
5643 	      perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5644 					       vect[0], vect[1], select_mask);
5645 	      vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5646 	      (*result_chain)[j/2] = data_ref;
5647 	    }
5648 	  memcpy (dr_chain.address (), result_chain->address (),
5649 		  length * sizeof (tree));
5650 	}
5651       return true;
5652     }
5653   if (length == 3 && LOOP_VINFO_VECT_FACTOR (loop_vinfo) > 2)
5654     {
5655       unsigned int k = 0, l = 0;
5656 
5657       /* Generating permutation constant to get all elements in the right order.
5658 	 For vector length 8 it is {0 3 6 1 4 7 2 5}.  */
5659       for (i = 0; i < nelt; i++)
5660 	{
5661 	  if (3 * k + (l % 3) >= nelt)
5662 	    {
5663 	      k = 0;
5664 	      l += (3 - (nelt % 3));
5665 	    }
5666 	  sel[i] = 3 * k + (l % 3);
5667 	  k++;
5668 	}
5669       if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5670 	{
5671 	  if (dump_enabled_p ())
5672 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5673 			     "shuffle of 3 fields structure is not "
5674 			     "supported by target\n");
5675 	  return false;
5676 	}
5677       perm3_mask = vect_gen_perm_mask_checked (vectype, sel);
5678 
5679       /* Generating permutation constant to shift all elements.
5680 	 For vector length 8 it is {6 7 8 9 10 11 12 13}.  */
5681       for (i = 0; i < nelt; i++)
5682 	sel[i] = 2 * (nelt / 3) + (nelt % 3) + i;
5683       if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5684 	{
5685 	  if (dump_enabled_p ())
5686 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5687 			     "shift permutation is not supported by target\n");
5688 	  return false;
5689 	}
5690       shift1_mask = vect_gen_perm_mask_checked (vectype, sel);
5691 
5692       /* Generating permutation constant to shift all elements.
5693 	 For vector length 8 it is {5 6 7 8 9 10 11 12}.  */
5694       for (i = 0; i < nelt; i++)
5695 	sel[i] = 2 * (nelt / 3) + 1 + i;
5696       if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5697 	{
5698 	  if (dump_enabled_p ())
5699 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5700 			     "shift permutation is not supported by target\n");
5701 	  return false;
5702 	}
5703       shift2_mask = vect_gen_perm_mask_checked (vectype, sel);
5704 
5705       /* Generating permutation constant to shift all elements.
5706 	 For vector length 8 it is {3 4 5 6 7 8 9 10}.  */
5707       for (i = 0; i < nelt; i++)
5708 	sel[i] = (nelt / 3) + (nelt % 3) / 2 + i;
5709       if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5710 	{
5711 	  if (dump_enabled_p ())
5712 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5713 			     "shift permutation is not supported by target\n");
5714 	  return false;
5715 	}
5716       shift3_mask = vect_gen_perm_mask_checked (vectype, sel);
5717 
5718       /* Generating permutation constant to shift all elements.
5719 	 For vector length 8 it is {5 6 7 8 9 10 11 12}.  */
5720       for (i = 0; i < nelt; i++)
5721 	sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i;
5722       if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5723 	{
5724 	  if (dump_enabled_p ())
5725 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5726 			     "shift permutation is not supported by target\n");
5727 	  return false;
5728 	}
5729       shift4_mask = vect_gen_perm_mask_checked (vectype, sel);
5730 
5731       for (k = 0; k < 3; k++)
5732 	{
5733 	  data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3");
5734 	  perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5735 					   dr_chain[k], dr_chain[k],
5736 					   perm3_mask);
5737 	  vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5738 	  vect[k] = data_ref;
5739 	}
5740 
5741       for (k = 0; k < 3; k++)
5742 	{
5743 	  data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1");
5744 	  perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5745 					   vect[k % 3], vect[(k + 1) % 3],
5746 					   shift1_mask);
5747 	  vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5748 	  vect_shift[k] = data_ref;
5749 	}
5750 
5751       for (k = 0; k < 3; k++)
5752 	{
5753 	  data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2");
5754 	  perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5755 					   vect_shift[(4 - k) % 3],
5756 					   vect_shift[(3 - k) % 3],
5757 					   shift2_mask);
5758 	  vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5759 	  vect[k] = data_ref;
5760 	}
5761 
5762       (*result_chain)[3 - (nelt % 3)] = vect[2];
5763 
5764       data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3");
5765       perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[0],
5766 				       vect[0], shift3_mask);
5767       vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5768       (*result_chain)[nelt % 3] = data_ref;
5769 
5770       data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4");
5771       perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[1],
5772 				       vect[1], shift4_mask);
5773       vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5774       (*result_chain)[0] = data_ref;
5775       return true;
5776     }
5777   return false;
5778 }
5779 
5780 /* Function vect_transform_grouped_load.
5781 
5782    Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
5783    to perform their permutation and ascribe the result vectorized statements to
5784    to perform their permutation and ascribe the resulting vectorized statements to
5785 */
5786 
5787 void
5788 vect_transform_grouped_load (gimple *stmt, vec<tree> dr_chain, int size,
5789 			     gimple_stmt_iterator *gsi)
5790 {
5791   machine_mode mode;
5792   vec<tree> result_chain = vNULL;
5793 
5794   /* DR_CHAIN contains input data-refs that are a part of the interleaving.
5795      RESULT_CHAIN is the output of vect_permute_load_chain; it contains permuted
5796      vectors that are ready for vector computation.  */
5797   result_chain.create (size);
5798 
5799   /* If the reassociation width for the vector type is 2 or greater, the target
5800      machine can execute 2 or more vector instructions in parallel.  Otherwise
5801      try to get the chain for the load group using vect_shift_permute_load_chain.  */
5802   mode = TYPE_MODE (STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt)));
5803   if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1
5804       || exact_log2 (size) != -1
5805       || !vect_shift_permute_load_chain (dr_chain, size, stmt,
5806 					 gsi, &result_chain))
5807     vect_permute_load_chain (dr_chain, size, stmt, gsi, &result_chain);
5808   vect_record_grouped_load_vectors (stmt, result_chain);
5809   result_chain.release ();
5810 }
5811 
5812 /* RESULT_CHAIN contains the output of a group of grouped loads that were
5813    generated as part of the vectorization of STMT.  Assign the statement
5814    for each vector to the associated scalar statement.  */
5815 
5816 void
5817 vect_record_grouped_load_vectors (gimple *stmt, vec<tree> result_chain)
5818 {
5819   gimple *first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt));
5820   gimple *next_stmt, *new_stmt;
5821   unsigned int i, gap_count;
5822   tree tmp_data_ref;
5823 
5824   /* Put a permuted data-ref in the VECTORIZED_STMT field.
5825      Since we scan the chain starting from its first node, their order
5826      corresponds to the order of data-refs in RESULT_CHAIN.  */
5827   next_stmt = first_stmt;
5828   gap_count = 1;
5829   FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
5830     {
5831       if (!next_stmt)
5832 	break;
5833 
5834       /* Skip the gaps.  Loads created for the gaps will be removed by the dead
5835        code elimination pass later.  No need to check for the first stmt in
5836        the group, since it always exists.
5837        GROUP_GAP is the number of steps in elements from the previous
5838        access (if there is no gap GROUP_GAP is 1).  We skip loads that
5839        correspond to the gaps.  */
5840       if (next_stmt != first_stmt
5841           && gap_count < GROUP_GAP (vinfo_for_stmt (next_stmt)))
5842       {
5843         gap_count++;
5844         continue;
5845       }
5846 
5847       while (next_stmt)
5848         {
5849 	  new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
5850 	  /* We assume that if VEC_STMT is not NULL, this is a case of multiple
5851 	     copies, and we put the new vector statement in the first available
5852 	     RELATED_STMT.  */
5853 	  if (!STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)))
5854 	    STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)) = new_stmt;
5855 	  else
5856             {
5857               if (!GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
5858                 {
5859 		  gimple *prev_stmt =
5860 		    STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt));
5861 		  gimple *rel_stmt =
5862 		    STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt));
5863 	          while (rel_stmt)
5864 		    {
5865 		      prev_stmt = rel_stmt;
5866 		      rel_stmt =
5867                         STMT_VINFO_RELATED_STMT (vinfo_for_stmt (rel_stmt));
5868 		    }
5869 
5870   	          STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt)) =
5871                     new_stmt;
5872                 }
5873             }
5874 
5875 	  next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
5876 	  gap_count = 1;
5877 	  /* If NEXT_STMT accesses the same DR as the previous statement,
5878 	     put the same TMP_DATA_REF as its vectorized statement; otherwise
5879 	     get the next data-ref from RESULT_CHAIN.  */
5880 	  if (!next_stmt || !GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
5881 	    break;
5882         }
5883     }
5884 }
5885 
5886 /* Function vect_can_force_dr_alignment_p.
5887 
5888    Returns whether the alignment of a DECL can be forced to be aligned
5889    on an ALIGNMENT-bit boundary.  */
5890 
5891 bool
5892 vect_can_force_dr_alignment_p (const_tree decl, unsigned int alignment)
5893 {
5894   if (TREE_CODE (decl) != VAR_DECL)
5895     return false;
5896 
5897   if (decl_in_symtab_p (decl)
5898       && !symtab_node::get (decl)->can_increase_alignment_p ())
5899     return false;
5900 
5901   if (TREE_STATIC (decl))
5902     return (alignment <= MAX_OFILE_ALIGNMENT);
5903   else
5904     return (alignment <= MAX_STACK_ALIGNMENT);
5905 }
5906 
5907 
5908 /* Return whether the data reference DR is supported with respect to its
5909    alignment.
5910    If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even
5911    if it is aligned, i.e., check if it is possible to vectorize it with different
5912    alignment.  */
5913 
5914 enum dr_alignment_support
5915 vect_supportable_dr_alignment (struct data_reference *dr,
5916                                bool check_aligned_accesses)
5917 {
5918   gimple *stmt = DR_STMT (dr);
5919   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5920   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5921   machine_mode mode = TYPE_MODE (vectype);
5922   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5923   struct loop *vect_loop = NULL;
5924   bool nested_in_vect_loop = false;
5925 
5926   if (aligned_access_p (dr) && !check_aligned_accesses)
5927     return dr_aligned;
5928 
5929   /* For now assume all conditional loads/stores support unaligned
5930      access without any special code.  */
5931   if (is_gimple_call (stmt)
5932       && gimple_call_internal_p (stmt)
5933       && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
5934 	  || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
5935     return dr_unaligned_supported;
5936 
5937   if (loop_vinfo)
5938     {
5939       vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
5940       nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt);
5941     }
5942 
5943   /* Possibly unaligned access.  */
5944 
5945   /* We can choose between using the implicit realignment scheme (generating
5946      a misaligned_move stmt) and the explicit realignment scheme (generating
5947      aligned loads with a REALIGN_LOAD).  There are two variants to the
5948      explicit realignment scheme: optimized, and unoptimized.
5949      We can optimize the realignment only if the step between consecutive
5950      vector loads is equal to the vector size.  Since the vector memory
5951      accesses advance in steps of VS (Vector Size) in the vectorized loop, it
5952      is guaranteed that the misalignment amount remains the same throughout the
5953      execution of the vectorized loop.  Therefore, we can create the
5954      "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
5955      at the loop preheader.
5956 
5957      However, in the case of outer-loop vectorization, when vectorizing a
5958      memory access in the inner-loop nested within the LOOP that is now being
5959      vectorized, while it is guaranteed that the misalignment of the
5960      vectorized memory access will remain the same in different outer-loop
5961      iterations, it is *not* guaranteed that it will remain the same throughout
5962      the execution of the inner-loop.  This is because the inner-loop advances
5963      with the original scalar step (and not in steps of VS).  If the inner-loop
5964      step happens to be a multiple of VS, then the misalignment remains fixed
5965      and we can use the optimized realignment scheme.  For example:
5966 
5967       for (i=0; i<N; i++)
5968         for (j=0; j<M; j++)
5969           s += a[i+j];
5970 
5971      When vectorizing the i-loop in the above example, the step between
5972      consecutive vector loads is 1, and so the misalignment does not remain
5973      fixed across the execution of the inner-loop, and the realignment cannot
5974      be optimized (as illustrated in the following pseudo vectorized loop):
5975 
5976       for (i=0; i<N; i+=4)
5977         for (j=0; j<M; j++){
5978           vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
5979                          // when j is {0,1,2,3,4,5,6,7,...} respectively.
5980                          // (assuming that we start from an aligned address).
5981           }
5982 
5983      We therefore have to use the unoptimized realignment scheme:
5984 
5985       for (i=0; i<N; i+=4)
5986           for (j=k; j<M; j+=4)
5987           vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
5988                            // that the misalignment of the initial address is
5989                            // 0).
5990 
5991      The loop can then be vectorized as follows:
5992 
5993       for (k=0; k<4; k++){
5994         rt = get_realignment_token (&vp[k]);
5995         for (i=0; i<N; i+=4){
5996           v1 = vp[i+k];
5997           for (j=k; j<M; j+=4){
5998             v2 = vp[i+j+VS-1];
5999             va = REALIGN_LOAD <v1,v2,rt>;
6000             vs += va;
6001             v1 = v2;
6002           }
6003         }
6004     } */
6005 
6006   if (DR_IS_READ (dr))
6007     {
6008       bool is_packed = false;
6009       tree type = (TREE_TYPE (DR_REF (dr)));
6010 
6011       if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
6012 	  && (!targetm.vectorize.builtin_mask_for_load
6013 	      || targetm.vectorize.builtin_mask_for_load ()))
6014 	{
6015 	  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6016 
6017 	  /* If we are doing SLP then the accesses need not have the
6018 	     same alignment; instead it depends on the SLP group size.  */
6019 	  if (loop_vinfo
6020 	      && STMT_SLP_TYPE (stmt_info)
6021 	      && (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
6022 		  * GROUP_SIZE (vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info)))
6023 		  % TYPE_VECTOR_SUBPARTS (vectype) != 0))
6024 	    ;
6025 	  else if (!loop_vinfo
6026 		   || (nested_in_vect_loop
6027 		       && (TREE_INT_CST_LOW (DR_STEP (dr))
6028 			   != GET_MODE_SIZE (TYPE_MODE (vectype)))))
6029 	    return dr_explicit_realign;
6030 	  else
6031 	    return dr_explicit_realign_optimized;
6032 	}
6033       if (!known_alignment_for_access_p (dr))
6034 	is_packed = not_size_aligned (DR_REF (dr));
6035 
6036       if (targetm.vectorize.support_vector_misalignment
6037 	    (mode, type, DR_MISALIGNMENT (dr), is_packed))
6038 	/* Can't software pipeline the loads, but can at least do them.  */
6039 	return dr_unaligned_supported;
6040     }
6041   else
6042     {
6043       bool is_packed = false;
6044       tree type = (TREE_TYPE (DR_REF (dr)));
6045 
6046       if (!known_alignment_for_access_p (dr))
6047 	is_packed = not_size_aligned (DR_REF (dr));
6048 
6049      if (targetm.vectorize.support_vector_misalignment
6050 	   (mode, type, DR_MISALIGNMENT (dr), is_packed))
6051        return dr_unaligned_supported;
6052     }
6053 
6054   /* Unsupported.  */
6055   return dr_unaligned_unsupported;
6056 }
6057