xref: /netbsd-src/external/gpl3/gcc.old/dist/gcc/tree-vect-data-refs.c (revision bdc22b2e01993381dcefeff2bc9b56ca75a4235c)
1 /* Data References Analysis and Manipulation Utilities for Vectorization.
2    Copyright (C) 2003-2015 Free Software Foundation, Inc.
3    Contributed by Dorit Naishlos <dorit@il.ibm.com>
4    and Ira Rosen <irar@il.ibm.com>
5 
6 This file is part of GCC.
7 
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
12 
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
16 for more details.
17 
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3.  If not see
20 <http://www.gnu.org/licenses/>.  */
21 
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "dumpfile.h"
26 #include "tm.h"
27 #include "hash-set.h"
28 #include "machmode.h"
29 #include "vec.h"
30 #include "double-int.h"
31 #include "input.h"
32 #include "alias.h"
33 #include "symtab.h"
34 #include "wide-int.h"
35 #include "inchash.h"
36 #include "tree.h"
37 #include "fold-const.h"
38 #include "stor-layout.h"
39 #include "tm_p.h"
40 #include "target.h"
41 #include "predict.h"
42 #include "hard-reg-set.h"
43 #include "function.h"
44 #include "dominance.h"
45 #include "cfg.h"
46 #include "basic-block.h"
47 #include "gimple-pretty-print.h"
48 #include "tree-ssa-alias.h"
49 #include "internal-fn.h"
50 #include "tree-eh.h"
51 #include "gimple-expr.h"
52 #include "is-a.h"
53 #include "gimple.h"
54 #include "gimplify.h"
55 #include "gimple-iterator.h"
56 #include "gimplify-me.h"
57 #include "gimple-ssa.h"
58 #include "tree-phinodes.h"
59 #include "ssa-iterators.h"
60 #include "stringpool.h"
61 #include "tree-ssanames.h"
62 #include "tree-ssa-loop-ivopts.h"
63 #include "tree-ssa-loop-manip.h"
64 #include "tree-ssa-loop.h"
65 #include "cfgloop.h"
66 #include "tree-chrec.h"
67 #include "tree-scalar-evolution.h"
68 #include "tree-vectorizer.h"
69 #include "diagnostic-core.h"
70 #include "hash-map.h"
71 #include "plugin-api.h"
72 #include "ipa-ref.h"
73 #include "cgraph.h"
74 /* Need to include rtl.h, expr.h, etc. for optabs.  */
75 #include "hashtab.h"
76 #include "rtl.h"
77 #include "flags.h"
78 #include "statistics.h"
79 #include "real.h"
80 #include "fixed-value.h"
81 #include "insn-config.h"
82 #include "expmed.h"
83 #include "dojump.h"
84 #include "explow.h"
85 #include "calls.h"
86 #include "emit-rtl.h"
87 #include "varasm.h"
88 #include "stmt.h"
89 #include "expr.h"
90 #include "insn-codes.h"
91 #include "optabs.h"
92 #include "builtins.h"
93 
94 /* Return true if load- or store-lanes optab OPTAB is implemented for
95    COUNT vectors of type VECTYPE.  NAME is the name of OPTAB.  */
96 
97 static bool
98 vect_lanes_optab_supported_p (const char *name, convert_optab optab,
99 			      tree vectype, unsigned HOST_WIDE_INT count)
100 {
101   machine_mode mode, array_mode;
102   bool limit_p;
103 
104   mode = TYPE_MODE (vectype);
105   limit_p = !targetm.array_mode_supported_p (mode, count);
106   array_mode = mode_for_size (count * GET_MODE_BITSIZE (mode),
107 			      MODE_INT, limit_p);
108 
109   if (array_mode == BLKmode)
110     {
111       if (dump_enabled_p ())
112 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
113                          "no array mode for %s[" HOST_WIDE_INT_PRINT_DEC "]\n",
114                          GET_MODE_NAME (mode), count);
115       return false;
116     }
117 
118   if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
119     {
120       if (dump_enabled_p ())
121 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
122                          "cannot use %s<%s><%s>\n", name,
123                          GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
124       return false;
125     }
126 
127   if (dump_enabled_p ())
128     dump_printf_loc (MSG_NOTE, vect_location,
129                      "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode),
130                      GET_MODE_NAME (mode));
131 
132   return true;
133 }
134 
135 
136 /* Return the smallest scalar part of STMT.
137    This is used to determine the vectype of the stmt.  We generally set the
138    vectype according to the type of the result (lhs).  For stmts whose
139    result-type is different than the type of the arguments (e.g., demotion,
140    promotion), vectype will be reset appropriately (later).  Note that we have
141    to visit the smallest datatype in this function, because that determines the
142    VF.  If the smallest datatype in the loop is present only as the rhs of a
143    promotion operation - we'd miss it.
144    Such a case, where a variable of this datatype does not appear in the lhs
145    anywhere in the loop, can only occur if it's an invariant: e.g.:
146    'int_x = (int) short_inv', which we'd expect to have been optimized away by
147    invariant motion.  However, we cannot rely on invariant motion to always
148    take invariants out of the loop, and so in the case of promotion we also
149    have to check the rhs.
150    LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
151    types.  */
152 
153 tree
154 vect_get_smallest_scalar_type (gimple stmt, HOST_WIDE_INT *lhs_size_unit,
155                                HOST_WIDE_INT *rhs_size_unit)
156 {
157   tree scalar_type = gimple_expr_type (stmt);
158   HOST_WIDE_INT lhs, rhs;
159 
160   lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
161 
162   if (is_gimple_assign (stmt)
163       && (gimple_assign_cast_p (stmt)
164           || gimple_assign_rhs_code (stmt) == WIDEN_MULT_EXPR
165           || gimple_assign_rhs_code (stmt) == WIDEN_LSHIFT_EXPR
166           || gimple_assign_rhs_code (stmt) == FLOAT_EXPR))
167     {
168       tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
169 
170       rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
171       if (rhs < lhs)
172         scalar_type = rhs_type;
173     }
174 
175   *lhs_size_unit = lhs;
176   *rhs_size_unit = rhs;
177   return scalar_type;
178 }
179 
180 
181 /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
182    tested at run-time.  Return TRUE if DDR was successfully inserted.
183    Return false if versioning is not supported.  */
184 
185 static bool
186 vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
187 {
188   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
189 
190   if ((unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS) == 0)
191     return false;
192 
193   if (dump_enabled_p ())
194     {
195       dump_printf_loc (MSG_NOTE, vect_location,
196                        "mark for run-time aliasing test between ");
197       dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_A (ddr)));
198       dump_printf (MSG_NOTE,  " and ");
199       dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_B (ddr)));
200       dump_printf (MSG_NOTE, "\n");
201     }
202 
203   if (optimize_loop_nest_for_size_p (loop))
204     {
205       if (dump_enabled_p ())
206 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
207                          "versioning not supported when optimizing"
208                          " for size.\n");
209       return false;
210     }
211 
212   /* FORNOW: We don't support versioning with outer-loop vectorization.  */
213   if (loop->inner)
214     {
215       if (dump_enabled_p ())
216 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
217                          "versioning not yet supported for outer-loops.\n");
218       return false;
219     }
220 
221   /* FORNOW: We don't support creating runtime alias tests for non-constant
222      step.  */
223   if (TREE_CODE (DR_STEP (DDR_A (ddr))) != INTEGER_CST
224       || TREE_CODE (DR_STEP (DDR_B (ddr))) != INTEGER_CST)
225     {
226       if (dump_enabled_p ())
227 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
228                          "versioning not yet supported for non-constant "
229                          "step\n");
230       return false;
231     }
232 
233   LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
234   return true;
235 }
236 
237 
238 /* Function vect_analyze_data_ref_dependence.
239 
240    Return TRUE if there (might) exist a dependence between a memory-reference
241    DRA and a memory-reference DRB.  When versioning for alias may check a
242    dependence at run-time, return FALSE.  Adjust *MAX_VF according to
243    the data dependence.  */
244 
245 static bool
246 vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
247                                   loop_vec_info loop_vinfo, int *max_vf)
248 {
249   unsigned int i;
250   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
251   struct data_reference *dra = DDR_A (ddr);
252   struct data_reference *drb = DDR_B (ddr);
253   stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
254   stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
255   lambda_vector dist_v;
256   unsigned int loop_depth;
257 
258   /* In loop analysis all data references should be vectorizable.  */
259   if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
260       || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
261     gcc_unreachable ();
262 
263   /* Independent data accesses.  */
264   if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
265     return false;
266 
267   if (dra == drb
268       || (DR_IS_READ (dra) && DR_IS_READ (drb)))
269     return false;
270 
271   /* Even if we have an anti-dependence then, as the vectorized loop covers at
272      least two scalar iterations, there is always also a true dependence.
273      As the vectorizer does not re-order loads and stores we can ignore
274      the anti-dependence if TBAA can disambiguate both DRs similar to the
275      case with known negative distance anti-dependences (positive
276      distance anti-dependences would violate TBAA constraints).  */
277   if (((DR_IS_READ (dra) && DR_IS_WRITE (drb))
278        || (DR_IS_WRITE (dra) && DR_IS_READ (drb)))
279       && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)),
280 				 get_alias_set (DR_REF (drb))))
281     return false;
282 
283   /* Unknown data dependence.  */
284   if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
285     {
286       /* If user asserted safelen consecutive iterations can be
287 	 executed concurrently, assume independence.  */
288       if (loop->safelen >= 2)
289 	{
290 	  if (loop->safelen < *max_vf)
291 	    *max_vf = loop->safelen;
292 	  LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
293 	  return false;
294 	}
295 
296       if (STMT_VINFO_GATHER_P (stmtinfo_a)
297 	  || STMT_VINFO_GATHER_P (stmtinfo_b))
298 	{
299 	  if (dump_enabled_p ())
300 	    {
301 	      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
302 			       "versioning for alias not supported for: "
303 			       "can't determine dependence between ");
304 	      dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
305 				 DR_REF (dra));
306 	      dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
307 	      dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
308 				 DR_REF (drb));
309 	      dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
310 	    }
311 	  return true;
312 	}
313 
314       if (dump_enabled_p ())
315 	{
316 	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
317 			   "versioning for alias required: "
318 			   "can't determine dependence between ");
319 	  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
320 			     DR_REF (dra));
321 	  dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
322 	  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
323 			     DR_REF (drb));
324 	  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
325 	}
326 
327       /* Add to list of ddrs that need to be tested at run-time.  */
328       return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
329     }
330 
331   /* Known data dependence.  */
332   if (DDR_NUM_DIST_VECTS (ddr) == 0)
333     {
334       /* If user asserted safelen consecutive iterations can be
335 	 executed concurrently, assume independence.  */
336       if (loop->safelen >= 2)
337 	{
338 	  if (loop->safelen < *max_vf)
339 	    *max_vf = loop->safelen;
340 	  LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
341 	  return false;
342 	}
343 
344       if (STMT_VINFO_GATHER_P (stmtinfo_a)
345 	  || STMT_VINFO_GATHER_P (stmtinfo_b))
346 	{
347 	  if (dump_enabled_p ())
348 	    {
349 	      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
350 			       "versioning for alias not supported for: "
351 			       "bad dist vector for ");
352 	      dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
353 				 DR_REF (dra));
354 	      dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
355 	      dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
356 				 DR_REF (drb));
357 	      dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
358 	    }
359 	  return true;
360 	}
361 
362       if (dump_enabled_p ())
363         {
364           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
365                            "versioning for alias required: "
366                            "bad dist vector for ");
367           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
368           dump_printf (MSG_MISSED_OPTIMIZATION,  " and ");
369           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
370           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
371         }
372       /* Add to list of ddrs that need to be tested at run-time.  */
373       return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
374     }
375 
376   loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
377   FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
378     {
379       int dist = dist_v[loop_depth];
380 
381       if (dump_enabled_p ())
382 	dump_printf_loc (MSG_NOTE, vect_location,
383                          "dependence distance  = %d.\n", dist);
384 
385       if (dist == 0)
386 	{
387 	  if (dump_enabled_p ())
388 	    {
389 	      dump_printf_loc (MSG_NOTE, vect_location,
390 	                       "dependence distance == 0 between ");
391 	      dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
392 	      dump_printf (MSG_NOTE, " and ");
393 	      dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
394 	      dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
395 	    }
396 
397 	  /* When we perform grouped accesses and perform implicit CSE
398 	     by detecting equal accesses and doing disambiguation with
399 	     runtime alias tests like for
400 	        .. = a[i];
401 		.. = a[i+1];
402 		a[i] = ..;
403 		a[i+1] = ..;
404 		*p = ..;
405 		.. = a[i];
406 		.. = a[i+1];
407 	     where we will end up loading { a[i], a[i+1] } once, make
408 	     sure that inserting group loads before the first load and
409 	     stores after the last store will do the right thing.
410 	     Similar for groups like
411 	        a[i] = ...;
412 		... = a[i];
413 		a[i+1] = ...;
414 	     where loads from the group interleave with the store.  */
415 	  if (STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
416 	      || STMT_VINFO_GROUPED_ACCESS (stmtinfo_b))
417 	    {
418 	      gimple earlier_stmt;
419 	      earlier_stmt = get_earlier_stmt (DR_STMT (dra), DR_STMT (drb));
420 	      if (DR_IS_WRITE
421 		    (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt))))
422 		{
423 		  if (dump_enabled_p ())
424 		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
425 				     "READ_WRITE dependence in interleaving."
426 				     "\n");
427 		  return true;
428 		}
429 	    }
430 
431 	  continue;
432 	}
433 
434       if (dist > 0 && DDR_REVERSED_P (ddr))
435 	{
436 	  /* If DDR_REVERSED_P the order of the data-refs in DDR was
437 	     reversed (to make distance vector positive), and the actual
438 	     distance is negative.  */
439 	  if (dump_enabled_p ())
440 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
441 	                     "dependence distance negative.\n");
442 	  /* Record a negative dependence distance to later limit the
443 	     amount of stmt copying / unrolling we can perform.
444 	     Only need to handle read-after-write dependence.  */
445 	  if (DR_IS_READ (drb)
446 	      && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0
447 		  || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist))
448 	    STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist;
449 	  continue;
450 	}
451 
452       if (abs (dist) >= 2
453 	  && abs (dist) < *max_vf)
454 	{
455 	  /* The dependence distance requires reduction of the maximal
456 	     vectorization factor.  */
457 	  *max_vf = abs (dist);
458 	  if (dump_enabled_p ())
459 	    dump_printf_loc (MSG_NOTE, vect_location,
460 	                     "adjusting maximal vectorization factor to %i\n",
461 	                     *max_vf);
462 	}
463 
464       if (abs (dist) >= *max_vf)
465 	{
466 	  /* Dependence distance does not create dependence, as far as
467 	     vectorization is concerned, in this case.  */
468 	  if (dump_enabled_p ())
469 	    dump_printf_loc (MSG_NOTE, vect_location,
470 	                     "dependence distance >= VF.\n");
471 	  continue;
472 	}
473 
474       if (dump_enabled_p ())
475 	{
476 	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
477 	               "not vectorized, possible dependence "
478 	               "between data-refs ");
479 	  dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
480 	  dump_printf (MSG_NOTE,  " and ");
481 	  dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
482 	  dump_printf (MSG_NOTE,  "\n");
483 	}
484 
485       return true;
486     }
487 
488   return false;
489 }
490 
491 /* Function vect_analyze_data_ref_dependences.
492 
493    Examine all the data references in the loop, and make sure there do not
494    exist any data dependences between them.  Set *MAX_VF according to
495    the maximum vectorization factor the data dependences allow.  */
496 
497 bool
498 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo, int *max_vf)
499 {
500   unsigned int i;
501   struct data_dependence_relation *ddr;
502 
503   if (dump_enabled_p ())
504     dump_printf_loc (MSG_NOTE, vect_location,
505                      "=== vect_analyze_data_ref_dependences ===\n");
506 
507   LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true;
508   if (!compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
509 				&LOOP_VINFO_DDRS (loop_vinfo),
510 				LOOP_VINFO_LOOP_NEST (loop_vinfo), true))
511     return false;
512 
513   FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
514     if (vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf))
515       return false;
516 
517   return true;
518 }
519 
520 
521 /* Function vect_slp_analyze_data_ref_dependence.
522 
523    Return TRUE if there (might) exist a dependence between a memory-reference
524    DRA and a memory-reference DRB.  When versioning for alias may check a
525    dependence at run-time, return FALSE.  Adjust *MAX_VF according to
526    the data dependence.  */
527 
528 static bool
529 vect_slp_analyze_data_ref_dependence (struct data_dependence_relation *ddr)
530 {
531   struct data_reference *dra = DDR_A (ddr);
532   struct data_reference *drb = DDR_B (ddr);
533 
534   /* We need to check dependences of statements marked as unvectorizable
535      as well, they still can prohibit vectorization.  */
536 
537   /* Independent data accesses.  */
538   if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
539     return false;
540 
541   if (dra == drb)
542     return false;
543 
544   /* Read-read is OK.  */
545   if (DR_IS_READ (dra) && DR_IS_READ (drb))
546     return false;
547 
548   /* If dra and drb are part of the same interleaving chain consider
549      them independent.  */
550   if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (DR_STMT (dra)))
551       && (GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dra)))
552 	  == GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (drb)))))
553     return false;
554 
555   /* Unknown data dependence.  */
556   if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
557     {
558       if  (dump_enabled_p ())
559 	{
560 	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
561 			   "can't determine dependence between ");
562 	  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
563 	  dump_printf (MSG_MISSED_OPTIMIZATION,  " and ");
564 	  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
565 	  dump_printf (MSG_MISSED_OPTIMIZATION,  "\n");
566 	}
567     }
568   else if (dump_enabled_p ())
569     {
570       dump_printf_loc (MSG_NOTE, vect_location,
571 		       "determined dependence between ");
572       dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
573       dump_printf (MSG_NOTE, " and ");
574       dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
575       dump_printf (MSG_NOTE,  "\n");
576     }
577 
578   /* We do not vectorize basic blocks with write-write dependencies.  */
579   if (DR_IS_WRITE (dra) && DR_IS_WRITE (drb))
580     return true;
581 
582   /* If we have a read-write dependence check that the load is before the store.
583      When we vectorize basic blocks, vector load can be only before
584      corresponding scalar load, and vector store can be only after its
585      corresponding scalar store.  So the order of the acceses is preserved in
586      case the load is before the store.  */
587   gimple earlier_stmt = get_earlier_stmt (DR_STMT (dra), DR_STMT (drb));
588   if (DR_IS_READ (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt))))
589     {
590       /* That only holds for load-store pairs taking part in vectorization.  */
591       if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dra)))
592 	  && STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (drb))))
593 	return false;
594     }
595 
596   return true;
597 }
598 
599 
600 /* Function vect_analyze_data_ref_dependences.
601 
602    Examine all the data references in the basic-block, and make sure there
603    do not exist any data dependences between them.  Set *MAX_VF according to
604    the maximum vectorization factor the data dependences allow.  */
605 
606 bool
607 vect_slp_analyze_data_ref_dependences (bb_vec_info bb_vinfo)
608 {
609   struct data_dependence_relation *ddr;
610   unsigned int i;
611 
612   if (dump_enabled_p ())
613     dump_printf_loc (MSG_NOTE, vect_location,
614                      "=== vect_slp_analyze_data_ref_dependences ===\n");
615 
616   if (!compute_all_dependences (BB_VINFO_DATAREFS (bb_vinfo),
617 				&BB_VINFO_DDRS (bb_vinfo),
618 				vNULL, true))
619     return false;
620 
621   FOR_EACH_VEC_ELT (BB_VINFO_DDRS (bb_vinfo), i, ddr)
622     if (vect_slp_analyze_data_ref_dependence (ddr))
623       return false;
624 
625   return true;
626 }
627 
628 
629 /* Function vect_compute_data_ref_alignment
630 
631    Compute the misalignment of the data reference DR.
632 
633    Output:
634    1. If during the misalignment computation it is found that the data reference
635       cannot be vectorized then false is returned.
636    2. DR_MISALIGNMENT (DR) is defined.
637 
638    FOR NOW: No analysis is actually performed. Misalignment is calculated
639    only for trivial cases. TODO.  */
640 
641 static bool
642 vect_compute_data_ref_alignment (struct data_reference *dr)
643 {
644   gimple stmt = DR_STMT (dr);
645   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
646   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
647   struct loop *loop = NULL;
648   tree ref = DR_REF (dr);
649   tree vectype;
650   tree base, base_addr;
651   tree misalign;
652   tree aligned_to;
653   unsigned HOST_WIDE_INT alignment;
654 
655   if (dump_enabled_p ())
656     dump_printf_loc (MSG_NOTE, vect_location,
657                      "vect_compute_data_ref_alignment:\n");
658 
659   if (loop_vinfo)
660     loop = LOOP_VINFO_LOOP (loop_vinfo);
661 
662   /* Initialize misalignment to unknown.  */
663   SET_DR_MISALIGNMENT (dr, -1);
664 
665   /* Strided loads perform only component accesses, misalignment information
666      is irrelevant for them.  */
667   if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
668     return true;
669 
670   misalign = DR_INIT (dr);
671   aligned_to = DR_ALIGNED_TO (dr);
672   base_addr = DR_BASE_ADDRESS (dr);
673   vectype = STMT_VINFO_VECTYPE (stmt_info);
674 
675   /* In case the dataref is in an inner-loop of the loop that is being
676      vectorized (LOOP), we use the base and misalignment information
677      relative to the outer-loop (LOOP).  This is ok only if the misalignment
678      stays the same throughout the execution of the inner-loop, which is why
679      we have to check that the stride of the dataref in the inner-loop evenly
680      divides by the vector size.  */
681   if (loop && nested_in_vect_loop_p (loop, stmt))
682     {
683       tree step = DR_STEP (dr);
684       HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
685 
686       if (dr_step % GET_MODE_SIZE (TYPE_MODE (vectype)) == 0)
687         {
688           if (dump_enabled_p ())
689             dump_printf_loc (MSG_NOTE, vect_location,
690                              "inner step divides the vector-size.\n");
691 	  misalign = STMT_VINFO_DR_INIT (stmt_info);
692 	  aligned_to = STMT_VINFO_DR_ALIGNED_TO (stmt_info);
693 	  base_addr = STMT_VINFO_DR_BASE_ADDRESS (stmt_info);
694         }
695       else
696 	{
697 	  if (dump_enabled_p ())
698 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
699 	                     "inner step doesn't divide the vector-size.\n");
700 	  misalign = NULL_TREE;
701 	}
702     }
703 
704   /* Similarly, if we're doing basic-block vectorization, we can only use
705      base and misalignment information relative to an innermost loop if the
706      misalignment stays the same throughout the execution of the loop.
707      As above, this is the case if the stride of the dataref evenly divides
708      by the vector size.  */
709   if (!loop)
710     {
711       tree step = DR_STEP (dr);
712       HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
713 
714       if (dr_step % GET_MODE_SIZE (TYPE_MODE (vectype)) != 0)
715 	{
716 	  if (dump_enabled_p ())
717 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
718 	                     "SLP: step doesn't divide the vector-size.\n");
719 	  misalign = NULL_TREE;
720 	}
721     }
722 
723   /* To look at alignment of the base we have to preserve an inner MEM_REF
724      as that carries alignment information of the actual access.  */
725   base = ref;
726   while (handled_component_p (base))
727     base = TREE_OPERAND (base, 0);
728   if (TREE_CODE (base) == MEM_REF)
729     base = build2 (MEM_REF, TREE_TYPE (base), base_addr,
730 		   build_int_cst (TREE_TYPE (TREE_OPERAND (base, 1)), 0));
731   unsigned int base_alignment = get_object_alignment (base);
732 
733   if (base_alignment >= TYPE_ALIGN (TREE_TYPE (vectype)))
734     DR_VECT_AUX (dr)->base_element_aligned = true;
735 
736   alignment = TYPE_ALIGN_UNIT (vectype);
737 
738   if ((compare_tree_int (aligned_to, alignment) < 0)
739       || !misalign)
740     {
741       if (dump_enabled_p ())
742 	{
743 	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
744 	                   "Unknown alignment for access: ");
745 	  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
746 	  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
747 	}
748       return true;
749     }
750 
751   if (base_alignment < TYPE_ALIGN (vectype))
752     {
753       /* Strip an inner MEM_REF to a bare decl if possible.  */
754       if (TREE_CODE (base) == MEM_REF
755 	  && integer_zerop (TREE_OPERAND (base, 1))
756 	  && TREE_CODE (TREE_OPERAND (base, 0)) == ADDR_EXPR)
757 	base = TREE_OPERAND (TREE_OPERAND (base, 0), 0);
758 
759       if (!vect_can_force_dr_alignment_p (base, TYPE_ALIGN (vectype)))
760 	{
761 	  if (dump_enabled_p ())
762 	    {
763 	      dump_printf_loc (MSG_NOTE, vect_location,
764 	                       "can't force alignment of ref: ");
765 	      dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
766 	      dump_printf (MSG_NOTE, "\n");
767 	    }
768 	  return true;
769 	}
770 
771       /* Force the alignment of the decl.
772 	 NOTE: This is the only change to the code we make during
773 	 the analysis phase, before deciding to vectorize the loop.  */
774       if (dump_enabled_p ())
775         {
776           dump_printf_loc (MSG_NOTE, vect_location, "force alignment of ");
777           dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
778           dump_printf (MSG_NOTE, "\n");
779         }
780 
781       DR_VECT_AUX (dr)->base_decl = base;
782       DR_VECT_AUX (dr)->base_misaligned = true;
783       DR_VECT_AUX (dr)->base_element_aligned = true;
784     }
785 
786   /* If this is a backward running DR then first access in the larger
787      vectype actually is N-1 elements before the address in the DR.
788      Adjust misalign accordingly.  */
789   if (tree_int_cst_sgn (DR_STEP (dr)) < 0)
790     {
791       tree offset = ssize_int (TYPE_VECTOR_SUBPARTS (vectype) - 1);
792       /* DR_STEP(dr) is the same as -TYPE_SIZE of the scalar type,
793 	 otherwise we wouldn't be here.  */
794       offset = fold_build2 (MULT_EXPR, ssizetype, offset, DR_STEP (dr));
795       /* PLUS because DR_STEP was negative.  */
796       misalign = size_binop (PLUS_EXPR, misalign, offset);
797     }
798 
799   SET_DR_MISALIGNMENT (dr,
800 		       wi::mod_floor (misalign, alignment, SIGNED).to_uhwi ());
801 
802   if (dump_enabled_p ())
803     {
804       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
805                        "misalign = %d bytes of ref ", DR_MISALIGNMENT (dr));
806       dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
807       dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
808     }
809 
810   return true;
811 }
812 
813 
814 /* Function vect_compute_data_refs_alignment
815 
816    Compute the misalignment of data references in the loop.
817    Return FALSE if a data reference is found that cannot be vectorized.  */
818 
819 static bool
820 vect_compute_data_refs_alignment (loop_vec_info loop_vinfo,
821                                   bb_vec_info bb_vinfo)
822 {
823   vec<data_reference_p> datarefs;
824   struct data_reference *dr;
825   unsigned int i;
826 
827   if (loop_vinfo)
828     datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
829   else
830     datarefs = BB_VINFO_DATAREFS (bb_vinfo);
831 
832   FOR_EACH_VEC_ELT (datarefs, i, dr)
833     if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr)))
834         && !vect_compute_data_ref_alignment (dr))
835       {
836         if (bb_vinfo)
837           {
838             /* Mark unsupported statement as unvectorizable.  */
839             STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
840             continue;
841           }
842         else
843           return false;
844       }
845 
846   return true;
847 }
848 
849 
850 /* Function vect_update_misalignment_for_peel
851 
852    DR - the data reference whose misalignment is to be adjusted.
853    DR_PEEL - the data reference whose misalignment is being made
854              zero in the vector loop by the peel.
855    NPEEL - the number of iterations in the peel loop if the misalignment
856            of DR_PEEL is known at compile time.  */
857 
858 static void
859 vect_update_misalignment_for_peel (struct data_reference *dr,
860                                    struct data_reference *dr_peel, int npeel)
861 {
862   unsigned int i;
863   vec<dr_p> same_align_drs;
864   struct data_reference *current_dr;
865   int dr_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr))));
866   int dr_peel_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr_peel))));
867   stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
868   stmt_vec_info peel_stmt_info = vinfo_for_stmt (DR_STMT (dr_peel));
869 
870  /* For interleaved data accesses the step in the loop must be multiplied by
871      the size of the interleaving group.  */
872   if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
873     dr_size *= GROUP_SIZE (vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info)));
874   if (STMT_VINFO_GROUPED_ACCESS (peel_stmt_info))
875     dr_peel_size *= GROUP_SIZE (peel_stmt_info);
876 
877   /* It can be assumed that the data refs with the same alignment as dr_peel
878      are aligned in the vector loop.  */
879   same_align_drs
880     = STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (DR_STMT (dr_peel)));
881   FOR_EACH_VEC_ELT (same_align_drs, i, current_dr)
882     {
883       if (current_dr != dr)
884         continue;
885       gcc_assert (DR_MISALIGNMENT (dr) / dr_size ==
886                   DR_MISALIGNMENT (dr_peel) / dr_peel_size);
887       SET_DR_MISALIGNMENT (dr, 0);
888       return;
889     }
890 
891   if (known_alignment_for_access_p (dr)
892       && known_alignment_for_access_p (dr_peel))
893     {
894       bool negative = tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0;
895       int misal = DR_MISALIGNMENT (dr);
896       tree vectype = STMT_VINFO_VECTYPE (stmt_info);
897       misal += negative ? -npeel * dr_size : npeel * dr_size;
898       misal &= (TYPE_ALIGN (vectype) / BITS_PER_UNIT) - 1;
899       SET_DR_MISALIGNMENT (dr, misal);
900       return;
901     }
902 
903   if (dump_enabled_p ())
904     dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment to -1.\n");
905   SET_DR_MISALIGNMENT (dr, -1);
906 }
907 
908 
909 /* Function vect_verify_datarefs_alignment
910 
911    Return TRUE if all data references in the loop can be
912    handled with respect to alignment.  */
913 
914 bool
915 vect_verify_datarefs_alignment (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo)
916 {
917   vec<data_reference_p> datarefs;
918   struct data_reference *dr;
919   enum dr_alignment_support supportable_dr_alignment;
920   unsigned int i;
921 
922   if (loop_vinfo)
923     datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
924   else
925     datarefs = BB_VINFO_DATAREFS (bb_vinfo);
926 
927   FOR_EACH_VEC_ELT (datarefs, i, dr)
928     {
929       gimple stmt = DR_STMT (dr);
930       stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
931 
932       if (!STMT_VINFO_RELEVANT_P (stmt_info))
933 	continue;
934 
935       /* For interleaving, only the alignment of the first access matters.
936          Skip statements marked as not vectorizable.  */
937       if ((STMT_VINFO_GROUPED_ACCESS (stmt_info)
938            && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
939           || !STMT_VINFO_VECTORIZABLE (stmt_info))
940         continue;
941 
942       /* Strided loads perform only component accesses, alignment is
943 	 irrelevant for them.  */
944       if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
945 	continue;
946 
947       supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
948       if (!supportable_dr_alignment)
949         {
950           if (dump_enabled_p ())
951             {
952               if (DR_IS_READ (dr))
953                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
954                                  "not vectorized: unsupported unaligned load.");
955               else
956                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
957                                  "not vectorized: unsupported unaligned "
958                                  "store.");
959 
960               dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
961                                  DR_REF (dr));
962               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
963             }
964           return false;
965         }
966       if (supportable_dr_alignment != dr_aligned && dump_enabled_p ())
967         dump_printf_loc (MSG_NOTE, vect_location,
968                          "Vectorizing an unaligned access.\n");
969     }
970   return true;
971 }
972 
973 /* Given an memory reference EXP return whether its alignment is less
974    than its size.  */
975 
976 static bool
977 not_size_aligned (tree exp)
978 {
979   if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp))))
980     return true;
981 
982   return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp)))
983 	  > get_object_alignment (exp));
984 }
985 
986 /* Function vector_alignment_reachable_p
987 
988    Return true if vector alignment for DR is reachable by peeling
989    a few loop iterations.  Return false otherwise.  */
990 
991 static bool
992 vector_alignment_reachable_p (struct data_reference *dr)
993 {
994   gimple stmt = DR_STMT (dr);
995   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
996   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
997 
998   if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
999     {
1000       /* For interleaved access we peel only if number of iterations in
1001 	 the prolog loop ({VF - misalignment}), is a multiple of the
1002 	 number of the interleaved accesses.  */
1003       int elem_size, mis_in_elements;
1004       int nelements = TYPE_VECTOR_SUBPARTS (vectype);
1005 
1006       /* FORNOW: handle only known alignment.  */
1007       if (!known_alignment_for_access_p (dr))
1008 	return false;
1009 
1010       elem_size = GET_MODE_SIZE (TYPE_MODE (vectype)) / nelements;
1011       mis_in_elements = DR_MISALIGNMENT (dr) / elem_size;
1012 
1013       if ((nelements - mis_in_elements) % GROUP_SIZE (stmt_info))
1014 	return false;
1015     }
1016 
1017   /* If misalignment is known at the compile time then allow peeling
1018      only if natural alignment is reachable through peeling.  */
1019   if (known_alignment_for_access_p (dr) && !aligned_access_p (dr))
1020     {
1021       HOST_WIDE_INT elmsize =
1022 		int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1023       if (dump_enabled_p ())
1024 	{
1025 	  dump_printf_loc (MSG_NOTE, vect_location,
1026 	                   "data size =" HOST_WIDE_INT_PRINT_DEC, elmsize);
1027 	  dump_printf (MSG_NOTE,
1028 	               ". misalignment = %d.\n", DR_MISALIGNMENT (dr));
1029 	}
1030       if (DR_MISALIGNMENT (dr) % elmsize)
1031 	{
1032 	  if (dump_enabled_p ())
1033 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1034 	                     "data size does not divide the misalignment.\n");
1035 	  return false;
1036 	}
1037     }
1038 
1039   if (!known_alignment_for_access_p (dr))
1040     {
1041       tree type = TREE_TYPE (DR_REF (dr));
1042       bool is_packed = not_size_aligned (DR_REF (dr));
1043       if (dump_enabled_p ())
1044 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1045 	                 "Unknown misalignment, is_packed = %d\n",is_packed);
1046       if ((TYPE_USER_ALIGN (type) && !is_packed)
1047 	  || targetm.vectorize.vector_alignment_reachable (type, is_packed))
1048 	return true;
1049       else
1050 	return false;
1051     }
1052 
1053   return true;
1054 }
1055 
1056 
1057 /* Calculate the cost of the memory access represented by DR.  */
1058 
1059 static void
1060 vect_get_data_access_cost (struct data_reference *dr,
1061                            unsigned int *inside_cost,
1062                            unsigned int *outside_cost,
1063 			   stmt_vector_for_cost *body_cost_vec)
1064 {
1065   gimple stmt = DR_STMT (dr);
1066   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1067   int nunits = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
1068   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1069   int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1070   int ncopies = vf / nunits;
1071 
1072   if (DR_IS_READ (dr))
1073     vect_get_load_cost (dr, ncopies, true, inside_cost, outside_cost,
1074 			NULL, body_cost_vec, false);
1075   else
1076     vect_get_store_cost (dr, ncopies, inside_cost, body_cost_vec);
1077 
1078   if (dump_enabled_p ())
1079     dump_printf_loc (MSG_NOTE, vect_location,
1080                      "vect_get_data_access_cost: inside_cost = %d, "
1081                      "outside_cost = %d.\n", *inside_cost, *outside_cost);
1082 }
1083 
1084 
1085 /* Insert DR into peeling hash table with NPEEL as key.  */
1086 
1087 static void
1088 vect_peeling_hash_insert (loop_vec_info loop_vinfo, struct data_reference *dr,
1089                           int npeel)
1090 {
1091   struct _vect_peel_info elem, *slot;
1092   _vect_peel_info **new_slot;
1093   bool supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1094 
1095   elem.npeel = npeel;
1096   slot = LOOP_VINFO_PEELING_HTAB (loop_vinfo)->find (&elem);
1097   if (slot)
1098     slot->count++;
1099   else
1100     {
1101       slot = XNEW (struct _vect_peel_info);
1102       slot->npeel = npeel;
1103       slot->dr = dr;
1104       slot->count = 1;
1105       new_slot
1106        	= LOOP_VINFO_PEELING_HTAB (loop_vinfo)->find_slot (slot, INSERT);
1107       *new_slot = slot;
1108     }
1109 
1110   if (!supportable_dr_alignment
1111       && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1112     slot->count += VECT_MAX_COST;
1113 }
1114 
1115 
1116 /* Traverse peeling hash table to find peeling option that aligns maximum
1117    number of data accesses.  */
1118 
1119 int
1120 vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
1121 				     _vect_peel_extended_info *max)
1122 {
1123   vect_peel_info elem = *slot;
1124 
1125   if (elem->count > max->peel_info.count
1126       || (elem->count == max->peel_info.count
1127           && max->peel_info.npeel > elem->npeel))
1128     {
1129       max->peel_info.npeel = elem->npeel;
1130       max->peel_info.count = elem->count;
1131       max->peel_info.dr = elem->dr;
1132     }
1133 
1134   return 1;
1135 }
1136 
1137 
1138 /* Traverse peeling hash table and calculate cost for each peeling option.
1139    Find the one with the lowest cost.  */
1140 
1141 int
1142 vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
1143 				   _vect_peel_extended_info *min)
1144 {
1145   vect_peel_info elem = *slot;
1146   int save_misalignment, dummy;
1147   unsigned int inside_cost = 0, outside_cost = 0, i;
1148   gimple stmt = DR_STMT (elem->dr);
1149   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1150   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1151   vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1152   struct data_reference *dr;
1153   stmt_vector_for_cost prologue_cost_vec, body_cost_vec, epilogue_cost_vec;
1154 
1155   prologue_cost_vec.create (2);
1156   body_cost_vec.create (2);
1157   epilogue_cost_vec.create (2);
1158 
1159   FOR_EACH_VEC_ELT (datarefs, i, dr)
1160     {
1161       stmt = DR_STMT (dr);
1162       stmt_info = vinfo_for_stmt (stmt);
1163       /* For interleaving, only the alignment of the first access
1164          matters.  */
1165       if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1166           && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1167         continue;
1168 
1169       save_misalignment = DR_MISALIGNMENT (dr);
1170       vect_update_misalignment_for_peel (dr, elem->dr, elem->npeel);
1171       vect_get_data_access_cost (dr, &inside_cost, &outside_cost,
1172 				 &body_cost_vec);
1173       SET_DR_MISALIGNMENT (dr, save_misalignment);
1174     }
1175 
1176   auto_vec<stmt_info_for_cost> scalar_cost_vec;
1177   vect_get_single_scalar_iteration_cost (loop_vinfo, &scalar_cost_vec);
1178   outside_cost += vect_get_known_peeling_cost
1179     (loop_vinfo, elem->npeel, &dummy,
1180      &scalar_cost_vec, &prologue_cost_vec, &epilogue_cost_vec);
1181 
1182   /* Prologue and epilogue costs are added to the target model later.
1183      These costs depend only on the scalar iteration cost, the
1184      number of peeling iterations finally chosen, and the number of
1185      misaligned statements.  So discard the information found here.  */
1186   prologue_cost_vec.release ();
1187   epilogue_cost_vec.release ();
1188 
1189   if (inside_cost < min->inside_cost
1190       || (inside_cost == min->inside_cost && outside_cost < min->outside_cost))
1191     {
1192       min->inside_cost = inside_cost;
1193       min->outside_cost = outside_cost;
1194       min->body_cost_vec.release ();
1195       min->body_cost_vec = body_cost_vec;
1196       min->peel_info.dr = elem->dr;
1197       min->peel_info.npeel = elem->npeel;
1198     }
1199   else
1200     body_cost_vec.release ();
1201 
1202   return 1;
1203 }
1204 
1205 
1206 /* Choose best peeling option by traversing peeling hash table and either
1207    choosing an option with the lowest cost (if cost model is enabled) or the
1208    option that aligns as many accesses as possible.  */
1209 
1210 static struct data_reference *
1211 vect_peeling_hash_choose_best_peeling (loop_vec_info loop_vinfo,
1212                                        unsigned int *npeel,
1213 				       stmt_vector_for_cost *body_cost_vec)
1214 {
1215    struct _vect_peel_extended_info res;
1216 
1217    res.peel_info.dr = NULL;
1218    res.body_cost_vec = stmt_vector_for_cost ();
1219 
1220    if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1221      {
1222        res.inside_cost = INT_MAX;
1223        res.outside_cost = INT_MAX;
1224        LOOP_VINFO_PEELING_HTAB (loop_vinfo)
1225            ->traverse <_vect_peel_extended_info *,
1226                        vect_peeling_hash_get_lowest_cost> (&res);
1227      }
1228    else
1229      {
1230        res.peel_info.count = 0;
1231        LOOP_VINFO_PEELING_HTAB (loop_vinfo)
1232            ->traverse <_vect_peel_extended_info *,
1233                        vect_peeling_hash_get_most_frequent> (&res);
1234      }
1235 
1236    *npeel = res.peel_info.npeel;
1237    *body_cost_vec = res.body_cost_vec;
1238    return res.peel_info.dr;
1239 }
1240 
1241 
1242 /* Function vect_enhance_data_refs_alignment
1243 
1244    This pass will use loop versioning and loop peeling in order to enhance
1245    the alignment of data references in the loop.
1246 
1247    FOR NOW: we assume that whatever versioning/peeling takes place, only the
1248    original loop is to be vectorized.  Any other loops that are created by
1249    the transformations performed in this pass - are not supposed to be
1250    vectorized.  This restriction will be relaxed.
1251 
1252    This pass will require a cost model to guide it whether to apply peeling
1253    or versioning or a combination of the two.  For example, the scheme that
1254    intel uses when given a loop with several memory accesses, is as follows:
1255    choose one memory access ('p') which alignment you want to force by doing
1256    peeling.  Then, either (1) generate a loop in which 'p' is aligned and all
1257    other accesses are not necessarily aligned, or (2) use loop versioning to
1258    generate one loop in which all accesses are aligned, and another loop in
1259    which only 'p' is necessarily aligned.
1260 
1261    ("Automatic Intra-Register Vectorization for the Intel Architecture",
1262    Aart J.C. Bik, Milind Girkar, Paul M. Grey and Ximmin Tian, International
1263    Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
1264 
1265    Devising a cost model is the most critical aspect of this work.  It will
1266    guide us on which access to peel for, whether to use loop versioning, how
1267    many versions to create, etc.  The cost model will probably consist of
1268    generic considerations as well as target specific considerations (on
1269    powerpc for example, misaligned stores are more painful than misaligned
1270    loads).
1271 
1272    Here are the general steps involved in alignment enhancements:
1273 
1274      -- original loop, before alignment analysis:
1275 	for (i=0; i<N; i++){
1276 	  x = q[i];			# DR_MISALIGNMENT(q) = unknown
1277 	  p[i] = y;			# DR_MISALIGNMENT(p) = unknown
1278 	}
1279 
1280      -- After vect_compute_data_refs_alignment:
1281 	for (i=0; i<N; i++){
1282 	  x = q[i];			# DR_MISALIGNMENT(q) = 3
1283 	  p[i] = y;			# DR_MISALIGNMENT(p) = unknown
1284 	}
1285 
1286      -- Possibility 1: we do loop versioning:
1287      if (p is aligned) {
1288 	for (i=0; i<N; i++){	# loop 1A
1289 	  x = q[i];			# DR_MISALIGNMENT(q) = 3
1290 	  p[i] = y;			# DR_MISALIGNMENT(p) = 0
1291 	}
1292      }
1293      else {
1294 	for (i=0; i<N; i++){	# loop 1B
1295 	  x = q[i];			# DR_MISALIGNMENT(q) = 3
1296 	  p[i] = y;			# DR_MISALIGNMENT(p) = unaligned
1297 	}
1298      }
1299 
1300      -- Possibility 2: we do loop peeling:
1301      for (i = 0; i < 3; i++){	# (scalar loop, not to be vectorized).
1302 	x = q[i];
1303 	p[i] = y;
1304      }
1305      for (i = 3; i < N; i++){	# loop 2A
1306 	x = q[i];			# DR_MISALIGNMENT(q) = 0
1307 	p[i] = y;			# DR_MISALIGNMENT(p) = unknown
1308      }
1309 
1310      -- Possibility 3: combination of loop peeling and versioning:
1311      for (i = 0; i < 3; i++){	# (scalar loop, not to be vectorized).
1312 	x = q[i];
1313 	p[i] = y;
1314      }
1315      if (p is aligned) {
1316 	for (i = 3; i<N; i++){	# loop 3A
1317 	  x = q[i];			# DR_MISALIGNMENT(q) = 0
1318 	  p[i] = y;			# DR_MISALIGNMENT(p) = 0
1319 	}
1320      }
1321      else {
1322 	for (i = 3; i<N; i++){	# loop 3B
1323 	  x = q[i];			# DR_MISALIGNMENT(q) = 0
1324 	  p[i] = y;			# DR_MISALIGNMENT(p) = unaligned
1325 	}
1326      }
1327 
1328      These loops are later passed to loop_transform to be vectorized.  The
1329      vectorizer will use the alignment information to guide the transformation
1330      (whether to generate regular loads/stores, or with special handling for
1331      misalignment).  */
1332 
1333 bool
1334 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
1335 {
1336   vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1337   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1338   enum dr_alignment_support supportable_dr_alignment;
1339   struct data_reference *dr0 = NULL, *first_store = NULL;
1340   struct data_reference *dr;
1341   unsigned int i, j;
1342   bool do_peeling = false;
1343   bool do_versioning = false;
1344   bool stat;
1345   gimple stmt;
1346   stmt_vec_info stmt_info;
1347   unsigned int npeel = 0;
1348   bool all_misalignments_unknown = true;
1349   unsigned int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1350   unsigned possible_npeel_number = 1;
1351   tree vectype;
1352   unsigned int nelements, mis, same_align_drs_max = 0;
1353   stmt_vector_for_cost body_cost_vec = stmt_vector_for_cost ();
1354 
1355   if (dump_enabled_p ())
1356     dump_printf_loc (MSG_NOTE, vect_location,
1357                      "=== vect_enhance_data_refs_alignment ===\n");
1358 
1359   /* While cost model enhancements are expected in the future, the high level
1360      view of the code at this time is as follows:
1361 
1362      A) If there is a misaligned access then see if peeling to align
1363         this access can make all data references satisfy
1364         vect_supportable_dr_alignment.  If so, update data structures
1365         as needed and return true.
1366 
1367      B) If peeling wasn't possible and there is a data reference with an
1368         unknown misalignment that does not satisfy vect_supportable_dr_alignment
1369         then see if loop versioning checks can be used to make all data
1370         references satisfy vect_supportable_dr_alignment.  If so, update
1371         data structures as needed and return true.
1372 
1373      C) If neither peeling nor versioning were successful then return false if
1374         any data reference does not satisfy vect_supportable_dr_alignment.
1375 
1376      D) Return true (all data references satisfy vect_supportable_dr_alignment).
1377 
1378      Note, Possibility 3 above (which is peeling and versioning together) is not
1379      being done at this time.  */
1380 
1381   /* (1) Peeling to force alignment.  */
1382 
1383   /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
1384      Considerations:
1385      + How many accesses will become aligned due to the peeling
1386      - How many accesses will become unaligned due to the peeling,
1387        and the cost of misaligned accesses.
1388      - The cost of peeling (the extra runtime checks, the increase
1389        in code size).  */
1390 
1391   FOR_EACH_VEC_ELT (datarefs, i, dr)
1392     {
1393       stmt = DR_STMT (dr);
1394       stmt_info = vinfo_for_stmt (stmt);
1395 
1396       if (!STMT_VINFO_RELEVANT_P (stmt_info))
1397 	continue;
1398 
1399       /* For interleaving, only the alignment of the first access
1400          matters.  */
1401       if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1402           && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1403         continue;
1404 
1405       /* For invariant accesses there is nothing to enhance.  */
1406       if (integer_zerop (DR_STEP (dr)))
1407 	continue;
1408 
1409       /* Strided loads perform only component accesses, alignment is
1410 	 irrelevant for them.  */
1411       if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
1412 	continue;
1413 
1414       supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1415       do_peeling = vector_alignment_reachable_p (dr);
1416       if (do_peeling)
1417         {
1418           if (known_alignment_for_access_p (dr))
1419             {
1420               unsigned int npeel_tmp;
1421 	      bool negative = tree_int_cst_compare (DR_STEP (dr),
1422 						    size_zero_node) < 0;
1423 
1424               /* Save info about DR in the hash table.  */
1425               if (!LOOP_VINFO_PEELING_HTAB (loop_vinfo))
1426                 LOOP_VINFO_PEELING_HTAB (loop_vinfo)
1427 		  = new hash_table<peel_info_hasher> (1);
1428 
1429               vectype = STMT_VINFO_VECTYPE (stmt_info);
1430               nelements = TYPE_VECTOR_SUBPARTS (vectype);
1431               mis = DR_MISALIGNMENT (dr) / GET_MODE_SIZE (TYPE_MODE (
1432                                                 TREE_TYPE (DR_REF (dr))));
1433               npeel_tmp = (negative
1434 			   ? (mis - nelements) : (nelements - mis))
1435 		  & (nelements - 1);
1436 
1437               /* For multiple types, it is possible that the bigger type access
1438                  will have more than one peeling option.  E.g., a loop with two
1439                  types: one of size (vector size / 4), and the other one of
1440                  size (vector size / 8).  Vectorization factor will 8.  If both
1441                  access are misaligned by 3, the first one needs one scalar
1442                  iteration to be aligned, and the second one needs 5.  But the
1443                  the first one will be aligned also by peeling 5 scalar
1444                  iterations, and in that case both accesses will be aligned.
1445                  Hence, except for the immediate peeling amount, we also want
1446                  to try to add full vector size, while we don't exceed
1447                  vectorization factor.
1448                  We do this automtically for cost model, since we calculate cost
1449                  for every peeling option.  */
1450               if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1451                 possible_npeel_number = vf /nelements;
1452 
1453               /* Handle the aligned case. We may decide to align some other
1454                  access, making DR unaligned.  */
1455               if (DR_MISALIGNMENT (dr) == 0)
1456                 {
1457                   npeel_tmp = 0;
1458                   if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1459                     possible_npeel_number++;
1460                 }
1461 
1462               for (j = 0; j < possible_npeel_number; j++)
1463                 {
1464                   gcc_assert (npeel_tmp <= vf);
1465                   vect_peeling_hash_insert (loop_vinfo, dr, npeel_tmp);
1466                   npeel_tmp += nelements;
1467                 }
1468 
1469               all_misalignments_unknown = false;
1470               /* Data-ref that was chosen for the case that all the
1471                  misalignments are unknown is not relevant anymore, since we
1472                  have a data-ref with known alignment.  */
1473               dr0 = NULL;
1474             }
1475           else
1476             {
1477               /* If we don't know any misalignment values, we prefer
1478                  peeling for data-ref that has the maximum number of data-refs
1479                  with the same alignment, unless the target prefers to align
1480                  stores over load.  */
1481               if (all_misalignments_unknown)
1482                 {
1483 		  unsigned same_align_drs
1484 		    = STMT_VINFO_SAME_ALIGN_REFS (stmt_info).length ();
1485                   if (!dr0
1486 		      || same_align_drs_max < same_align_drs)
1487                     {
1488                       same_align_drs_max = same_align_drs;
1489                       dr0 = dr;
1490                     }
1491 		  /* For data-refs with the same number of related
1492 		     accesses prefer the one where the misalign
1493 		     computation will be invariant in the outermost loop.  */
1494 		  else if (same_align_drs_max == same_align_drs)
1495 		    {
1496 		      struct loop *ivloop0, *ivloop;
1497 		      ivloop0 = outermost_invariant_loop_for_expr
1498 			  (loop, DR_BASE_ADDRESS (dr0));
1499 		      ivloop = outermost_invariant_loop_for_expr
1500 			  (loop, DR_BASE_ADDRESS (dr));
1501 		      if ((ivloop && !ivloop0)
1502 			  || (ivloop && ivloop0
1503 			      && flow_loop_nested_p (ivloop, ivloop0)))
1504 			dr0 = dr;
1505 		    }
1506 
1507                   if (!first_store && DR_IS_WRITE (dr))
1508                     first_store = dr;
1509                 }
1510 
1511               /* If there are both known and unknown misaligned accesses in the
1512                  loop, we choose peeling amount according to the known
1513                  accesses.  */
1514               if (!supportable_dr_alignment)
1515                 {
1516                   dr0 = dr;
1517                   if (!first_store && DR_IS_WRITE (dr))
1518                     first_store = dr;
1519                 }
1520             }
1521         }
1522       else
1523         {
1524           if (!aligned_access_p (dr))
1525             {
1526               if (dump_enabled_p ())
1527                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1528                                  "vector alignment may not be reachable\n");
1529               break;
1530             }
1531         }
1532     }
1533 
1534   /* Check if we can possibly peel the loop.  */
1535   if (!vect_can_advance_ivs_p (loop_vinfo)
1536       || !slpeel_can_duplicate_loop_p (loop, single_exit (loop)))
1537     do_peeling = false;
1538 
1539   /* If we don't know how many times the peeling loop will run
1540      assume it will run VF-1 times and disable peeling if the remaining
1541      iters are less than the vectorization factor.  */
1542   if (do_peeling
1543       && all_misalignments_unknown
1544       && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1545       && (LOOP_VINFO_INT_NITERS (loop_vinfo)
1546 	  < 2 * (unsigned) LOOP_VINFO_VECT_FACTOR (loop_vinfo) - 1))
1547     do_peeling = false;
1548 
1549   if (do_peeling
1550       && all_misalignments_unknown
1551       && vect_supportable_dr_alignment (dr0, false))
1552     {
1553       /* Check if the target requires to prefer stores over loads, i.e., if
1554          misaligned stores are more expensive than misaligned loads (taking
1555          drs with same alignment into account).  */
1556       if (first_store && DR_IS_READ (dr0))
1557         {
1558           unsigned int load_inside_cost = 0, load_outside_cost = 0;
1559           unsigned int store_inside_cost = 0, store_outside_cost = 0;
1560           unsigned int load_inside_penalty = 0, load_outside_penalty = 0;
1561           unsigned int store_inside_penalty = 0, store_outside_penalty = 0;
1562 	  stmt_vector_for_cost dummy;
1563 	  dummy.create (2);
1564 
1565           vect_get_data_access_cost (dr0, &load_inside_cost, &load_outside_cost,
1566 				     &dummy);
1567           vect_get_data_access_cost (first_store, &store_inside_cost,
1568 				     &store_outside_cost, &dummy);
1569 
1570 	  dummy.release ();
1571 
1572           /* Calculate the penalty for leaving FIRST_STORE unaligned (by
1573              aligning the load DR0).  */
1574           load_inside_penalty = store_inside_cost;
1575           load_outside_penalty = store_outside_cost;
1576           for (i = 0;
1577 	       STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (
1578 			  DR_STMT (first_store))).iterate (i, &dr);
1579                i++)
1580             if (DR_IS_READ (dr))
1581               {
1582                 load_inside_penalty += load_inside_cost;
1583                 load_outside_penalty += load_outside_cost;
1584               }
1585             else
1586               {
1587                 load_inside_penalty += store_inside_cost;
1588                 load_outside_penalty += store_outside_cost;
1589               }
1590 
1591           /* Calculate the penalty for leaving DR0 unaligned (by
1592              aligning the FIRST_STORE).  */
1593           store_inside_penalty = load_inside_cost;
1594           store_outside_penalty = load_outside_cost;
1595           for (i = 0;
1596 	       STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (
1597 		      DR_STMT (dr0))).iterate (i, &dr);
1598                i++)
1599             if (DR_IS_READ (dr))
1600               {
1601                 store_inside_penalty += load_inside_cost;
1602                 store_outside_penalty += load_outside_cost;
1603               }
1604             else
1605               {
1606                 store_inside_penalty += store_inside_cost;
1607                 store_outside_penalty += store_outside_cost;
1608               }
1609 
1610           if (load_inside_penalty > store_inside_penalty
1611               || (load_inside_penalty == store_inside_penalty
1612                   && load_outside_penalty > store_outside_penalty))
1613             dr0 = first_store;
1614         }
1615 
1616       /* In case there are only loads with different unknown misalignments, use
1617          peeling only if it may help to align other accesses in the loop.  */
1618       if (!first_store
1619 	  && !STMT_VINFO_SAME_ALIGN_REFS (
1620 		  vinfo_for_stmt (DR_STMT (dr0))).length ()
1621           && vect_supportable_dr_alignment (dr0, false)
1622               != dr_unaligned_supported)
1623         do_peeling = false;
1624     }
1625 
1626   if (do_peeling && !dr0)
1627     {
1628       /* Peeling is possible, but there is no data access that is not supported
1629          unless aligned. So we try to choose the best possible peeling.  */
1630 
1631       /* We should get here only if there are drs with known misalignment.  */
1632       gcc_assert (!all_misalignments_unknown);
1633 
1634       /* Choose the best peeling from the hash table.  */
1635       dr0 = vect_peeling_hash_choose_best_peeling (loop_vinfo, &npeel,
1636 						   &body_cost_vec);
1637       if (!dr0 || !npeel)
1638         do_peeling = false;
1639 
1640       /* If peeling by npeel will result in a remaining loop not iterating
1641          enough to be vectorized then do not peel.  */
1642       if (do_peeling
1643 	  && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1644 	  && (LOOP_VINFO_INT_NITERS (loop_vinfo)
1645 	      < LOOP_VINFO_VECT_FACTOR (loop_vinfo) + npeel))
1646 	do_peeling = false;
1647     }
1648 
1649   if (do_peeling)
1650     {
1651       stmt = DR_STMT (dr0);
1652       stmt_info = vinfo_for_stmt (stmt);
1653       vectype = STMT_VINFO_VECTYPE (stmt_info);
1654       nelements = TYPE_VECTOR_SUBPARTS (vectype);
1655 
1656       if (known_alignment_for_access_p (dr0))
1657         {
1658 	  bool negative = tree_int_cst_compare (DR_STEP (dr0),
1659 						size_zero_node) < 0;
1660           if (!npeel)
1661             {
1662               /* Since it's known at compile time, compute the number of
1663                  iterations in the peeled loop (the peeling factor) for use in
1664                  updating DR_MISALIGNMENT values.  The peeling factor is the
1665                  vectorization factor minus the misalignment as an element
1666                  count.  */
1667               mis = DR_MISALIGNMENT (dr0);
1668               mis /= GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr0))));
1669               npeel = ((negative ? mis - nelements : nelements - mis)
1670 		       & (nelements - 1));
1671             }
1672 
1673 	  /* For interleaved data access every iteration accesses all the
1674 	     members of the group, therefore we divide the number of iterations
1675 	     by the group size.  */
1676 	  stmt_info = vinfo_for_stmt (DR_STMT (dr0));
1677 	  if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1678 	    npeel /= GROUP_SIZE (stmt_info);
1679 
1680           if (dump_enabled_p ())
1681             dump_printf_loc (MSG_NOTE, vect_location,
1682                              "Try peeling by %d\n", npeel);
1683         }
1684 
1685       /* Ensure that all data refs can be vectorized after the peel.  */
1686       FOR_EACH_VEC_ELT (datarefs, i, dr)
1687         {
1688           int save_misalignment;
1689 
1690 	  if (dr == dr0)
1691 	    continue;
1692 
1693 	  stmt = DR_STMT (dr);
1694 	  stmt_info = vinfo_for_stmt (stmt);
1695 	  /* For interleaving, only the alignment of the first access
1696             matters.  */
1697 	  if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1698 	      && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1699 	    continue;
1700 
1701 	  /* Strided loads perform only component accesses, alignment is
1702 	     irrelevant for them.  */
1703 	  if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
1704 	    continue;
1705 
1706 	  save_misalignment = DR_MISALIGNMENT (dr);
1707 	  vect_update_misalignment_for_peel (dr, dr0, npeel);
1708 	  supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
1709 	  SET_DR_MISALIGNMENT (dr, save_misalignment);
1710 
1711 	  if (!supportable_dr_alignment)
1712 	    {
1713 	      do_peeling = false;
1714 	      break;
1715 	    }
1716 	}
1717 
1718       if (do_peeling && known_alignment_for_access_p (dr0) && npeel == 0)
1719         {
1720           stat = vect_verify_datarefs_alignment (loop_vinfo, NULL);
1721           if (!stat)
1722             do_peeling = false;
1723           else
1724 	    {
1725 	      body_cost_vec.release ();
1726 	      return stat;
1727 	    }
1728         }
1729 
1730       if (do_peeling)
1731         {
1732           unsigned max_allowed_peel
1733             = PARAM_VALUE (PARAM_VECT_MAX_PEELING_FOR_ALIGNMENT);
1734           if (max_allowed_peel != (unsigned)-1)
1735             {
1736               unsigned max_peel = npeel;
1737               if (max_peel == 0)
1738                 {
1739                   gimple dr_stmt = DR_STMT (dr0);
1740                   stmt_vec_info vinfo = vinfo_for_stmt (dr_stmt);
1741                   tree vtype = STMT_VINFO_VECTYPE (vinfo);
1742                   max_peel = TYPE_VECTOR_SUBPARTS (vtype) - 1;
1743                 }
1744               if (max_peel > max_allowed_peel)
1745                 {
1746                   do_peeling = false;
1747                   if (dump_enabled_p ())
1748                     dump_printf_loc (MSG_NOTE, vect_location,
1749                         "Disable peeling, max peels reached: %d\n", max_peel);
1750                 }
1751             }
1752         }
1753 
1754       if (do_peeling)
1755         {
1756           /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
1757              If the misalignment of DR_i is identical to that of dr0 then set
1758              DR_MISALIGNMENT (DR_i) to zero.  If the misalignment of DR_i and
1759              dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
1760              by the peeling factor times the element size of DR_i (MOD the
1761              vectorization factor times the size).  Otherwise, the
1762              misalignment of DR_i must be set to unknown.  */
1763 	  FOR_EACH_VEC_ELT (datarefs, i, dr)
1764 	    if (dr != dr0)
1765 	      vect_update_misalignment_for_peel (dr, dr0, npeel);
1766 
1767           LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0;
1768           if (npeel)
1769             LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
1770           else
1771             LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1772 	      = DR_MISALIGNMENT (dr0);
1773 	  SET_DR_MISALIGNMENT (dr0, 0);
1774 	  if (dump_enabled_p ())
1775             {
1776               dump_printf_loc (MSG_NOTE, vect_location,
1777                                "Alignment of access forced using peeling.\n");
1778               dump_printf_loc (MSG_NOTE, vect_location,
1779                                "Peeling for alignment will be applied.\n");
1780             }
1781 	  /* The inside-loop cost will be accounted for in vectorizable_load
1782 	     and vectorizable_store correctly with adjusted alignments.
1783 	     Drop the body_cst_vec on the floor here.  */
1784 	  body_cost_vec.release ();
1785 
1786 	  stat = vect_verify_datarefs_alignment (loop_vinfo, NULL);
1787 	  gcc_assert (stat);
1788           return stat;
1789         }
1790     }
1791 
1792   body_cost_vec.release ();
1793 
1794   /* (2) Versioning to force alignment.  */
1795 
1796   /* Try versioning if:
1797      1) optimize loop for speed
1798      2) there is at least one unsupported misaligned data ref with an unknown
1799         misalignment, and
1800      3) all misaligned data refs with a known misalignment are supported, and
1801      4) the number of runtime alignment checks is within reason.  */
1802 
1803   do_versioning =
1804 	optimize_loop_nest_for_speed_p (loop)
1805 	&& (!loop->inner); /* FORNOW */
1806 
1807   if (do_versioning)
1808     {
1809       FOR_EACH_VEC_ELT (datarefs, i, dr)
1810         {
1811 	  stmt = DR_STMT (dr);
1812 	  stmt_info = vinfo_for_stmt (stmt);
1813 
1814 	  /* For interleaving, only the alignment of the first access
1815 	     matters.  */
1816 	  if (aligned_access_p (dr)
1817 	      || (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1818 		  && GROUP_FIRST_ELEMENT (stmt_info) != stmt))
1819 	    continue;
1820 
1821 	  /* Strided loads perform only component accesses, alignment is
1822 	     irrelevant for them.  */
1823 	  if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
1824 	    continue;
1825 
1826 	  supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
1827 
1828           if (!supportable_dr_alignment)
1829             {
1830               gimple stmt;
1831               int mask;
1832               tree vectype;
1833 
1834               if (known_alignment_for_access_p (dr)
1835                   || LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
1836                      >= (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS))
1837                 {
1838                   do_versioning = false;
1839                   break;
1840                 }
1841 
1842               stmt = DR_STMT (dr);
1843               vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
1844               gcc_assert (vectype);
1845 
1846               /* The rightmost bits of an aligned address must be zeros.
1847                  Construct the mask needed for this test.  For example,
1848                  GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
1849                  mask must be 15 = 0xf. */
1850               mask = GET_MODE_SIZE (TYPE_MODE (vectype)) - 1;
1851 
1852               /* FORNOW: use the same mask to test all potentially unaligned
1853                  references in the loop.  The vectorizer currently supports
1854                  a single vector size, see the reference to
1855                  GET_MODE_NUNITS (TYPE_MODE (vectype)) where the
1856                  vectorization factor is computed.  */
1857               gcc_assert (!LOOP_VINFO_PTR_MASK (loop_vinfo)
1858                           || LOOP_VINFO_PTR_MASK (loop_vinfo) == mask);
1859               LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
1860               LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (
1861 		      DR_STMT (dr));
1862             }
1863         }
1864 
1865       /* Versioning requires at least one misaligned data reference.  */
1866       if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
1867         do_versioning = false;
1868       else if (!do_versioning)
1869         LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
1870     }
1871 
1872   if (do_versioning)
1873     {
1874       vec<gimple> may_misalign_stmts
1875         = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
1876       gimple stmt;
1877 
1878       /* It can now be assumed that the data references in the statements
1879          in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
1880          of the loop being vectorized.  */
1881       FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt)
1882         {
1883           stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1884           dr = STMT_VINFO_DATA_REF (stmt_info);
1885 	  SET_DR_MISALIGNMENT (dr, 0);
1886 	  if (dump_enabled_p ())
1887             dump_printf_loc (MSG_NOTE, vect_location,
1888                              "Alignment of access forced using versioning.\n");
1889         }
1890 
1891       if (dump_enabled_p ())
1892         dump_printf_loc (MSG_NOTE, vect_location,
1893                          "Versioning for alignment will be applied.\n");
1894 
1895       /* Peeling and versioning can't be done together at this time.  */
1896       gcc_assert (! (do_peeling && do_versioning));
1897 
1898       stat = vect_verify_datarefs_alignment (loop_vinfo, NULL);
1899       gcc_assert (stat);
1900       return stat;
1901     }
1902 
1903   /* This point is reached if neither peeling nor versioning is being done.  */
1904   gcc_assert (! (do_peeling || do_versioning));
1905 
1906   stat = vect_verify_datarefs_alignment (loop_vinfo, NULL);
1907   return stat;
1908 }
1909 
1910 
1911 /* Function vect_find_same_alignment_drs.
1912 
1913    Update group and alignment relations according to the chosen
1914    vectorization factor.  */
1915 
1916 static void
1917 vect_find_same_alignment_drs (struct data_dependence_relation *ddr,
1918 			      loop_vec_info loop_vinfo)
1919 {
1920   unsigned int i;
1921   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1922   int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1923   struct data_reference *dra = DDR_A (ddr);
1924   struct data_reference *drb = DDR_B (ddr);
1925   stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
1926   stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
1927   int dra_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dra))));
1928   int drb_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (drb))));
1929   lambda_vector dist_v;
1930   unsigned int loop_depth;
1931 
1932   if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
1933     return;
1934 
1935   if (dra == drb)
1936     return;
1937 
1938   if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
1939     return;
1940 
1941   /* Loop-based vectorization and known data dependence.  */
1942   if (DDR_NUM_DIST_VECTS (ddr) == 0)
1943     return;
1944 
1945   /* Data-dependence analysis reports a distance vector of zero
1946      for data-references that overlap only in the first iteration
1947      but have different sign step (see PR45764).
1948      So as a sanity check require equal DR_STEP.  */
1949   if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
1950     return;
1951 
1952   loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
1953   FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
1954     {
1955       int dist = dist_v[loop_depth];
1956 
1957       if (dump_enabled_p ())
1958 	dump_printf_loc (MSG_NOTE, vect_location,
1959 	                 "dependence distance  = %d.\n", dist);
1960 
1961       /* Same loop iteration.  */
1962       if (dist == 0
1963 	  || (dist % vectorization_factor == 0 && dra_size == drb_size))
1964 	{
1965 	  /* Two references with distance zero have the same alignment.  */
1966 	  STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_a).safe_push (drb);
1967 	  STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_b).safe_push (dra);
1968 	  if (dump_enabled_p ())
1969 	    {
1970 	      dump_printf_loc (MSG_NOTE, vect_location,
1971 	                       "accesses have the same alignment.\n");
1972 	      dump_printf (MSG_NOTE,
1973 	                   "dependence distance modulo vf == 0 between ");
1974 	      dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
1975 	      dump_printf (MSG_NOTE,  " and ");
1976 	      dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
1977 	      dump_printf (MSG_NOTE, "\n");
1978 	    }
1979 	}
1980     }
1981 }
1982 
1983 
1984 /* Function vect_analyze_data_refs_alignment
1985 
1986    Analyze the alignment of the data-references in the loop.
1987    Return FALSE if a data reference is found that cannot be vectorized.  */
1988 
1989 bool
1990 vect_analyze_data_refs_alignment (loop_vec_info loop_vinfo,
1991                                   bb_vec_info bb_vinfo)
1992 {
1993   if (dump_enabled_p ())
1994     dump_printf_loc (MSG_NOTE, vect_location,
1995                      "=== vect_analyze_data_refs_alignment ===\n");
1996 
1997   /* Mark groups of data references with same alignment using
1998      data dependence information.  */
1999   if (loop_vinfo)
2000     {
2001       vec<ddr_p> ddrs = LOOP_VINFO_DDRS (loop_vinfo);
2002       struct data_dependence_relation *ddr;
2003       unsigned int i;
2004 
2005       FOR_EACH_VEC_ELT (ddrs, i, ddr)
2006 	vect_find_same_alignment_drs (ddr, loop_vinfo);
2007     }
2008 
2009   if (!vect_compute_data_refs_alignment (loop_vinfo, bb_vinfo))
2010     {
2011       if (dump_enabled_p ())
2012 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2013 	                 "not vectorized: can't calculate alignment "
2014 	                 "for data ref.\n");
2015       return false;
2016     }
2017 
2018   return true;
2019 }
2020 
2021 
2022 /* Analyze groups of accesses: check that DR belongs to a group of
2023    accesses of legal size, step, etc.  Detect gaps, single element
2024    interleaving, and other special cases. Set grouped access info.
2025    Collect groups of strided stores for further use in SLP analysis.  */
2026 
2027 static bool
2028 vect_analyze_group_access (struct data_reference *dr)
2029 {
2030   tree step = DR_STEP (dr);
2031   tree scalar_type = TREE_TYPE (DR_REF (dr));
2032   HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
2033   gimple stmt = DR_STMT (dr);
2034   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2035   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2036   bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
2037   HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
2038   HOST_WIDE_INT groupsize, last_accessed_element = 1;
2039   bool slp_impossible = false;
2040   struct loop *loop = NULL;
2041 
2042   if (loop_vinfo)
2043     loop = LOOP_VINFO_LOOP (loop_vinfo);
2044 
2045   /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2046      size of the interleaving group (including gaps).  */
2047   groupsize = absu_hwi (dr_step) / type_size;
2048 
2049   /* Not consecutive access is possible only if it is a part of interleaving.  */
2050   if (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
2051     {
2052       /* Check if it this DR is a part of interleaving, and is a single
2053 	 element of the group that is accessed in the loop.  */
2054 
2055       /* Gaps are supported only for loads. STEP must be a multiple of the type
2056 	 size.  The size of the group must be a power of 2.  */
2057       if (DR_IS_READ (dr)
2058 	  && (dr_step % type_size) == 0
2059 	  && groupsize > 0
2060 	  && exact_log2 (groupsize) != -1)
2061 	{
2062 	  GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = stmt;
2063 	  GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2064 	  if (dump_enabled_p ())
2065 	    {
2066 	      dump_printf_loc (MSG_NOTE, vect_location,
2067 	                       "Detected single element interleaving ");
2068 	      dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2069 	      dump_printf (MSG_NOTE, " step ");
2070 	      dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2071 	      dump_printf (MSG_NOTE, "\n");
2072 	    }
2073 
2074 	  if (loop_vinfo)
2075 	    {
2076 	      if (dump_enabled_p ())
2077 		dump_printf_loc (MSG_NOTE, vect_location,
2078 		                 "Data access with gaps requires scalar "
2079 		                 "epilogue loop\n");
2080               if (loop->inner)
2081                 {
2082                   if (dump_enabled_p ())
2083                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2084                                      "Peeling for outer loop is not"
2085                                      " supported\n");
2086                   return false;
2087                 }
2088 
2089               LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = true;
2090 	    }
2091 
2092 	  return true;
2093 	}
2094 
2095       if (dump_enabled_p ())
2096         {
2097  	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2098 	                   "not consecutive access ");
2099 	  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
2100 	  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
2101         }
2102 
2103       if (bb_vinfo)
2104         {
2105           /* Mark the statement as unvectorizable.  */
2106           STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2107           return true;
2108         }
2109 
2110       return false;
2111     }
2112 
2113   if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) == stmt)
2114     {
2115       /* First stmt in the interleaving chain. Check the chain.  */
2116       gimple next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
2117       struct data_reference *data_ref = dr;
2118       unsigned int count = 1;
2119       tree prev_init = DR_INIT (data_ref);
2120       gimple prev = stmt;
2121       HOST_WIDE_INT diff, gaps = 0;
2122       unsigned HOST_WIDE_INT count_in_bytes;
2123 
2124       while (next)
2125         {
2126           /* Skip same data-refs.  In case that two or more stmts share
2127              data-ref (supported only for loads), we vectorize only the first
2128              stmt, and the rest get their vectorized loads from the first
2129              one.  */
2130           if (!tree_int_cst_compare (DR_INIT (data_ref),
2131                                      DR_INIT (STMT_VINFO_DATA_REF (
2132 						   vinfo_for_stmt (next)))))
2133             {
2134               if (DR_IS_WRITE (data_ref))
2135                 {
2136                   if (dump_enabled_p ())
2137                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2138                                      "Two store stmts share the same dr.\n");
2139                   return false;
2140                 }
2141 
2142               /* For load use the same data-ref load.  */
2143               GROUP_SAME_DR_STMT (vinfo_for_stmt (next)) = prev;
2144 
2145               prev = next;
2146               next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2147               continue;
2148             }
2149 
2150           prev = next;
2151           data_ref = STMT_VINFO_DATA_REF (vinfo_for_stmt (next));
2152 
2153 	  /* All group members have the same STEP by construction.  */
2154 	  gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0));
2155 
2156           /* Check that the distance between two accesses is equal to the type
2157              size. Otherwise, we have gaps.  */
2158           diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
2159                   - TREE_INT_CST_LOW (prev_init)) / type_size;
2160 	  if (diff != 1)
2161 	    {
2162 	      /* FORNOW: SLP of accesses with gaps is not supported.  */
2163 	      slp_impossible = true;
2164 	      if (DR_IS_WRITE (data_ref))
2165 		{
2166                   if (dump_enabled_p ())
2167                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2168                                      "interleaved store with gaps\n");
2169 		  return false;
2170 		}
2171 
2172               gaps += diff - 1;
2173 	    }
2174 
2175 	  last_accessed_element += diff;
2176 
2177           /* Store the gap from the previous member of the group. If there is no
2178              gap in the access, GROUP_GAP is always 1.  */
2179           GROUP_GAP (vinfo_for_stmt (next)) = diff;
2180 
2181           prev_init = DR_INIT (data_ref);
2182           next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2183           /* Count the number of data-refs in the chain.  */
2184           count++;
2185         }
2186 
2187       /* COUNT is the number of accesses found, we multiply it by the size of
2188          the type to get COUNT_IN_BYTES.  */
2189       count_in_bytes = type_size * count;
2190 
2191       /* Check that the size of the interleaving (including gaps) is not
2192          greater than STEP.  */
2193       if (dr_step != 0
2194 	  && absu_hwi (dr_step) < count_in_bytes + gaps * type_size)
2195         {
2196           if (dump_enabled_p ())
2197             {
2198               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2199                                "interleaving size is greater than step for ");
2200               dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
2201                                  DR_REF (dr));
2202               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
2203             }
2204           return false;
2205         }
2206 
2207       /* Check that the size of the interleaving is equal to STEP for stores,
2208          i.e., that there are no gaps.  */
2209       if (dr_step != 0
2210 	  && absu_hwi (dr_step) != count_in_bytes)
2211         {
2212           if (DR_IS_READ (dr))
2213             {
2214               slp_impossible = true;
2215               /* There is a gap after the last load in the group. This gap is a
2216                  difference between the groupsize and the number of elements.
2217 		 When there is no gap, this difference should be 0.  */
2218               GROUP_GAP (vinfo_for_stmt (stmt)) = groupsize - count;
2219             }
2220           else
2221             {
2222               if (dump_enabled_p ())
2223                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2224                                  "interleaved store with gaps\n");
2225               return false;
2226             }
2227         }
2228 
2229       /* Check that STEP is a multiple of type size.  */
2230       if (dr_step != 0
2231 	  && (dr_step % type_size) != 0)
2232         {
2233           if (dump_enabled_p ())
2234             {
2235               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2236                                "step is not a multiple of type size: step ");
2237               dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, step);
2238               dump_printf (MSG_MISSED_OPTIMIZATION, " size ");
2239               dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
2240                                  TYPE_SIZE_UNIT (scalar_type));
2241               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
2242             }
2243           return false;
2244         }
2245 
2246       if (groupsize == 0)
2247         groupsize = count;
2248 
2249       GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2250       if (dump_enabled_p ())
2251         dump_printf_loc (MSG_NOTE, vect_location,
2252                          "Detected interleaving of size %d\n", (int)groupsize);
2253 
2254       /* SLP: create an SLP data structure for every interleaving group of
2255 	 stores for further analysis in vect_analyse_slp.  */
2256       if (DR_IS_WRITE (dr) && !slp_impossible)
2257         {
2258           if (loop_vinfo)
2259             LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt);
2260           if (bb_vinfo)
2261             BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt);
2262         }
2263 
2264       /* There is a gap in the end of the group.  */
2265       if (groupsize - last_accessed_element > 0 && loop_vinfo)
2266 	{
2267 	  if (dump_enabled_p ())
2268 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2269 	                     "Data access with gaps requires scalar "
2270 	                     "epilogue loop\n");
2271           if (loop->inner)
2272             {
2273               if (dump_enabled_p ())
2274                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2275                                  "Peeling for outer loop is not supported\n");
2276               return false;
2277             }
2278 
2279           LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = true;
2280 	}
2281     }
2282 
2283   return true;
2284 }
2285 
2286 
2287 /* Analyze the access pattern of the data-reference DR.
2288    In case of non-consecutive accesses call vect_analyze_group_access() to
2289    analyze groups of accesses.  */
2290 
2291 static bool
2292 vect_analyze_data_ref_access (struct data_reference *dr)
2293 {
2294   tree step = DR_STEP (dr);
2295   tree scalar_type = TREE_TYPE (DR_REF (dr));
2296   gimple stmt = DR_STMT (dr);
2297   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2298   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2299   struct loop *loop = NULL;
2300 
2301   if (loop_vinfo)
2302     loop = LOOP_VINFO_LOOP (loop_vinfo);
2303 
2304   if (loop_vinfo && !step)
2305     {
2306       if (dump_enabled_p ())
2307 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2308 	                 "bad data-ref access in loop\n");
2309       return false;
2310     }
2311 
2312   /* Allow invariant loads in not nested loops.  */
2313   if (loop_vinfo && integer_zerop (step))
2314     {
2315       GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2316       if (nested_in_vect_loop_p (loop, stmt))
2317 	{
2318 	  if (dump_enabled_p ())
2319 	    dump_printf_loc (MSG_NOTE, vect_location,
2320 			     "zero step in inner loop of nest\n");
2321 	  return false;
2322 	}
2323       return DR_IS_READ (dr);
2324     }
2325 
2326   if (loop && nested_in_vect_loop_p (loop, stmt))
2327     {
2328       /* Interleaved accesses are not yet supported within outer-loop
2329         vectorization for references in the inner-loop.  */
2330       GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2331 
2332       /* For the rest of the analysis we use the outer-loop step.  */
2333       step = STMT_VINFO_DR_STEP (stmt_info);
2334       if (integer_zerop (step))
2335 	{
2336 	  if (dump_enabled_p ())
2337 	    dump_printf_loc (MSG_NOTE, vect_location,
2338 	                     "zero step in outer loop.\n");
2339 	  if (DR_IS_READ (dr))
2340   	    return true;
2341 	  else
2342 	    return false;
2343 	}
2344     }
2345 
2346   /* Consecutive?  */
2347   if (TREE_CODE (step) == INTEGER_CST)
2348     {
2349       HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
2350       if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))
2351 	  || (dr_step < 0
2352 	      && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
2353 	{
2354 	  /* Mark that it is not interleaving.  */
2355 	  GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2356 	  return true;
2357 	}
2358     }
2359 
2360   if (loop && nested_in_vect_loop_p (loop, stmt))
2361     {
2362       if (dump_enabled_p ())
2363 	dump_printf_loc (MSG_NOTE, vect_location,
2364 	                 "grouped access in outer loop.\n");
2365       return false;
2366     }
2367 
2368   /* Assume this is a DR handled by non-constant strided load case.  */
2369   if (TREE_CODE (step) != INTEGER_CST)
2370     return STMT_VINFO_STRIDE_LOAD_P (stmt_info);
2371 
2372   /* Not consecutive access - check if it's a part of interleaving group.  */
2373   return vect_analyze_group_access (dr);
2374 }
2375 
2376 
2377 
2378 /*  A helper function used in the comparator function to sort data
2379     references.  T1 and T2 are two data references to be compared.
2380     The function returns -1, 0, or 1.  */
2381 
2382 static int
2383 compare_tree (tree t1, tree t2)
2384 {
2385   int i, cmp;
2386   enum tree_code code;
2387   char tclass;
2388 
2389   if (t1 == t2)
2390     return 0;
2391   if (t1 == NULL)
2392     return -1;
2393   if (t2 == NULL)
2394     return 1;
2395 
2396   STRIP_NOPS (t1);
2397   STRIP_NOPS (t2);
2398 
2399   if (TREE_CODE (t1) != TREE_CODE (t2))
2400     return TREE_CODE (t1) < TREE_CODE (t2) ? -1 : 1;
2401 
2402   code = TREE_CODE (t1);
2403   switch (code)
2404     {
2405     /* For const values, we can just use hash values for comparisons.  */
2406     case INTEGER_CST:
2407     case REAL_CST:
2408     case FIXED_CST:
2409     case STRING_CST:
2410     case COMPLEX_CST:
2411     case VECTOR_CST:
2412       {
2413 	hashval_t h1 = iterative_hash_expr (t1, 0);
2414 	hashval_t h2 = iterative_hash_expr (t2, 0);
2415 	if (h1 != h2)
2416 	  return h1 < h2 ? -1 : 1;
2417 	break;
2418       }
2419 
2420     case SSA_NAME:
2421       cmp = compare_tree (SSA_NAME_VAR (t1), SSA_NAME_VAR (t2));
2422       if (cmp != 0)
2423 	return cmp;
2424 
2425       if (SSA_NAME_VERSION (t1) != SSA_NAME_VERSION (t2))
2426 	return SSA_NAME_VERSION (t1) < SSA_NAME_VERSION (t2) ? -1 : 1;
2427       break;
2428 
2429     default:
2430       tclass = TREE_CODE_CLASS (code);
2431 
2432       /* For var-decl, we could compare their UIDs.  */
2433       if (tclass == tcc_declaration)
2434 	{
2435 	  if (DECL_UID (t1) != DECL_UID (t2))
2436 	    return DECL_UID (t1) < DECL_UID (t2) ? -1 : 1;
2437 	  break;
2438 	}
2439 
2440       /* For expressions with operands, compare their operands recursively.  */
2441       for (i = TREE_OPERAND_LENGTH (t1) - 1; i >= 0; --i)
2442 	{
2443 	  cmp = compare_tree (TREE_OPERAND (t1, i), TREE_OPERAND (t2, i));
2444 	  if (cmp != 0)
2445 	    return cmp;
2446 	}
2447     }
2448 
2449   return 0;
2450 }
2451 
2452 
2453 /* Compare two data-references DRA and DRB to group them into chunks
2454    suitable for grouping.  */
2455 
2456 static int
2457 dr_group_sort_cmp (const void *dra_, const void *drb_)
2458 {
2459   data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_);
2460   data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_);
2461   int cmp;
2462 
2463   /* Stabilize sort.  */
2464   if (dra == drb)
2465     return 0;
2466 
2467   /* Ordering of DRs according to base.  */
2468   if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0))
2469     {
2470       cmp = compare_tree (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb));
2471       if (cmp != 0)
2472         return cmp;
2473     }
2474 
2475   /* And according to DR_OFFSET.  */
2476   if (!dr_equal_offsets_p (dra, drb))
2477     {
2478       cmp = compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
2479       if (cmp != 0)
2480         return cmp;
2481     }
2482 
2483   /* Put reads before writes.  */
2484   if (DR_IS_READ (dra) != DR_IS_READ (drb))
2485     return DR_IS_READ (dra) ? -1 : 1;
2486 
2487   /* Then sort after access size.  */
2488   if (!operand_equal_p (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2489 			TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))), 0))
2490     {
2491       cmp = compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2492                           TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
2493       if (cmp != 0)
2494         return cmp;
2495     }
2496 
2497   /* And after step.  */
2498   if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2499     {
2500       cmp = compare_tree (DR_STEP (dra), DR_STEP (drb));
2501       if (cmp != 0)
2502         return cmp;
2503     }
2504 
2505   /* Then sort after DR_INIT.  In case of identical DRs sort after stmt UID.  */
2506   cmp = tree_int_cst_compare (DR_INIT (dra), DR_INIT (drb));
2507   if (cmp == 0)
2508     return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
2509   return cmp;
2510 }
2511 
2512 /* Function vect_analyze_data_ref_accesses.
2513 
2514    Analyze the access pattern of all the data references in the loop.
2515 
2516    FORNOW: the only access pattern that is considered vectorizable is a
2517 	   simple step 1 (consecutive) access.
2518 
2519    FORNOW: handle only arrays and pointer accesses.  */
2520 
2521 bool
2522 vect_analyze_data_ref_accesses (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo)
2523 {
2524   unsigned int i;
2525   vec<data_reference_p> datarefs;
2526   struct data_reference *dr;
2527 
2528   if (dump_enabled_p ())
2529     dump_printf_loc (MSG_NOTE, vect_location,
2530                      "=== vect_analyze_data_ref_accesses ===\n");
2531 
2532   if (loop_vinfo)
2533     datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
2534   else
2535     datarefs = BB_VINFO_DATAREFS (bb_vinfo);
2536 
2537   if (datarefs.is_empty ())
2538     return true;
2539 
2540   /* Sort the array of datarefs to make building the interleaving chains
2541      linear.  Don't modify the original vector's order, it is needed for
2542      determining what dependencies are reversed.  */
2543   vec<data_reference_p> datarefs_copy = datarefs.copy ();
2544   datarefs_copy.qsort (dr_group_sort_cmp);
2545 
2546   /* Build the interleaving chains.  */
2547   for (i = 0; i < datarefs_copy.length () - 1;)
2548     {
2549       data_reference_p dra = datarefs_copy[i];
2550       stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
2551       stmt_vec_info lastinfo = NULL;
2552       for (i = i + 1; i < datarefs_copy.length (); ++i)
2553 	{
2554 	  data_reference_p drb = datarefs_copy[i];
2555 	  stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
2556 
2557 	  /* ???  Imperfect sorting (non-compatible types, non-modulo
2558 	     accesses, same accesses) can lead to a group to be artificially
2559 	     split here as we don't just skip over those.  If it really
2560 	     matters we can push those to a worklist and re-iterate
2561 	     over them.  The we can just skip ahead to the next DR here.  */
2562 
2563 	  /* Check that the data-refs have same first location (except init)
2564 	     and they are both either store or load (not load and store,
2565 	     not masked loads or stores).  */
2566 	  if (DR_IS_READ (dra) != DR_IS_READ (drb)
2567 	      || !operand_equal_p (DR_BASE_ADDRESS (dra),
2568 				   DR_BASE_ADDRESS (drb), 0)
2569 	      || !dr_equal_offsets_p (dra, drb)
2570 	      || !gimple_assign_single_p (DR_STMT (dra))
2571 	      || !gimple_assign_single_p (DR_STMT (drb)))
2572 	    break;
2573 
2574 	  /* Check that the data-refs have the same constant size and step.  */
2575 	  tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
2576 	  tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
2577 	  if (!tree_fits_uhwi_p (sza)
2578 	      || !tree_fits_uhwi_p (szb)
2579 	      || !tree_int_cst_equal (sza, szb)
2580 	      || !tree_fits_shwi_p (DR_STEP (dra))
2581 	      || !tree_fits_shwi_p (DR_STEP (drb))
2582 	      || !tree_int_cst_equal (DR_STEP (dra), DR_STEP (drb)))
2583 	    break;
2584 
2585 	  /* Do not place the same access in the interleaving chain twice.  */
2586 	  if (tree_int_cst_compare (DR_INIT (dra), DR_INIT (drb)) == 0)
2587 	    break;
2588 
2589 	  /* Check the types are compatible.
2590 	     ???  We don't distinguish this during sorting.  */
2591 	  if (!types_compatible_p (TREE_TYPE (DR_REF (dra)),
2592 				   TREE_TYPE (DR_REF (drb))))
2593 	    break;
2594 
2595 	  /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb).  */
2596 	  HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
2597 	  HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
2598 	  gcc_assert (init_a <= init_b);
2599 
2600 	  /* If init_b == init_a + the size of the type * k, we have an
2601 	     interleaving, and DRA is accessed before DRB.  */
2602 	  HOST_WIDE_INT type_size_a = tree_to_uhwi (sza);
2603 	  if ((init_b - init_a) % type_size_a != 0)
2604 	    break;
2605 
2606 	  /* The step (if not zero) is greater than the difference between
2607 	     data-refs' inits.  This splits groups into suitable sizes.  */
2608 	  HOST_WIDE_INT step = tree_to_shwi (DR_STEP (dra));
2609 	  if (step != 0 && step <= (init_b - init_a))
2610 	    break;
2611 
2612 	  if (dump_enabled_p ())
2613 	    {
2614 	      dump_printf_loc (MSG_NOTE, vect_location,
2615 			       "Detected interleaving ");
2616 	      dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
2617 	      dump_printf (MSG_NOTE,  " and ");
2618 	      dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
2619 	      dump_printf (MSG_NOTE, "\n");
2620 	    }
2621 
2622 	  /* Link the found element into the group list.  */
2623 	  if (!GROUP_FIRST_ELEMENT (stmtinfo_a))
2624 	    {
2625 	      GROUP_FIRST_ELEMENT (stmtinfo_a) = DR_STMT (dra);
2626 	      lastinfo = stmtinfo_a;
2627 	    }
2628 	  GROUP_FIRST_ELEMENT (stmtinfo_b) = DR_STMT (dra);
2629 	  GROUP_NEXT_ELEMENT (lastinfo) = DR_STMT (drb);
2630 	  lastinfo = stmtinfo_b;
2631 	}
2632     }
2633 
2634   FOR_EACH_VEC_ELT (datarefs_copy, i, dr)
2635     if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr)))
2636         && !vect_analyze_data_ref_access (dr))
2637       {
2638 	if (dump_enabled_p ())
2639 	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2640 	                   "not vectorized: complicated access pattern.\n");
2641 
2642         if (bb_vinfo)
2643           {
2644             /* Mark the statement as not vectorizable.  */
2645             STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2646             continue;
2647           }
2648         else
2649 	  {
2650 	    datarefs_copy.release ();
2651 	    return false;
2652 	  }
2653       }
2654 
2655   datarefs_copy.release ();
2656   return true;
2657 }
2658 
2659 
2660 /* Operator == between two dr_with_seg_len objects.
2661 
2662    This equality operator is used to make sure two data refs
2663    are the same one so that we will consider to combine the
2664    aliasing checks of those two pairs of data dependent data
2665    refs.  */
2666 
2667 static bool
2668 operator == (const dr_with_seg_len& d1,
2669 	     const dr_with_seg_len& d2)
2670 {
2671   return operand_equal_p (DR_BASE_ADDRESS (d1.dr),
2672 			  DR_BASE_ADDRESS (d2.dr), 0)
2673 	   && compare_tree (d1.offset, d2.offset) == 0
2674 	   && compare_tree (d1.seg_len, d2.seg_len) == 0;
2675 }
2676 
2677 /* Function comp_dr_with_seg_len_pair.
2678 
2679    Comparison function for sorting objects of dr_with_seg_len_pair_t
2680    so that we can combine aliasing checks in one scan.  */
2681 
2682 static int
2683 comp_dr_with_seg_len_pair (const void *p1_, const void *p2_)
2684 {
2685   const dr_with_seg_len_pair_t* p1 = (const dr_with_seg_len_pair_t *) p1_;
2686   const dr_with_seg_len_pair_t* p2 = (const dr_with_seg_len_pair_t *) p2_;
2687 
2688   const dr_with_seg_len &p11 = p1->first,
2689 			&p12 = p1->second,
2690 			&p21 = p2->first,
2691 			&p22 = p2->second;
2692 
2693   /* For DR pairs (a, b) and (c, d), we only consider to merge the alias checks
2694      if a and c have the same basic address snd step, and b and d have the same
2695      address and step.  Therefore, if any a&c or b&d don't have the same address
2696      and step, we don't care the order of those two pairs after sorting.  */
2697   int comp_res;
2698 
2699   if ((comp_res = compare_tree (DR_BASE_ADDRESS (p11.dr),
2700 				DR_BASE_ADDRESS (p21.dr))) != 0)
2701     return comp_res;
2702   if ((comp_res = compare_tree (DR_BASE_ADDRESS (p12.dr),
2703 				DR_BASE_ADDRESS (p22.dr))) != 0)
2704     return comp_res;
2705   if ((comp_res = compare_tree (DR_STEP (p11.dr), DR_STEP (p21.dr))) != 0)
2706     return comp_res;
2707   if ((comp_res = compare_tree (DR_STEP (p12.dr), DR_STEP (p22.dr))) != 0)
2708     return comp_res;
2709   if ((comp_res = compare_tree (p11.offset, p21.offset)) != 0)
2710     return comp_res;
2711   if ((comp_res = compare_tree (p12.offset, p22.offset)) != 0)
2712     return comp_res;
2713 
2714   return 0;
2715 }
2716 
2717 /* Function vect_vfa_segment_size.
2718 
2719    Create an expression that computes the size of segment
2720    that will be accessed for a data reference.  The functions takes into
2721    account that realignment loads may access one more vector.
2722 
2723    Input:
2724      DR: The data reference.
2725      LENGTH_FACTOR: segment length to consider.
2726 
2727    Return an expression whose value is the size of segment which will be
2728    accessed by DR.  */
2729 
2730 static tree
2731 vect_vfa_segment_size (struct data_reference *dr, tree length_factor)
2732 {
2733   tree segment_length;
2734 
2735   if (integer_zerop (DR_STEP (dr)))
2736     segment_length = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
2737   else
2738     segment_length = size_binop (MULT_EXPR,
2739 				 fold_convert (sizetype, DR_STEP (dr)),
2740 				 fold_convert (sizetype, length_factor));
2741 
2742   if (vect_supportable_dr_alignment (dr, false)
2743 	== dr_explicit_realign_optimized)
2744     {
2745       tree vector_size = TYPE_SIZE_UNIT
2746 			  (STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr))));
2747 
2748       segment_length = size_binop (PLUS_EXPR, segment_length, vector_size);
2749     }
2750   return segment_length;
2751 }
2752 
2753 /* Function vect_prune_runtime_alias_test_list.
2754 
2755    Prune a list of ddrs to be tested at run-time by versioning for alias.
2756    Merge several alias checks into one if possible.
2757    Return FALSE if resulting list of ddrs is longer then allowed by
2758    PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE.  */
2759 
2760 bool
2761 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
2762 {
2763   vec<ddr_p> may_alias_ddrs =
2764     LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
2765   vec<dr_with_seg_len_pair_t>& comp_alias_ddrs =
2766     LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
2767   int vect_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2768   tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
2769 
2770   ddr_p ddr;
2771   unsigned int i;
2772   tree length_factor;
2773 
2774   if (dump_enabled_p ())
2775     dump_printf_loc (MSG_NOTE, vect_location,
2776                      "=== vect_prune_runtime_alias_test_list ===\n");
2777 
2778   if (may_alias_ddrs.is_empty ())
2779     return true;
2780 
2781   /* Basically, for each pair of dependent data refs store_ptr_0
2782      and load_ptr_0, we create an expression:
2783 
2784      ((store_ptr_0 + store_segment_length_0) <= load_ptr_0)
2785      || (load_ptr_0 + load_segment_length_0) <= store_ptr_0))
2786 
2787      for aliasing checks.  However, in some cases we can decrease
2788      the number of checks by combining two checks into one.  For
2789      example, suppose we have another pair of data refs store_ptr_0
2790      and load_ptr_1, and if the following condition is satisfied:
2791 
2792      load_ptr_0 < load_ptr_1  &&
2793      load_ptr_1 - load_ptr_0 - load_segment_length_0 < store_segment_length_0
2794 
2795      (this condition means, in each iteration of vectorized loop,
2796      the accessed memory of store_ptr_0 cannot be between the memory
2797      of load_ptr_0 and load_ptr_1.)
2798 
2799      we then can use only the following expression to finish the
2800      alising checks between store_ptr_0 & load_ptr_0 and
2801      store_ptr_0 & load_ptr_1:
2802 
2803      ((store_ptr_0 + store_segment_length_0) <= load_ptr_0)
2804      || (load_ptr_1 + load_segment_length_1 <= store_ptr_0))
2805 
2806      Note that we only consider that load_ptr_0 and load_ptr_1 have the
2807      same basic address.  */
2808 
2809   comp_alias_ddrs.create (may_alias_ddrs.length ());
2810 
2811   /* First, we collect all data ref pairs for aliasing checks.  */
2812   FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr)
2813     {
2814       struct data_reference *dr_a, *dr_b;
2815       gimple dr_group_first_a, dr_group_first_b;
2816       tree segment_length_a, segment_length_b;
2817       gimple stmt_a, stmt_b;
2818 
2819       dr_a = DDR_A (ddr);
2820       stmt_a = DR_STMT (DDR_A (ddr));
2821       dr_group_first_a = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_a));
2822       if (dr_group_first_a)
2823 	{
2824 	  stmt_a = dr_group_first_a;
2825 	  dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_a));
2826 	}
2827 
2828       dr_b = DDR_B (ddr);
2829       stmt_b = DR_STMT (DDR_B (ddr));
2830       dr_group_first_b = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_b));
2831       if (dr_group_first_b)
2832 	{
2833 	  stmt_b = dr_group_first_b;
2834 	  dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_b));
2835 	}
2836 
2837       if (!operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0))
2838 	length_factor = scalar_loop_iters;
2839       else
2840 	length_factor = size_int (vect_factor);
2841       segment_length_a = vect_vfa_segment_size (dr_a, length_factor);
2842       segment_length_b = vect_vfa_segment_size (dr_b, length_factor);
2843 
2844       dr_with_seg_len_pair_t dr_with_seg_len_pair
2845 	  (dr_with_seg_len (dr_a, segment_length_a),
2846 	   dr_with_seg_len (dr_b, segment_length_b));
2847 
2848       if (compare_tree (DR_BASE_ADDRESS (dr_a), DR_BASE_ADDRESS (dr_b)) > 0)
2849 	std::swap (dr_with_seg_len_pair.first, dr_with_seg_len_pair.second);
2850 
2851       comp_alias_ddrs.safe_push (dr_with_seg_len_pair);
2852     }
2853 
2854   /* Second, we sort the collected data ref pairs so that we can scan
2855      them once to combine all possible aliasing checks.  */
2856   comp_alias_ddrs.qsort (comp_dr_with_seg_len_pair);
2857 
2858   /* Third, we scan the sorted dr pairs and check if we can combine
2859      alias checks of two neighbouring dr pairs.  */
2860   for (size_t i = 1; i < comp_alias_ddrs.length (); ++i)
2861     {
2862       /* Deal with two ddrs (dr_a1, dr_b1) and (dr_a2, dr_b2).  */
2863       dr_with_seg_len *dr_a1 = &comp_alias_ddrs[i-1].first,
2864 		      *dr_b1 = &comp_alias_ddrs[i-1].second,
2865 		      *dr_a2 = &comp_alias_ddrs[i].first,
2866 		      *dr_b2 = &comp_alias_ddrs[i].second;
2867 
2868       /* Remove duplicate data ref pairs.  */
2869       if (*dr_a1 == *dr_a2 && *dr_b1 == *dr_b2)
2870 	{
2871 	  if (dump_enabled_p ())
2872 	    {
2873 	      dump_printf_loc (MSG_NOTE, vect_location,
2874 			       "found equal ranges ");
2875 	      dump_generic_expr (MSG_NOTE, TDF_SLIM,
2876 				 DR_REF (dr_a1->dr));
2877 	      dump_printf (MSG_NOTE,  ", ");
2878 	      dump_generic_expr (MSG_NOTE, TDF_SLIM,
2879 				 DR_REF (dr_b1->dr));
2880 	      dump_printf (MSG_NOTE,  " and ");
2881 	      dump_generic_expr (MSG_NOTE, TDF_SLIM,
2882 				 DR_REF (dr_a2->dr));
2883 	      dump_printf (MSG_NOTE,  ", ");
2884 	      dump_generic_expr (MSG_NOTE, TDF_SLIM,
2885 				 DR_REF (dr_b2->dr));
2886 	      dump_printf (MSG_NOTE, "\n");
2887 	    }
2888 
2889 	  comp_alias_ddrs.ordered_remove (i--);
2890 	  continue;
2891 	}
2892 
2893       if (*dr_a1 == *dr_a2 || *dr_b1 == *dr_b2)
2894 	{
2895 	  /* We consider the case that DR_B1 and DR_B2 are same memrefs,
2896 	     and DR_A1 and DR_A2 are two consecutive memrefs.  */
2897 	  if (*dr_a1 == *dr_a2)
2898 	    {
2899 	      std::swap (dr_a1, dr_b1);
2900 	      std::swap (dr_a2, dr_b2);
2901 	    }
2902 
2903 	  if (!operand_equal_p (DR_BASE_ADDRESS (dr_a1->dr),
2904 				DR_BASE_ADDRESS (dr_a2->dr),
2905 				0)
2906 	      || !tree_fits_shwi_p (dr_a1->offset)
2907 	      || !tree_fits_shwi_p (dr_a2->offset))
2908 	    continue;
2909 
2910 	  /* Make sure dr_a1 starts left of dr_a2.  */
2911 	  if (tree_int_cst_lt (dr_a2->offset, dr_a1->offset))
2912 	    std::swap (*dr_a1, *dr_a2);
2913 
2914 	  unsigned HOST_WIDE_INT diff
2915 	    = tree_to_shwi (dr_a2->offset) - tree_to_shwi (dr_a1->offset);
2916 
2917 
2918 	  bool do_remove = false;
2919 
2920 	  /* If the left segment does not extend beyond the start of the
2921 	     right segment the new segment length is that of the right
2922 	     plus the segment distance.  */
2923 	  if (tree_fits_uhwi_p (dr_a1->seg_len)
2924 	      && compare_tree_int (dr_a1->seg_len, diff) <= 0)
2925 	    {
2926 	      dr_a1->seg_len = size_binop (PLUS_EXPR, dr_a2->seg_len,
2927 					   size_int (diff));
2928 	      do_remove = true;
2929 	    }
2930 	  /* Generally the new segment length is the maximum of the
2931 	     left segment size and the right segment size plus the distance.
2932 	     ???  We can also build tree MAX_EXPR here but it's not clear this
2933 	     is profitable.  */
2934 	  else if (tree_fits_uhwi_p (dr_a1->seg_len)
2935 		   && tree_fits_uhwi_p (dr_a2->seg_len))
2936 	    {
2937 	      unsigned HOST_WIDE_INT seg_len_a1 = tree_to_uhwi (dr_a1->seg_len);
2938 	      unsigned HOST_WIDE_INT seg_len_a2 = tree_to_uhwi (dr_a2->seg_len);
2939 	      dr_a1->seg_len = size_int (MAX (seg_len_a1, diff + seg_len_a2));
2940 	      do_remove = true;
2941 	    }
2942 	  /* Now we check if the following condition is satisfied:
2943 
2944 	     DIFF - SEGMENT_LENGTH_A < SEGMENT_LENGTH_B
2945 
2946 	     where DIFF = DR_A2->OFFSET - DR_A1->OFFSET.  However,
2947 	     SEGMENT_LENGTH_A or SEGMENT_LENGTH_B may not be constant so we
2948 	     have to make a best estimation.  We can get the minimum value
2949 	     of SEGMENT_LENGTH_B as a constant, represented by MIN_SEG_LEN_B,
2950 	     then either of the following two conditions can guarantee the
2951 	     one above:
2952 
2953 	     1: DIFF <= MIN_SEG_LEN_B
2954 	     2: DIFF - SEGMENT_LENGTH_A < MIN_SEG_LEN_B  */
2955 	  else
2956 	    {
2957 	      unsigned HOST_WIDE_INT min_seg_len_b
2958 		= (tree_fits_uhwi_p (dr_b1->seg_len)
2959 		   ? tree_to_uhwi (dr_b1->seg_len)
2960 		   : vect_factor);
2961 
2962 	      if (diff <= min_seg_len_b
2963 		  || (tree_fits_uhwi_p (dr_a1->seg_len)
2964 		      && diff - tree_to_uhwi (dr_a1->seg_len) < min_seg_len_b))
2965 		{
2966 		  dr_a1->seg_len = size_binop (PLUS_EXPR,
2967 					       dr_a2->seg_len, size_int (diff));
2968 		  do_remove = true;
2969 		}
2970 	    }
2971 
2972 	  if (do_remove)
2973 	    {
2974 	      if (dump_enabled_p ())
2975 		{
2976 		  dump_printf_loc (MSG_NOTE, vect_location,
2977 				   "merging ranges for ");
2978 		  dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a1->dr));
2979 		  dump_printf (MSG_NOTE,  ", ");
2980 		  dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b1->dr));
2981 		  dump_printf (MSG_NOTE,  " and ");
2982 		  dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a2->dr));
2983 		  dump_printf (MSG_NOTE,  ", ");
2984 		  dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b2->dr));
2985 		  dump_printf (MSG_NOTE, "\n");
2986 		}
2987 	      comp_alias_ddrs.ordered_remove (i--);
2988 	    }
2989 	}
2990     }
2991 
2992   dump_printf_loc (MSG_NOTE, vect_location,
2993 		   "improved number of alias checks from %d to %d\n",
2994 		   may_alias_ddrs.length (), comp_alias_ddrs.length ());
2995   if ((int) comp_alias_ddrs.length () >
2996       PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS))
2997     return false;
2998 
2999   return true;
3000 }
3001 
3002 /* Check whether a non-affine read in stmt is suitable for gather load
3003    and if so, return a builtin decl for that operation.  */
3004 
3005 tree
3006 vect_check_gather (gimple stmt, loop_vec_info loop_vinfo, tree *basep,
3007 		   tree *offp, int *scalep)
3008 {
3009   HOST_WIDE_INT scale = 1, pbitpos, pbitsize;
3010   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3011   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3012   struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3013   tree offtype = NULL_TREE;
3014   tree decl, base, off;
3015   machine_mode pmode;
3016   int punsignedp, pvolatilep;
3017 
3018   base = DR_REF (dr);
3019   /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
3020      see if we can use the def stmt of the address.  */
3021   if (is_gimple_call (stmt)
3022       && gimple_call_internal_p (stmt)
3023       && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
3024 	  || gimple_call_internal_fn (stmt) == IFN_MASK_STORE)
3025       && TREE_CODE (base) == MEM_REF
3026       && TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME
3027       && integer_zerop (TREE_OPERAND (base, 1))
3028       && !expr_invariant_in_loop_p (loop, TREE_OPERAND (base, 0)))
3029     {
3030       gimple def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (base, 0));
3031       if (is_gimple_assign (def_stmt)
3032 	  && gimple_assign_rhs_code (def_stmt) == ADDR_EXPR)
3033 	base = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
3034     }
3035 
3036   /* The gather builtins need address of the form
3037      loop_invariant + vector * {1, 2, 4, 8}
3038      or
3039      loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
3040      Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
3041      of loop invariants/SSA_NAMEs defined in the loop, with casts,
3042      multiplications and additions in it.  To get a vector, we need
3043      a single SSA_NAME that will be defined in the loop and will
3044      contain everything that is not loop invariant and that can be
3045      vectorized.  The following code attempts to find such a preexistng
3046      SSA_NAME OFF and put the loop invariants into a tree BASE
3047      that can be gimplified before the loop.  */
3048   base = get_inner_reference (base, &pbitsize, &pbitpos, &off,
3049 			      &pmode, &punsignedp, &pvolatilep, false);
3050   gcc_assert (base != NULL_TREE && (pbitpos % BITS_PER_UNIT) == 0);
3051 
3052   if (TREE_CODE (base) == MEM_REF)
3053     {
3054       if (!integer_zerop (TREE_OPERAND (base, 1)))
3055 	{
3056 	  if (off == NULL_TREE)
3057 	    {
3058 	      offset_int moff = mem_ref_offset (base);
3059 	      off = wide_int_to_tree (sizetype, moff);
3060 	    }
3061 	  else
3062 	    off = size_binop (PLUS_EXPR, off,
3063 			      fold_convert (sizetype, TREE_OPERAND (base, 1)));
3064 	}
3065       base = TREE_OPERAND (base, 0);
3066     }
3067   else
3068     base = build_fold_addr_expr (base);
3069 
3070   if (off == NULL_TREE)
3071     off = size_zero_node;
3072 
3073   /* If base is not loop invariant, either off is 0, then we start with just
3074      the constant offset in the loop invariant BASE and continue with base
3075      as OFF, otherwise give up.
3076      We could handle that case by gimplifying the addition of base + off
3077      into some SSA_NAME and use that as off, but for now punt.  */
3078   if (!expr_invariant_in_loop_p (loop, base))
3079     {
3080       if (!integer_zerop (off))
3081 	return NULL_TREE;
3082       off = base;
3083       base = size_int (pbitpos / BITS_PER_UNIT);
3084     }
3085   /* Otherwise put base + constant offset into the loop invariant BASE
3086      and continue with OFF.  */
3087   else
3088     {
3089       base = fold_convert (sizetype, base);
3090       base = size_binop (PLUS_EXPR, base, size_int (pbitpos / BITS_PER_UNIT));
3091     }
3092 
3093   /* OFF at this point may be either a SSA_NAME or some tree expression
3094      from get_inner_reference.  Try to peel off loop invariants from it
3095      into BASE as long as possible.  */
3096   STRIP_NOPS (off);
3097   while (offtype == NULL_TREE)
3098     {
3099       enum tree_code code;
3100       tree op0, op1, add = NULL_TREE;
3101 
3102       if (TREE_CODE (off) == SSA_NAME)
3103 	{
3104 	  gimple def_stmt = SSA_NAME_DEF_STMT (off);
3105 
3106 	  if (expr_invariant_in_loop_p (loop, off))
3107 	    return NULL_TREE;
3108 
3109 	  if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
3110 	    break;
3111 
3112 	  op0 = gimple_assign_rhs1 (def_stmt);
3113 	  code = gimple_assign_rhs_code (def_stmt);
3114 	  op1 = gimple_assign_rhs2 (def_stmt);
3115 	}
3116       else
3117 	{
3118 	  if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
3119 	    return NULL_TREE;
3120 	  code = TREE_CODE (off);
3121 	  extract_ops_from_tree (off, &code, &op0, &op1);
3122 	}
3123       switch (code)
3124 	{
3125 	case POINTER_PLUS_EXPR:
3126 	case PLUS_EXPR:
3127 	  if (expr_invariant_in_loop_p (loop, op0))
3128 	    {
3129 	      add = op0;
3130 	      off = op1;
3131 	    do_add:
3132 	      add = fold_convert (sizetype, add);
3133 	      if (scale != 1)
3134 		add = size_binop (MULT_EXPR, add, size_int (scale));
3135 	      base = size_binop (PLUS_EXPR, base, add);
3136 	      continue;
3137 	    }
3138 	  if (expr_invariant_in_loop_p (loop, op1))
3139 	    {
3140 	      add = op1;
3141 	      off = op0;
3142 	      goto do_add;
3143 	    }
3144 	  break;
3145 	case MINUS_EXPR:
3146 	  if (expr_invariant_in_loop_p (loop, op1))
3147 	    {
3148 	      add = fold_convert (sizetype, op1);
3149 	      add = size_binop (MINUS_EXPR, size_zero_node, add);
3150 	      off = op0;
3151 	      goto do_add;
3152 	    }
3153 	  break;
3154 	case MULT_EXPR:
3155 	  if (scale == 1 && tree_fits_shwi_p (op1))
3156 	    {
3157 	      scale = tree_to_shwi (op1);
3158 	      off = op0;
3159 	      continue;
3160 	    }
3161 	  break;
3162 	case SSA_NAME:
3163 	  off = op0;
3164 	  continue;
3165 	CASE_CONVERT:
3166 	  if (!POINTER_TYPE_P (TREE_TYPE (op0))
3167 	      && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
3168 	    break;
3169 	  if (TYPE_PRECISION (TREE_TYPE (op0))
3170 	      == TYPE_PRECISION (TREE_TYPE (off)))
3171 	    {
3172 	      off = op0;
3173 	      continue;
3174 	    }
3175 	  if (TYPE_PRECISION (TREE_TYPE (op0))
3176 	      < TYPE_PRECISION (TREE_TYPE (off)))
3177 	    {
3178 	      off = op0;
3179 	      offtype = TREE_TYPE (off);
3180 	      STRIP_NOPS (off);
3181 	      continue;
3182 	    }
3183 	  break;
3184 	default:
3185 	  break;
3186 	}
3187       break;
3188     }
3189 
3190   /* If at the end OFF still isn't a SSA_NAME or isn't
3191      defined in the loop, punt.  */
3192   if (TREE_CODE (off) != SSA_NAME
3193       || expr_invariant_in_loop_p (loop, off))
3194     return NULL_TREE;
3195 
3196   if (offtype == NULL_TREE)
3197     offtype = TREE_TYPE (off);
3198 
3199   decl = targetm.vectorize.builtin_gather (STMT_VINFO_VECTYPE (stmt_info),
3200 					   offtype, scale);
3201   if (decl == NULL_TREE)
3202     return NULL_TREE;
3203 
3204   if (basep)
3205     *basep = base;
3206   if (offp)
3207     *offp = off;
3208   if (scalep)
3209     *scalep = scale;
3210   return decl;
3211 }
3212 
3213 /* Function vect_analyze_data_refs.
3214 
3215   Find all the data references in the loop or basic block.
3216 
3217    The general structure of the analysis of data refs in the vectorizer is as
3218    follows:
3219    1- vect_analyze_data_refs(loop/bb): call
3220       compute_data_dependences_for_loop/bb to find and analyze all data-refs
3221       in the loop/bb and their dependences.
3222    2- vect_analyze_dependences(): apply dependence testing using ddrs.
3223    3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
3224    4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
3225 
3226 */
3227 
3228 bool
3229 vect_analyze_data_refs (loop_vec_info loop_vinfo,
3230 			bb_vec_info bb_vinfo,
3231 			int *min_vf, unsigned *n_stmts)
3232 {
3233   struct loop *loop = NULL;
3234   basic_block bb = NULL;
3235   unsigned int i;
3236   vec<data_reference_p> datarefs;
3237   struct data_reference *dr;
3238   tree scalar_type;
3239 
3240   if (dump_enabled_p ())
3241     dump_printf_loc (MSG_NOTE, vect_location,
3242                      "=== vect_analyze_data_refs ===\n");
3243 
3244   if (loop_vinfo)
3245     {
3246       basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
3247 
3248       loop = LOOP_VINFO_LOOP (loop_vinfo);
3249       datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
3250       if (!find_loop_nest (loop, &LOOP_VINFO_LOOP_NEST (loop_vinfo)))
3251 	{
3252 	  if (dump_enabled_p ())
3253 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3254 	                     "not vectorized: loop contains function calls"
3255 	                     " or data references that cannot be analyzed\n");
3256 	  return false;
3257 	}
3258 
3259       for (i = 0; i < loop->num_nodes; i++)
3260 	{
3261 	  gimple_stmt_iterator gsi;
3262 
3263 	  for (gsi = gsi_start_bb (bbs[i]); !gsi_end_p (gsi); gsi_next (&gsi))
3264 	    {
3265 	      gimple stmt = gsi_stmt (gsi);
3266 	      if (is_gimple_debug (stmt))
3267 		continue;
3268 	      ++*n_stmts;
3269 	      if (!find_data_references_in_stmt (loop, stmt, &datarefs))
3270 		{
3271 		  if (is_gimple_call (stmt) && loop->safelen)
3272 		    {
3273 		      tree fndecl = gimple_call_fndecl (stmt), op;
3274 		      if (fndecl != NULL_TREE)
3275 			{
3276 			  struct cgraph_node *node = cgraph_node::get (fndecl);
3277 			  if (node != NULL && node->simd_clones != NULL)
3278 			    {
3279 			      unsigned int j, n = gimple_call_num_args (stmt);
3280 			      for (j = 0; j < n; j++)
3281 				{
3282 				  op = gimple_call_arg (stmt, j);
3283 				  if (DECL_P (op)
3284 				      || (REFERENCE_CLASS_P (op)
3285 					  && get_base_address (op)))
3286 				    break;
3287 				}
3288 			      op = gimple_call_lhs (stmt);
3289 			      /* Ignore #pragma omp declare simd functions
3290 				 if they don't have data references in the
3291 				 call stmt itself.  */
3292 			      if (j == n
3293 				  && !(op
3294 				       && (DECL_P (op)
3295 					   || (REFERENCE_CLASS_P (op)
3296 					       && get_base_address (op)))))
3297 				continue;
3298 			    }
3299 			}
3300 		    }
3301 		  LOOP_VINFO_DATAREFS (loop_vinfo) = datarefs;
3302 		  if (dump_enabled_p ())
3303 		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3304 				     "not vectorized: loop contains function "
3305 				     "calls or data references that cannot "
3306 				     "be analyzed\n");
3307 		  return false;
3308 		}
3309 	    }
3310 	}
3311 
3312       LOOP_VINFO_DATAREFS (loop_vinfo) = datarefs;
3313     }
3314   else
3315     {
3316       gimple_stmt_iterator gsi;
3317 
3318       bb = BB_VINFO_BB (bb_vinfo);
3319       for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
3320 	{
3321 	  gimple stmt = gsi_stmt (gsi);
3322 	  if (is_gimple_debug (stmt))
3323 	    continue;
3324 	  ++*n_stmts;
3325 	  if (!find_data_references_in_stmt (NULL, stmt,
3326 					     &BB_VINFO_DATAREFS (bb_vinfo)))
3327 	    {
3328 	      /* Mark the rest of the basic-block as unvectorizable.  */
3329 	      for (; !gsi_end_p (gsi); gsi_next (&gsi))
3330 		{
3331 		  stmt = gsi_stmt (gsi);
3332 		  STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (stmt)) = false;
3333 		}
3334 	      break;
3335 	    }
3336 	}
3337 
3338       datarefs = BB_VINFO_DATAREFS (bb_vinfo);
3339     }
3340 
3341   /* Go through the data-refs, check that the analysis succeeded.  Update
3342      pointer from stmt_vec_info struct to DR and vectype.  */
3343 
3344   FOR_EACH_VEC_ELT (datarefs, i, dr)
3345     {
3346       gimple stmt;
3347       stmt_vec_info stmt_info;
3348       tree base, offset, init;
3349       bool gather = false;
3350       bool simd_lane_access = false;
3351       int vf;
3352 
3353 again:
3354       if (!dr || !DR_REF (dr))
3355         {
3356           if (dump_enabled_p ())
3357 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3358 	                     "not vectorized: unhandled data-ref\n");
3359           return false;
3360         }
3361 
3362       stmt = DR_STMT (dr);
3363       stmt_info = vinfo_for_stmt (stmt);
3364 
3365       /* Discard clobbers from the dataref vector.  We will remove
3366          clobber stmts during vectorization.  */
3367       if (gimple_clobber_p (stmt))
3368 	{
3369 	  free_data_ref (dr);
3370 	  if (i == datarefs.length () - 1)
3371 	    {
3372 	      datarefs.pop ();
3373 	      break;
3374 	    }
3375 	  datarefs.ordered_remove (i);
3376 	  dr = datarefs[i];
3377 	  goto again;
3378 	}
3379 
3380       /* Check that analysis of the data-ref succeeded.  */
3381       if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
3382 	  || !DR_STEP (dr))
3383         {
3384 	  bool maybe_gather
3385 	    = DR_IS_READ (dr)
3386 	      && !TREE_THIS_VOLATILE (DR_REF (dr))
3387 	      && targetm.vectorize.builtin_gather != NULL;
3388 	  bool maybe_simd_lane_access
3389 	    = loop_vinfo && loop->simduid;
3390 
3391 	  /* If target supports vector gather loads, or if this might be
3392 	     a SIMD lane access, see if they can't be used.  */
3393 	  if (loop_vinfo
3394 	      && (maybe_gather || maybe_simd_lane_access)
3395 	      && !nested_in_vect_loop_p (loop, stmt))
3396 	    {
3397 	      struct data_reference *newdr
3398 		= create_data_ref (NULL, loop_containing_stmt (stmt),
3399 				   DR_REF (dr), stmt, true);
3400 	      gcc_assert (newdr != NULL && DR_REF (newdr));
3401 	      if (DR_BASE_ADDRESS (newdr)
3402 		  && DR_OFFSET (newdr)
3403 		  && DR_INIT (newdr)
3404 		  && DR_STEP (newdr)
3405 		  && integer_zerop (DR_STEP (newdr)))
3406 		{
3407 		  if (maybe_simd_lane_access)
3408 		    {
3409 		      tree off = DR_OFFSET (newdr);
3410 		      STRIP_NOPS (off);
3411 		      if (TREE_CODE (DR_INIT (newdr)) == INTEGER_CST
3412 			  && TREE_CODE (off) == MULT_EXPR
3413 			  && tree_fits_uhwi_p (TREE_OPERAND (off, 1)))
3414 			{
3415 			  tree step = TREE_OPERAND (off, 1);
3416 			  off = TREE_OPERAND (off, 0);
3417 			  STRIP_NOPS (off);
3418 			  if (CONVERT_EXPR_P (off)
3419 			      && TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off,
3420 									  0)))
3421 				 < TYPE_PRECISION (TREE_TYPE (off)))
3422 			    off = TREE_OPERAND (off, 0);
3423 			  if (TREE_CODE (off) == SSA_NAME)
3424 			    {
3425 			      gimple def = SSA_NAME_DEF_STMT (off);
3426 			      tree reft = TREE_TYPE (DR_REF (newdr));
3427 			      if (is_gimple_call (def)
3428 				  && gimple_call_internal_p (def)
3429 				  && (gimple_call_internal_fn (def)
3430 				      == IFN_GOMP_SIMD_LANE))
3431 				{
3432 				  tree arg = gimple_call_arg (def, 0);
3433 				  gcc_assert (TREE_CODE (arg) == SSA_NAME);
3434 				  arg = SSA_NAME_VAR (arg);
3435 				  if (arg == loop->simduid
3436 				      /* For now.  */
3437 				      && tree_int_cst_equal
3438 					   (TYPE_SIZE_UNIT (reft),
3439 					    step))
3440 				    {
3441 				      DR_OFFSET (newdr) = ssize_int (0);
3442 				      DR_STEP (newdr) = step;
3443 				      DR_ALIGNED_TO (newdr)
3444 					= size_int (BIGGEST_ALIGNMENT);
3445 				      dr = newdr;
3446 				      simd_lane_access = true;
3447 				    }
3448 				}
3449 			    }
3450 			}
3451 		    }
3452 		  if (!simd_lane_access && maybe_gather)
3453 		    {
3454 		      dr = newdr;
3455 		      gather = true;
3456 		    }
3457 		}
3458 	      if (!gather && !simd_lane_access)
3459 		free_data_ref (newdr);
3460 	    }
3461 
3462 	  if (!gather && !simd_lane_access)
3463 	    {
3464 	      if (dump_enabled_p ())
3465 		{
3466 		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3467                                    "not vectorized: data ref analysis "
3468                                    "failed ");
3469 		  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3470                   dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3471 		}
3472 
3473 	      if (bb_vinfo)
3474 		break;
3475 
3476 	      return false;
3477 	    }
3478         }
3479 
3480       if (TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST)
3481         {
3482           if (dump_enabled_p ())
3483             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3484                              "not vectorized: base addr of dr is a "
3485                              "constant\n");
3486 
3487           if (bb_vinfo)
3488 	    break;
3489 
3490 	  if (gather || simd_lane_access)
3491 	    free_data_ref (dr);
3492 	  return false;
3493         }
3494 
3495       if (TREE_THIS_VOLATILE (DR_REF (dr)))
3496         {
3497           if (dump_enabled_p ())
3498             {
3499               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3500                                "not vectorized: volatile type ");
3501               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3502               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3503             }
3504 
3505           if (bb_vinfo)
3506 	    break;
3507 
3508           return false;
3509         }
3510 
3511       if (stmt_can_throw_internal (stmt))
3512         {
3513           if (dump_enabled_p ())
3514             {
3515               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3516                                "not vectorized: statement can throw an "
3517                                "exception ");
3518               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3519               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3520             }
3521 
3522           if (bb_vinfo)
3523 	    break;
3524 
3525 	  if (gather || simd_lane_access)
3526 	    free_data_ref (dr);
3527           return false;
3528         }
3529 
3530       if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
3531 	  && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
3532 	{
3533           if (dump_enabled_p ())
3534             {
3535               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3536                                "not vectorized: statement is bitfield "
3537                                "access ");
3538               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3539               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3540             }
3541 
3542           if (bb_vinfo)
3543 	    break;
3544 
3545 	  if (gather || simd_lane_access)
3546 	    free_data_ref (dr);
3547           return false;
3548 	}
3549 
3550       base = unshare_expr (DR_BASE_ADDRESS (dr));
3551       offset = unshare_expr (DR_OFFSET (dr));
3552       init = unshare_expr (DR_INIT (dr));
3553 
3554       if (is_gimple_call (stmt)
3555 	  && (!gimple_call_internal_p (stmt)
3556 	      || (gimple_call_internal_fn (stmt) != IFN_MASK_LOAD
3557 		  && gimple_call_internal_fn (stmt) != IFN_MASK_STORE)))
3558 	{
3559 	  if (dump_enabled_p ())
3560 	    {
3561 	      dump_printf_loc (MSG_MISSED_OPTIMIZATION,  vect_location,
3562 	                       "not vectorized: dr in a call ");
3563 	      dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3564 	      dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3565 	    }
3566 
3567 	  if (bb_vinfo)
3568 	    break;
3569 
3570 	  if (gather || simd_lane_access)
3571 	    free_data_ref (dr);
3572 	  return false;
3573 	}
3574 
3575       /* Update DR field in stmt_vec_info struct.  */
3576 
3577       /* If the dataref is in an inner-loop of the loop that is considered for
3578 	 for vectorization, we also want to analyze the access relative to
3579 	 the outer-loop (DR contains information only relative to the
3580 	 inner-most enclosing loop).  We do that by building a reference to the
3581 	 first location accessed by the inner-loop, and analyze it relative to
3582 	 the outer-loop.  */
3583       if (loop && nested_in_vect_loop_p (loop, stmt))
3584 	{
3585 	  tree outer_step, outer_base, outer_init;
3586 	  HOST_WIDE_INT pbitsize, pbitpos;
3587 	  tree poffset;
3588 	  machine_mode pmode;
3589 	  int punsignedp, pvolatilep;
3590 	  affine_iv base_iv, offset_iv;
3591 	  tree dinit;
3592 
3593 	  /* Build a reference to the first location accessed by the
3594 	     inner-loop: *(BASE+INIT).  (The first location is actually
3595 	     BASE+INIT+OFFSET, but we add OFFSET separately later).  */
3596           tree inner_base = build_fold_indirect_ref
3597                                 (fold_build_pointer_plus (base, init));
3598 
3599 	  if (dump_enabled_p ())
3600 	    {
3601 	      dump_printf_loc (MSG_NOTE, vect_location,
3602                                "analyze in outer-loop: ");
3603 	      dump_generic_expr (MSG_NOTE, TDF_SLIM, inner_base);
3604 	      dump_printf (MSG_NOTE, "\n");
3605 	    }
3606 
3607 	  outer_base = get_inner_reference (inner_base, &pbitsize, &pbitpos,
3608 		          &poffset, &pmode, &punsignedp, &pvolatilep, false);
3609 	  gcc_assert (outer_base != NULL_TREE);
3610 
3611 	  if (pbitpos % BITS_PER_UNIT != 0)
3612 	    {
3613 	      if (dump_enabled_p ())
3614 		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3615                                  "failed: bit offset alignment.\n");
3616 	      return false;
3617 	    }
3618 
3619 	  outer_base = build_fold_addr_expr (outer_base);
3620 	  if (!simple_iv (loop, loop_containing_stmt (stmt), outer_base,
3621                           &base_iv, false))
3622 	    {
3623 	      if (dump_enabled_p ())
3624 		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3625                                  "failed: evolution of base is not affine.\n");
3626 	      return false;
3627 	    }
3628 
3629 	  if (offset)
3630 	    {
3631 	      if (poffset)
3632 		poffset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset), offset,
3633                                        poffset);
3634 	      else
3635 		poffset = offset;
3636 	    }
3637 
3638 	  if (!poffset)
3639 	    {
3640 	      offset_iv.base = ssize_int (0);
3641 	      offset_iv.step = ssize_int (0);
3642 	    }
3643 	  else if (!simple_iv (loop, loop_containing_stmt (stmt), poffset,
3644                                &offset_iv, false))
3645 	    {
3646 	      if (dump_enabled_p ())
3647 	        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3648                                  "evolution of offset is not affine.\n");
3649 	      return false;
3650 	    }
3651 
3652 	  outer_init = ssize_int (pbitpos / BITS_PER_UNIT);
3653 	  split_constant_offset (base_iv.base, &base_iv.base, &dinit);
3654 	  outer_init =  size_binop (PLUS_EXPR, outer_init, dinit);
3655 	  split_constant_offset (offset_iv.base, &offset_iv.base, &dinit);
3656 	  outer_init =  size_binop (PLUS_EXPR, outer_init, dinit);
3657 
3658 	  outer_step = size_binop (PLUS_EXPR,
3659 				fold_convert (ssizetype, base_iv.step),
3660 				fold_convert (ssizetype, offset_iv.step));
3661 
3662 	  STMT_VINFO_DR_STEP (stmt_info) = outer_step;
3663 	  /* FIXME: Use canonicalize_base_object_address (base_iv.base); */
3664 	  STMT_VINFO_DR_BASE_ADDRESS (stmt_info) = base_iv.base;
3665 	  STMT_VINFO_DR_INIT (stmt_info) = outer_init;
3666 	  STMT_VINFO_DR_OFFSET (stmt_info) =
3667 				fold_convert (ssizetype, offset_iv.base);
3668 	  STMT_VINFO_DR_ALIGNED_TO (stmt_info) =
3669 				size_int (highest_pow2_factor (offset_iv.base));
3670 
3671           if (dump_enabled_p ())
3672 	    {
3673 	      dump_printf_loc (MSG_NOTE, vect_location,
3674                                "\touter base_address: ");
3675 	      dump_generic_expr (MSG_NOTE, TDF_SLIM,
3676                                  STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
3677 	      dump_printf (MSG_NOTE, "\n\touter offset from base address: ");
3678 	      dump_generic_expr (MSG_NOTE, TDF_SLIM,
3679                                  STMT_VINFO_DR_OFFSET (stmt_info));
3680 	      dump_printf (MSG_NOTE,
3681                            "\n\touter constant offset from base address: ");
3682 	      dump_generic_expr (MSG_NOTE, TDF_SLIM,
3683                                  STMT_VINFO_DR_INIT (stmt_info));
3684 	      dump_printf (MSG_NOTE, "\n\touter step: ");
3685 	      dump_generic_expr (MSG_NOTE, TDF_SLIM,
3686                                  STMT_VINFO_DR_STEP (stmt_info));
3687 	      dump_printf (MSG_NOTE, "\n\touter aligned to: ");
3688 	      dump_generic_expr (MSG_NOTE, TDF_SLIM,
3689                                  STMT_VINFO_DR_ALIGNED_TO (stmt_info));
3690 	      dump_printf (MSG_NOTE, "\n");
3691 	    }
3692 	}
3693 
3694       if (STMT_VINFO_DATA_REF (stmt_info))
3695         {
3696           if (dump_enabled_p ())
3697             {
3698               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3699                                "not vectorized: more than one data ref "
3700                                "in stmt: ");
3701               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3702               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3703             }
3704 
3705           if (bb_vinfo)
3706 	    break;
3707 
3708 	  if (gather || simd_lane_access)
3709 	    free_data_ref (dr);
3710           return false;
3711         }
3712 
3713       STMT_VINFO_DATA_REF (stmt_info) = dr;
3714       if (simd_lane_access)
3715 	{
3716 	  STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) = true;
3717 	  free_data_ref (datarefs[i]);
3718 	  datarefs[i] = dr;
3719 	}
3720 
3721       /* Set vectype for STMT.  */
3722       scalar_type = TREE_TYPE (DR_REF (dr));
3723       STMT_VINFO_VECTYPE (stmt_info)
3724 	= get_vectype_for_scalar_type (scalar_type);
3725       if (!STMT_VINFO_VECTYPE (stmt_info))
3726         {
3727           if (dump_enabled_p ())
3728             {
3729               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3730                                "not vectorized: no vectype for stmt: ");
3731               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3732               dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: ");
3733               dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS,
3734                                  scalar_type);
3735               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3736             }
3737 
3738           if (bb_vinfo)
3739 	    break;
3740 
3741 	  if (gather || simd_lane_access)
3742 	    {
3743 	      STMT_VINFO_DATA_REF (stmt_info) = NULL;
3744 	      if (gather)
3745 		free_data_ref (dr);
3746 	    }
3747 	  return false;
3748         }
3749       else
3750 	{
3751 	  if (dump_enabled_p ())
3752 	    {
3753 	      dump_printf_loc (MSG_NOTE, vect_location,
3754 			       "got vectype for stmt: ");
3755 	      dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
3756 	      dump_generic_expr (MSG_NOTE, TDF_SLIM,
3757 				 STMT_VINFO_VECTYPE (stmt_info));
3758 	      dump_printf (MSG_NOTE, "\n");
3759 	    }
3760 	}
3761 
3762       /* Adjust the minimal vectorization factor according to the
3763 	 vector type.  */
3764       vf = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
3765       if (vf > *min_vf)
3766 	*min_vf = vf;
3767 
3768       if (gather)
3769 	{
3770 	  tree off;
3771 
3772 	  gather = 0 != vect_check_gather (stmt, loop_vinfo, NULL, &off, NULL);
3773 	  if (gather
3774 	      && get_vectype_for_scalar_type (TREE_TYPE (off)) == NULL_TREE)
3775 	    gather = false;
3776 	  if (!gather)
3777 	    {
3778 	      STMT_VINFO_DATA_REF (stmt_info) = NULL;
3779 	      free_data_ref (dr);
3780 	      if (dump_enabled_p ())
3781 		{
3782 		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3783                                    "not vectorized: not suitable for gather "
3784                                    "load ");
3785 		  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3786                   dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3787 		}
3788 	      return false;
3789 	    }
3790 
3791 	  datarefs[i] = dr;
3792 	  STMT_VINFO_GATHER_P (stmt_info) = true;
3793 	}
3794       else if (loop_vinfo
3795 	       && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
3796 	{
3797 	  if (nested_in_vect_loop_p (loop, stmt)
3798 	      || !DR_IS_READ (dr))
3799 	    {
3800 	      if (dump_enabled_p ())
3801 		{
3802 		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3803                                    "not vectorized: not suitable for strided "
3804                                    "load ");
3805 		  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3806                   dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3807 		}
3808 	      return false;
3809 	    }
3810 	  STMT_VINFO_STRIDE_LOAD_P (stmt_info) = true;
3811 	}
3812     }
3813 
3814   /* If we stopped analysis at the first dataref we could not analyze
3815      when trying to vectorize a basic-block mark the rest of the datarefs
3816      as not vectorizable and truncate the vector of datarefs.  That
3817      avoids spending useless time in analyzing their dependence.  */
3818   if (i != datarefs.length ())
3819     {
3820       gcc_assert (bb_vinfo != NULL);
3821       for (unsigned j = i; j < datarefs.length (); ++j)
3822 	{
3823 	  data_reference_p dr = datarefs[j];
3824           STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
3825 	  free_data_ref (dr);
3826 	}
3827       datarefs.truncate (i);
3828     }
3829 
3830   return true;
3831 }
3832 
3833 
3834 /* Function vect_get_new_vect_var.
3835 
3836    Returns a name for a new variable.  The current naming scheme appends the
3837    prefix "vect_" or "vect_p" (depending on the value of VAR_KIND) to
3838    the name of vectorizer generated variables, and appends that to NAME if
3839    provided.  */
3840 
3841 tree
3842 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
3843 {
3844   const char *prefix;
3845   tree new_vect_var;
3846 
3847   switch (var_kind)
3848   {
3849   case vect_simple_var:
3850     prefix = "vect";
3851     break;
3852   case vect_scalar_var:
3853     prefix = "stmp";
3854     break;
3855   case vect_pointer_var:
3856     prefix = "vectp";
3857     break;
3858   default:
3859     gcc_unreachable ();
3860   }
3861 
3862   if (name)
3863     {
3864       char* tmp = concat (prefix, "_", name, NULL);
3865       new_vect_var = create_tmp_reg (type, tmp);
3866       free (tmp);
3867     }
3868   else
3869     new_vect_var = create_tmp_reg (type, prefix);
3870 
3871   return new_vect_var;
3872 }
3873 
3874 /* Duplicate ptr info and set alignment/misaligment on NAME from DR.  */
3875 
3876 static void
3877 vect_duplicate_ssa_name_ptr_info (tree name, data_reference *dr,
3878 				  stmt_vec_info stmt_info)
3879 {
3880   duplicate_ssa_name_ptr_info (name, DR_PTR_INFO (dr));
3881   unsigned int align = TYPE_ALIGN_UNIT (STMT_VINFO_VECTYPE (stmt_info));
3882   int misalign = DR_MISALIGNMENT (dr);
3883   if (misalign == -1)
3884     mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name));
3885   else
3886     set_ptr_info_alignment (SSA_NAME_PTR_INFO (name), align, misalign);
3887 }
3888 
3889 /* Function vect_create_addr_base_for_vector_ref.
3890 
3891    Create an expression that computes the address of the first memory location
3892    that will be accessed for a data reference.
3893 
3894    Input:
3895    STMT: The statement containing the data reference.
3896    NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
3897    OFFSET: Optional. If supplied, it is be added to the initial address.
3898    LOOP:    Specify relative to which loop-nest should the address be computed.
3899             For example, when the dataref is in an inner-loop nested in an
3900 	    outer-loop that is now being vectorized, LOOP can be either the
3901 	    outer-loop, or the inner-loop.  The first memory location accessed
3902 	    by the following dataref ('in' points to short):
3903 
3904 		for (i=0; i<N; i++)
3905 		   for (j=0; j<M; j++)
3906 		     s += in[i+j]
3907 
3908 	    is as follows:
3909 	    if LOOP=i_loop:	&in		(relative to i_loop)
3910 	    if LOOP=j_loop: 	&in+i*2B	(relative to j_loop)
3911    BYTE_OFFSET: Optional, defaulted to NULL.  If supplied, it is added to the
3912 	    initial address.  Unlike OFFSET, which is number of elements to
3913 	    be added, BYTE_OFFSET is measured in bytes.
3914 
3915    Output:
3916    1. Return an SSA_NAME whose value is the address of the memory location of
3917       the first vector of the data reference.
3918    2. If new_stmt_list is not NULL_TREE after return then the caller must insert
3919       these statement(s) which define the returned SSA_NAME.
3920 
3921    FORNOW: We are only handling array accesses with step 1.  */
3922 
3923 tree
3924 vect_create_addr_base_for_vector_ref (gimple stmt,
3925 				      gimple_seq *new_stmt_list,
3926 				      tree offset,
3927 				      struct loop *loop,
3928 				      tree byte_offset)
3929 {
3930   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3931   struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3932   tree data_ref_base;
3933   const char *base_name;
3934   tree addr_base;
3935   tree dest;
3936   gimple_seq seq = NULL;
3937   tree base_offset;
3938   tree init;
3939   tree vect_ptr_type;
3940   tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
3941   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3942 
3943   if (loop_vinfo && loop && loop != (gimple_bb (stmt))->loop_father)
3944     {
3945       struct loop *outer_loop = LOOP_VINFO_LOOP (loop_vinfo);
3946 
3947       gcc_assert (nested_in_vect_loop_p (outer_loop, stmt));
3948 
3949       data_ref_base = unshare_expr (STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
3950       base_offset = unshare_expr (STMT_VINFO_DR_OFFSET (stmt_info));
3951       init = unshare_expr (STMT_VINFO_DR_INIT (stmt_info));
3952     }
3953   else
3954     {
3955       data_ref_base = unshare_expr (DR_BASE_ADDRESS (dr));
3956       base_offset = unshare_expr (DR_OFFSET (dr));
3957       init = unshare_expr (DR_INIT (dr));
3958     }
3959 
3960   if (loop_vinfo)
3961     base_name = get_name (data_ref_base);
3962   else
3963     {
3964       base_offset = ssize_int (0);
3965       init = ssize_int (0);
3966       base_name = get_name (DR_REF (dr));
3967     }
3968 
3969   /* Create base_offset */
3970   base_offset = size_binop (PLUS_EXPR,
3971 			    fold_convert (sizetype, base_offset),
3972 			    fold_convert (sizetype, init));
3973 
3974   if (offset)
3975     {
3976       offset = fold_build2 (MULT_EXPR, sizetype,
3977 			    fold_convert (sizetype, offset), step);
3978       base_offset = fold_build2 (PLUS_EXPR, sizetype,
3979 				 base_offset, offset);
3980     }
3981   if (byte_offset)
3982     {
3983       byte_offset = fold_convert (sizetype, byte_offset);
3984       base_offset = fold_build2 (PLUS_EXPR, sizetype,
3985 				 base_offset, byte_offset);
3986     }
3987 
3988   /* base + base_offset */
3989   if (loop_vinfo)
3990     addr_base = fold_build_pointer_plus (data_ref_base, base_offset);
3991   else
3992     {
3993       addr_base = build1 (ADDR_EXPR,
3994 			  build_pointer_type (TREE_TYPE (DR_REF (dr))),
3995 			  unshare_expr (DR_REF (dr)));
3996     }
3997 
3998   vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));
3999   addr_base = fold_convert (vect_ptr_type, addr_base);
4000   dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name);
4001   addr_base = force_gimple_operand (addr_base, &seq, false, dest);
4002   gimple_seq_add_seq (new_stmt_list, seq);
4003 
4004   if (DR_PTR_INFO (dr)
4005       && TREE_CODE (addr_base) == SSA_NAME)
4006     {
4007       vect_duplicate_ssa_name_ptr_info (addr_base, dr, stmt_info);
4008       if (offset || byte_offset)
4009 	mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr_base));
4010     }
4011 
4012   if (dump_enabled_p ())
4013     {
4014       dump_printf_loc (MSG_NOTE, vect_location, "created ");
4015       dump_generic_expr (MSG_NOTE, TDF_SLIM, addr_base);
4016       dump_printf (MSG_NOTE, "\n");
4017     }
4018 
4019   return addr_base;
4020 }
4021 
4022 
4023 /* Function vect_create_data_ref_ptr.
4024 
4025    Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
4026    location accessed in the loop by STMT, along with the def-use update
4027    chain to appropriately advance the pointer through the loop iterations.
4028    Also set aliasing information for the pointer.  This pointer is used by
4029    the callers to this function to create a memory reference expression for
4030    vector load/store access.
4031 
4032    Input:
4033    1. STMT: a stmt that references memory. Expected to be of the form
4034          GIMPLE_ASSIGN <name, data-ref> or
4035 	 GIMPLE_ASSIGN <data-ref, name>.
4036    2. AGGR_TYPE: the type of the reference, which should be either a vector
4037         or an array.
4038    3. AT_LOOP: the loop where the vector memref is to be created.
4039    4. OFFSET (optional): an offset to be added to the initial address accessed
4040         by the data-ref in STMT.
4041    5. BSI: location where the new stmts are to be placed if there is no loop
4042    6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
4043         pointing to the initial address.
4044    7. BYTE_OFFSET (optional, defaults to NULL): a byte offset to be added
4045 	to the initial address accessed by the data-ref in STMT.  This is
4046 	similar to OFFSET, but OFFSET is counted in elements, while BYTE_OFFSET
4047 	in bytes.
4048 
4049    Output:
4050    1. Declare a new ptr to vector_type, and have it point to the base of the
4051       data reference (initial addressed accessed by the data reference).
4052       For example, for vector of type V8HI, the following code is generated:
4053 
4054       v8hi *ap;
4055       ap = (v8hi *)initial_address;
4056 
4057       if OFFSET is not supplied:
4058          initial_address = &a[init];
4059       if OFFSET is supplied:
4060          initial_address = &a[init + OFFSET];
4061       if BYTE_OFFSET is supplied:
4062 	 initial_address = &a[init] + BYTE_OFFSET;
4063 
4064       Return the initial_address in INITIAL_ADDRESS.
4065 
4066    2. If ONLY_INIT is true, just return the initial pointer.  Otherwise, also
4067       update the pointer in each iteration of the loop.
4068 
4069       Return the increment stmt that updates the pointer in PTR_INCR.
4070 
4071    3. Set INV_P to true if the access pattern of the data reference in the
4072       vectorized loop is invariant.  Set it to false otherwise.
4073 
4074    4. Return the pointer.  */
4075 
4076 tree
4077 vect_create_data_ref_ptr (gimple stmt, tree aggr_type, struct loop *at_loop,
4078 			  tree offset, tree *initial_address,
4079 			  gimple_stmt_iterator *gsi, gimple *ptr_incr,
4080 			  bool only_init, bool *inv_p, tree byte_offset)
4081 {
4082   const char *base_name;
4083   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4084   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4085   struct loop *loop = NULL;
4086   bool nested_in_vect_loop = false;
4087   struct loop *containing_loop = NULL;
4088   tree aggr_ptr_type;
4089   tree aggr_ptr;
4090   tree new_temp;
4091   gimple vec_stmt;
4092   gimple_seq new_stmt_list = NULL;
4093   edge pe = NULL;
4094   basic_block new_bb;
4095   tree aggr_ptr_init;
4096   struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4097   tree aptr;
4098   gimple_stmt_iterator incr_gsi;
4099   bool insert_after;
4100   tree indx_before_incr, indx_after_incr;
4101   gimple incr;
4102   tree step;
4103   bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
4104 
4105   gcc_assert (TREE_CODE (aggr_type) == ARRAY_TYPE
4106 	      || TREE_CODE (aggr_type) == VECTOR_TYPE);
4107 
4108   if (loop_vinfo)
4109     {
4110       loop = LOOP_VINFO_LOOP (loop_vinfo);
4111       nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
4112       containing_loop = (gimple_bb (stmt))->loop_father;
4113       pe = loop_preheader_edge (loop);
4114     }
4115   else
4116     {
4117       gcc_assert (bb_vinfo);
4118       only_init = true;
4119       *ptr_incr = NULL;
4120     }
4121 
4122   /* Check the step (evolution) of the load in LOOP, and record
4123      whether it's invariant.  */
4124   if (nested_in_vect_loop)
4125     step = STMT_VINFO_DR_STEP (stmt_info);
4126   else
4127     step = DR_STEP (STMT_VINFO_DATA_REF (stmt_info));
4128 
4129   if (integer_zerop (step))
4130     *inv_p = true;
4131   else
4132     *inv_p = false;
4133 
4134   /* Create an expression for the first address accessed by this load
4135      in LOOP.  */
4136   base_name = get_name (DR_BASE_ADDRESS (dr));
4137 
4138   if (dump_enabled_p ())
4139     {
4140       tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
4141       dump_printf_loc (MSG_NOTE, vect_location,
4142                        "create %s-pointer variable to type: ",
4143 		       get_tree_code_name (TREE_CODE (aggr_type)));
4144       dump_generic_expr (MSG_NOTE, TDF_SLIM, aggr_type);
4145       if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
4146         dump_printf (MSG_NOTE, "  vectorizing an array ref: ");
4147       else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)
4148         dump_printf (MSG_NOTE, "  vectorizing a vector ref: ");
4149       else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
4150         dump_printf (MSG_NOTE, "  vectorizing a record based array ref: ");
4151       else
4152         dump_printf (MSG_NOTE, "  vectorizing a pointer ref: ");
4153       dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_BASE_OBJECT (dr));
4154       dump_printf (MSG_NOTE, "\n");
4155     }
4156 
4157   /* (1) Create the new aggregate-pointer variable.
4158      Vector and array types inherit the alias set of their component
4159      type by default so we need to use a ref-all pointer if the data
4160      reference does not conflict with the created aggregated data
4161      reference because it is not addressable.  */
4162   bool need_ref_all = false;
4163   if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4164 			      get_alias_set (DR_REF (dr))))
4165     need_ref_all = true;
4166   /* Likewise for any of the data references in the stmt group.  */
4167   else if (STMT_VINFO_GROUP_SIZE (stmt_info) > 1)
4168     {
4169       gimple orig_stmt = STMT_VINFO_GROUP_FIRST_ELEMENT (stmt_info);
4170       do
4171 	{
4172 	  stmt_vec_info sinfo = vinfo_for_stmt (orig_stmt);
4173 	  struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo);
4174 	  if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4175 				      get_alias_set (DR_REF (sdr))))
4176 	    {
4177 	      need_ref_all = true;
4178 	      break;
4179 	    }
4180 	  orig_stmt = STMT_VINFO_GROUP_NEXT_ELEMENT (sinfo);
4181 	}
4182       while (orig_stmt);
4183     }
4184   aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode,
4185 					       need_ref_all);
4186   aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name);
4187 
4188 
4189   /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
4190      vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
4191      def-use update cycles for the pointer: one relative to the outer-loop
4192      (LOOP), which is what steps (3) and (4) below do.  The other is relative
4193      to the inner-loop (which is the inner-most loop containing the dataref),
4194      and this is done be step (5) below.
4195 
4196      When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
4197      inner-most loop, and so steps (3),(4) work the same, and step (5) is
4198      redundant.  Steps (3),(4) create the following:
4199 
4200 	vp0 = &base_addr;
4201 	LOOP:	vp1 = phi(vp0,vp2)
4202 		...
4203 		...
4204 		vp2 = vp1 + step
4205 		goto LOOP
4206 
4207      If there is an inner-loop nested in loop, then step (5) will also be
4208      applied, and an additional update in the inner-loop will be created:
4209 
4210 	vp0 = &base_addr;
4211 	LOOP:   vp1 = phi(vp0,vp2)
4212 		...
4213         inner:     vp3 = phi(vp1,vp4)
4214 	           vp4 = vp3 + inner_step
4215 	           if () goto inner
4216 		...
4217 		vp2 = vp1 + step
4218 		if () goto LOOP   */
4219 
4220   /* (2) Calculate the initial address of the aggregate-pointer, and set
4221      the aggregate-pointer to point to it before the loop.  */
4222 
4223   /* Create: (&(base[init_val+offset]+byte_offset) in the loop preheader.  */
4224 
4225   new_temp = vect_create_addr_base_for_vector_ref (stmt, &new_stmt_list,
4226 						   offset, loop, byte_offset);
4227   if (new_stmt_list)
4228     {
4229       if (pe)
4230         {
4231           new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
4232           gcc_assert (!new_bb);
4233         }
4234       else
4235         gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);
4236     }
4237 
4238   *initial_address = new_temp;
4239 
4240   /* Create: p = (aggr_type *) initial_base  */
4241   if (TREE_CODE (new_temp) != SSA_NAME
4242       || !useless_type_conversion_p (aggr_ptr_type, TREE_TYPE (new_temp)))
4243     {
4244       vec_stmt = gimple_build_assign (aggr_ptr,
4245 				      fold_convert (aggr_ptr_type, new_temp));
4246       aggr_ptr_init = make_ssa_name (aggr_ptr, vec_stmt);
4247       /* Copy the points-to information if it exists. */
4248       if (DR_PTR_INFO (dr))
4249 	vect_duplicate_ssa_name_ptr_info (aggr_ptr_init, dr, stmt_info);
4250       gimple_assign_set_lhs (vec_stmt, aggr_ptr_init);
4251       if (pe)
4252 	{
4253 	  new_bb = gsi_insert_on_edge_immediate (pe, vec_stmt);
4254 	  gcc_assert (!new_bb);
4255 	}
4256       else
4257 	gsi_insert_before (gsi, vec_stmt, GSI_SAME_STMT);
4258     }
4259   else
4260     aggr_ptr_init = new_temp;
4261 
4262   /* (3) Handle the updating of the aggregate-pointer inside the loop.
4263      This is needed when ONLY_INIT is false, and also when AT_LOOP is the
4264      inner-loop nested in LOOP (during outer-loop vectorization).  */
4265 
4266   /* No update in loop is required.  */
4267   if (only_init && (!loop_vinfo || at_loop == loop))
4268     aptr = aggr_ptr_init;
4269   else
4270     {
4271       /* The step of the aggregate pointer is the type size.  */
4272       tree iv_step = TYPE_SIZE_UNIT (aggr_type);
4273       /* One exception to the above is when the scalar step of the load in
4274 	 LOOP is zero. In this case the step here is also zero.  */
4275       if (*inv_p)
4276 	iv_step = size_zero_node;
4277       else if (tree_int_cst_sgn (step) == -1)
4278 	iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
4279 
4280       standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4281 
4282       create_iv (aggr_ptr_init,
4283 		 fold_convert (aggr_ptr_type, iv_step),
4284 		 aggr_ptr, loop, &incr_gsi, insert_after,
4285 		 &indx_before_incr, &indx_after_incr);
4286       incr = gsi_stmt (incr_gsi);
4287       set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo, NULL));
4288 
4289       /* Copy the points-to information if it exists. */
4290       if (DR_PTR_INFO (dr))
4291 	{
4292 	  vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr, stmt_info);
4293 	  vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr, stmt_info);
4294 	}
4295       if (ptr_incr)
4296 	*ptr_incr = incr;
4297 
4298       aptr = indx_before_incr;
4299     }
4300 
4301   if (!nested_in_vect_loop || only_init)
4302     return aptr;
4303 
4304 
4305   /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
4306      nested in LOOP, if exists.  */
4307 
4308   gcc_assert (nested_in_vect_loop);
4309   if (!only_init)
4310     {
4311       standard_iv_increment_position (containing_loop, &incr_gsi,
4312 				      &insert_after);
4313       create_iv (aptr, fold_convert (aggr_ptr_type, DR_STEP (dr)), aggr_ptr,
4314 		 containing_loop, &incr_gsi, insert_after, &indx_before_incr,
4315 		 &indx_after_incr);
4316       incr = gsi_stmt (incr_gsi);
4317       set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo, NULL));
4318 
4319       /* Copy the points-to information if it exists. */
4320       if (DR_PTR_INFO (dr))
4321 	{
4322 	  vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr, stmt_info);
4323 	  vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr, stmt_info);
4324 	}
4325       if (ptr_incr)
4326 	*ptr_incr = incr;
4327 
4328       return indx_before_incr;
4329     }
4330   else
4331     gcc_unreachable ();
4332 }
4333 
4334 
4335 /* Function bump_vector_ptr
4336 
4337    Increment a pointer (to a vector type) by vector-size. If requested,
4338    i.e. if PTR-INCR is given, then also connect the new increment stmt
4339    to the existing def-use update-chain of the pointer, by modifying
4340    the PTR_INCR as illustrated below:
4341 
4342    The pointer def-use update-chain before this function:
4343                         DATAREF_PTR = phi (p_0, p_2)
4344                         ....
4345         PTR_INCR:       p_2 = DATAREF_PTR + step
4346 
4347    The pointer def-use update-chain after this function:
4348                         DATAREF_PTR = phi (p_0, p_2)
4349                         ....
4350                         NEW_DATAREF_PTR = DATAREF_PTR + BUMP
4351                         ....
4352         PTR_INCR:       p_2 = NEW_DATAREF_PTR + step
4353 
4354    Input:
4355    DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
4356                  in the loop.
4357    PTR_INCR - optional. The stmt that updates the pointer in each iteration of
4358 	      the loop.  The increment amount across iterations is expected
4359 	      to be vector_size.
4360    BSI - location where the new update stmt is to be placed.
4361    STMT - the original scalar memory-access stmt that is being vectorized.
4362    BUMP - optional. The offset by which to bump the pointer. If not given,
4363 	  the offset is assumed to be vector_size.
4364 
4365    Output: Return NEW_DATAREF_PTR as illustrated above.
4366 
4367 */
4368 
4369 tree
4370 bump_vector_ptr (tree dataref_ptr, gimple ptr_incr, gimple_stmt_iterator *gsi,
4371 		 gimple stmt, tree bump)
4372 {
4373   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4374   struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4375   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4376   tree update = TYPE_SIZE_UNIT (vectype);
4377   gassign *incr_stmt;
4378   ssa_op_iter iter;
4379   use_operand_p use_p;
4380   tree new_dataref_ptr;
4381 
4382   if (bump)
4383     update = bump;
4384 
4385   new_dataref_ptr = copy_ssa_name (dataref_ptr);
4386   incr_stmt = gimple_build_assign (new_dataref_ptr, POINTER_PLUS_EXPR,
4387 				   dataref_ptr, update);
4388   vect_finish_stmt_generation (stmt, incr_stmt, gsi);
4389 
4390   /* Copy the points-to information if it exists. */
4391   if (DR_PTR_INFO (dr))
4392     {
4393       duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
4394       mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
4395     }
4396 
4397   if (!ptr_incr)
4398     return new_dataref_ptr;
4399 
4400   /* Update the vector-pointer's cross-iteration increment.  */
4401   FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
4402     {
4403       tree use = USE_FROM_PTR (use_p);
4404 
4405       if (use == dataref_ptr)
4406         SET_USE (use_p, new_dataref_ptr);
4407       else
4408         gcc_assert (tree_int_cst_compare (use, update) == 0);
4409     }
4410 
4411   return new_dataref_ptr;
4412 }
4413 
4414 
4415 /* Function vect_create_destination_var.
4416 
4417    Create a new temporary of type VECTYPE.  */
4418 
4419 tree
4420 vect_create_destination_var (tree scalar_dest, tree vectype)
4421 {
4422   tree vec_dest;
4423   const char *name;
4424   char *new_name;
4425   tree type;
4426   enum vect_var_kind kind;
4427 
4428   kind = vectype ? vect_simple_var : vect_scalar_var;
4429   type = vectype ? vectype : TREE_TYPE (scalar_dest);
4430 
4431   gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
4432 
4433   name = get_name (scalar_dest);
4434   if (name)
4435     new_name = xasprintf ("%s_%u", name, SSA_NAME_VERSION (scalar_dest));
4436   else
4437     new_name = xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest));
4438   vec_dest = vect_get_new_vect_var (type, kind, new_name);
4439   free (new_name);
4440 
4441   return vec_dest;
4442 }
4443 
4444 /* Function vect_grouped_store_supported.
4445 
4446    Returns TRUE if interleave high and interleave low permutations
4447    are supported, and FALSE otherwise.  */
4448 
4449 bool
4450 vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
4451 {
4452   machine_mode mode = TYPE_MODE (vectype);
4453 
4454   /* vect_permute_store_chain requires the group size to be equal to 3 or
4455      be a power of two.  */
4456   if (count != 3 && exact_log2 (count) == -1)
4457     {
4458       if (dump_enabled_p ())
4459 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4460 			 "the size of the group of accesses"
4461 			 " is not a power of 2 or not eqaul to 3\n");
4462       return false;
4463     }
4464 
4465   /* Check that the permutation is supported.  */
4466   if (VECTOR_MODE_P (mode))
4467     {
4468       unsigned int i, nelt = GET_MODE_NUNITS (mode);
4469       unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
4470 
4471       if (count == 3)
4472 	{
4473 	  unsigned int j0 = 0, j1 = 0, j2 = 0;
4474 	  unsigned int i, j;
4475 
4476 	  for (j = 0; j < 3; j++)
4477 	    {
4478 	      int nelt0 = ((3 - j) * nelt) % 3;
4479 	      int nelt1 = ((3 - j) * nelt + 1) % 3;
4480 	      int nelt2 = ((3 - j) * nelt + 2) % 3;
4481 	      for (i = 0; i < nelt; i++)
4482 		{
4483 		  if (3 * i + nelt0 < nelt)
4484 		    sel[3 * i + nelt0] = j0++;
4485 		  if (3 * i + nelt1 < nelt)
4486 		    sel[3 * i + nelt1] = nelt + j1++;
4487 		  if (3 * i + nelt2 < nelt)
4488 		    sel[3 * i + nelt2] = 0;
4489 		}
4490 	      if (!can_vec_perm_p (mode, false, sel))
4491 		{
4492 		  if (dump_enabled_p ())
4493 		    dump_printf (MSG_MISSED_OPTIMIZATION,
4494 				 "permutaion op not supported by target.\n");
4495 		  return false;
4496 		}
4497 
4498 	      for (i = 0; i < nelt; i++)
4499 		{
4500 		  if (3 * i + nelt0 < nelt)
4501 		    sel[3 * i + nelt0] = 3 * i + nelt0;
4502 		  if (3 * i + nelt1 < nelt)
4503 		    sel[3 * i + nelt1] = 3 * i + nelt1;
4504 		  if (3 * i + nelt2 < nelt)
4505 		    sel[3 * i + nelt2] = nelt + j2++;
4506 		}
4507 	      if (!can_vec_perm_p (mode, false, sel))
4508 		{
4509 		  if (dump_enabled_p ())
4510 		    dump_printf (MSG_MISSED_OPTIMIZATION,
4511 				 "permutaion op not supported by target.\n");
4512 		  return false;
4513 		}
4514 	    }
4515 	  return true;
4516 	}
4517       else
4518 	{
4519 	  /* If length is not equal to 3 then only power of 2 is supported.  */
4520 	  gcc_assert (exact_log2 (count) != -1);
4521 
4522 	  for (i = 0; i < nelt / 2; i++)
4523 	    {
4524 	      sel[i * 2] = i;
4525 	      sel[i * 2 + 1] = i + nelt;
4526 	    }
4527 	    if (can_vec_perm_p (mode, false, sel))
4528 	      {
4529 		for (i = 0; i < nelt; i++)
4530 		  sel[i] += nelt / 2;
4531 		if (can_vec_perm_p (mode, false, sel))
4532 		  return true;
4533 	      }
4534 	}
4535     }
4536 
4537   if (dump_enabled_p ())
4538     dump_printf (MSG_MISSED_OPTIMIZATION,
4539 		 "permutaion op not supported by target.\n");
4540   return false;
4541 }
4542 
4543 
4544 /* Return TRUE if vec_store_lanes is available for COUNT vectors of
4545    type VECTYPE.  */
4546 
4547 bool
4548 vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
4549 {
4550   return vect_lanes_optab_supported_p ("vec_store_lanes",
4551 				       vec_store_lanes_optab,
4552 				       vectype, count);
4553 }
4554 
4555 
4556 /* Function vect_permute_store_chain.
4557 
4558    Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
4559    a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
4560    the data correctly for the stores.  Return the final references for stores
4561    in RESULT_CHAIN.
4562 
4563    E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
4564    The input is 4 vectors each containing 8 elements.  We assign a number to
4565    each element, the input sequence is:
4566 
4567    1st vec:   0  1  2  3  4  5  6  7
4568    2nd vec:   8  9 10 11 12 13 14 15
4569    3rd vec:  16 17 18 19 20 21 22 23
4570    4th vec:  24 25 26 27 28 29 30 31
4571 
4572    The output sequence should be:
4573 
4574    1st vec:  0  8 16 24  1  9 17 25
4575    2nd vec:  2 10 18 26  3 11 19 27
4576    3rd vec:  4 12 20 28  5 13 21 30
4577    4th vec:  6 14 22 30  7 15 23 31
4578 
4579    i.e., we interleave the contents of the four vectors in their order.
4580 
4581    We use interleave_high/low instructions to create such output.  The input of
4582    each interleave_high/low operation is two vectors:
4583    1st vec    2nd vec
4584    0 1 2 3    4 5 6 7
4585    the even elements of the result vector are obtained left-to-right from the
4586    high/low elements of the first vector.  The odd elements of the result are
4587    obtained left-to-right from the high/low elements of the second vector.
4588    The output of interleave_high will be:   0 4 1 5
4589    and of interleave_low:                   2 6 3 7
4590 
4591 
4592    The permutation is done in log LENGTH stages.  In each stage interleave_high
4593    and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
4594    where the first argument is taken from the first half of DR_CHAIN and the
4595    second argument from it's second half.
4596    In our example,
4597 
4598    I1: interleave_high (1st vec, 3rd vec)
4599    I2: interleave_low (1st vec, 3rd vec)
4600    I3: interleave_high (2nd vec, 4th vec)
4601    I4: interleave_low (2nd vec, 4th vec)
4602 
4603    The output for the first stage is:
4604 
4605    I1:  0 16  1 17  2 18  3 19
4606    I2:  4 20  5 21  6 22  7 23
4607    I3:  8 24  9 25 10 26 11 27
4608    I4: 12 28 13 29 14 30 15 31
4609 
4610    The output of the second stage, i.e. the final result is:
4611 
4612    I1:  0  8 16 24  1  9 17 25
4613    I2:  2 10 18 26  3 11 19 27
4614    I3:  4 12 20 28  5 13 21 30
4615    I4:  6 14 22 30  7 15 23 31.  */
4616 
4617 void
4618 vect_permute_store_chain (vec<tree> dr_chain,
4619 			  unsigned int length,
4620 			  gimple stmt,
4621 			  gimple_stmt_iterator *gsi,
4622 			  vec<tree> *result_chain)
4623 {
4624   tree vect1, vect2, high, low;
4625   gimple perm_stmt;
4626   tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
4627   tree perm_mask_low, perm_mask_high;
4628   tree data_ref;
4629   tree perm3_mask_low, perm3_mask_high;
4630   unsigned int i, n, log_length = exact_log2 (length);
4631   unsigned int j, nelt = TYPE_VECTOR_SUBPARTS (vectype);
4632   unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
4633 
4634   result_chain->quick_grow (length);
4635   memcpy (result_chain->address (), dr_chain.address (),
4636 	  length * sizeof (tree));
4637 
4638   if (length == 3)
4639     {
4640       unsigned int j0 = 0, j1 = 0, j2 = 0;
4641 
4642       for (j = 0; j < 3; j++)
4643         {
4644 	  int nelt0 = ((3 - j) * nelt) % 3;
4645 	  int nelt1 = ((3 - j) * nelt + 1) % 3;
4646 	  int nelt2 = ((3 - j) * nelt + 2) % 3;
4647 
4648 	  for (i = 0; i < nelt; i++)
4649 	    {
4650 	      if (3 * i + nelt0 < nelt)
4651 		sel[3 * i + nelt0] = j0++;
4652 	      if (3 * i + nelt1 < nelt)
4653 		sel[3 * i + nelt1] = nelt + j1++;
4654 	      if (3 * i + nelt2 < nelt)
4655 		sel[3 * i + nelt2] = 0;
4656 	    }
4657 	  perm3_mask_low = vect_gen_perm_mask_checked (vectype, sel);
4658 
4659 	  for (i = 0; i < nelt; i++)
4660 	    {
4661 	      if (3 * i + nelt0 < nelt)
4662 		sel[3 * i + nelt0] = 3 * i + nelt0;
4663 	      if (3 * i + nelt1 < nelt)
4664 		sel[3 * i + nelt1] = 3 * i + nelt1;
4665 	      if (3 * i + nelt2 < nelt)
4666 		sel[3 * i + nelt2] = nelt + j2++;
4667 	    }
4668 	  perm3_mask_high = vect_gen_perm_mask_checked (vectype, sel);
4669 
4670 	  vect1 = dr_chain[0];
4671 	  vect2 = dr_chain[1];
4672 
4673 	  /* Create interleaving stmt:
4674 	     low = VEC_PERM_EXPR <vect1, vect2,
4675 				  {j, nelt, *, j + 1, nelt + j + 1, *,
4676 				   j + 2, nelt + j + 2, *, ...}>  */
4677 	  data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
4678 	  perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
4679 					   vect2, perm3_mask_low);
4680 	  vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4681 
4682 	  vect1 = data_ref;
4683 	  vect2 = dr_chain[2];
4684 	  /* Create interleaving stmt:
4685 	     low = VEC_PERM_EXPR <vect1, vect2,
4686 				  {0, 1, nelt + j, 3, 4, nelt + j + 1,
4687 				   6, 7, nelt + j + 2, ...}>  */
4688 	  data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
4689 	  perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
4690 					   vect2, perm3_mask_high);
4691 	  vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4692 	  (*result_chain)[j] = data_ref;
4693 	}
4694     }
4695   else
4696     {
4697       /* If length is not equal to 3 then only power of 2 is supported.  */
4698       gcc_assert (exact_log2 (length) != -1);
4699 
4700       for (i = 0, n = nelt / 2; i < n; i++)
4701 	{
4702 	  sel[i * 2] = i;
4703 	  sel[i * 2 + 1] = i + nelt;
4704 	}
4705 	perm_mask_high = vect_gen_perm_mask_checked (vectype, sel);
4706 
4707 	for (i = 0; i < nelt; i++)
4708 	  sel[i] += nelt / 2;
4709 	perm_mask_low = vect_gen_perm_mask_checked (vectype, sel);
4710 
4711 	for (i = 0, n = log_length; i < n; i++)
4712 	  {
4713 	    for (j = 0; j < length/2; j++)
4714 	      {
4715 		vect1 = dr_chain[j];
4716 		vect2 = dr_chain[j+length/2];
4717 
4718 		/* Create interleaving stmt:
4719 		   high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
4720 							...}>  */
4721 		high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
4722 		perm_stmt = gimple_build_assign (high, VEC_PERM_EXPR, vect1,
4723 						 vect2, perm_mask_high);
4724 		vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4725 		(*result_chain)[2*j] = high;
4726 
4727 		/* Create interleaving stmt:
4728 		   low = VEC_PERM_EXPR <vect1, vect2,
4729 					{nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
4730 					 ...}>  */
4731 		low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
4732 		perm_stmt = gimple_build_assign (low, VEC_PERM_EXPR, vect1,
4733 						 vect2, perm_mask_low);
4734 		vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4735 		(*result_chain)[2*j+1] = low;
4736 	      }
4737 	    memcpy (dr_chain.address (), result_chain->address (),
4738 		    length * sizeof (tree));
4739 	  }
4740     }
4741 }
4742 
4743 /* Function vect_setup_realignment
4744 
4745    This function is called when vectorizing an unaligned load using
4746    the dr_explicit_realign[_optimized] scheme.
4747    This function generates the following code at the loop prolog:
4748 
4749       p = initial_addr;
4750    x  msq_init = *(floor(p));   # prolog load
4751       realignment_token = call target_builtin;
4752     loop:
4753    x  msq = phi (msq_init, ---)
4754 
4755    The stmts marked with x are generated only for the case of
4756    dr_explicit_realign_optimized.
4757 
4758    The code above sets up a new (vector) pointer, pointing to the first
4759    location accessed by STMT, and a "floor-aligned" load using that pointer.
4760    It also generates code to compute the "realignment-token" (if the relevant
4761    target hook was defined), and creates a phi-node at the loop-header bb
4762    whose arguments are the result of the prolog-load (created by this
4763    function) and the result of a load that takes place in the loop (to be
4764    created by the caller to this function).
4765 
4766    For the case of dr_explicit_realign_optimized:
4767    The caller to this function uses the phi-result (msq) to create the
4768    realignment code inside the loop, and sets up the missing phi argument,
4769    as follows:
4770     loop:
4771       msq = phi (msq_init, lsq)
4772       lsq = *(floor(p'));        # load in loop
4773       result = realign_load (msq, lsq, realignment_token);
4774 
4775    For the case of dr_explicit_realign:
4776     loop:
4777       msq = *(floor(p)); 	# load in loop
4778       p' = p + (VS-1);
4779       lsq = *(floor(p'));	# load in loop
4780       result = realign_load (msq, lsq, realignment_token);
4781 
4782    Input:
4783    STMT - (scalar) load stmt to be vectorized. This load accesses
4784           a memory location that may be unaligned.
4785    BSI - place where new code is to be inserted.
4786    ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
4787 			      is used.
4788 
4789    Output:
4790    REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
4791                        target hook, if defined.
4792    Return value - the result of the loop-header phi node.  */
4793 
4794 tree
4795 vect_setup_realignment (gimple stmt, gimple_stmt_iterator *gsi,
4796                         tree *realignment_token,
4797 			enum dr_alignment_support alignment_support_scheme,
4798 			tree init_addr,
4799 			struct loop **at_loop)
4800 {
4801   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4802   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4803   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4804   struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4805   struct loop *loop = NULL;
4806   edge pe = NULL;
4807   tree scalar_dest = gimple_assign_lhs (stmt);
4808   tree vec_dest;
4809   gimple inc;
4810   tree ptr;
4811   tree data_ref;
4812   basic_block new_bb;
4813   tree msq_init = NULL_TREE;
4814   tree new_temp;
4815   gphi *phi_stmt;
4816   tree msq = NULL_TREE;
4817   gimple_seq stmts = NULL;
4818   bool inv_p;
4819   bool compute_in_loop = false;
4820   bool nested_in_vect_loop = false;
4821   struct loop *containing_loop = (gimple_bb (stmt))->loop_father;
4822   struct loop *loop_for_initial_load = NULL;
4823 
4824   if (loop_vinfo)
4825     {
4826       loop = LOOP_VINFO_LOOP (loop_vinfo);
4827       nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
4828     }
4829 
4830   gcc_assert (alignment_support_scheme == dr_explicit_realign
4831 	      || alignment_support_scheme == dr_explicit_realign_optimized);
4832 
4833   /* We need to generate three things:
4834      1. the misalignment computation
4835      2. the extra vector load (for the optimized realignment scheme).
4836      3. the phi node for the two vectors from which the realignment is
4837       done (for the optimized realignment scheme).  */
4838 
4839   /* 1. Determine where to generate the misalignment computation.
4840 
4841      If INIT_ADDR is NULL_TREE, this indicates that the misalignment
4842      calculation will be generated by this function, outside the loop (in the
4843      preheader).  Otherwise, INIT_ADDR had already been computed for us by the
4844      caller, inside the loop.
4845 
4846      Background: If the misalignment remains fixed throughout the iterations of
4847      the loop, then both realignment schemes are applicable, and also the
4848      misalignment computation can be done outside LOOP.  This is because we are
4849      vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
4850      are a multiple of VS (the Vector Size), and therefore the misalignment in
4851      different vectorized LOOP iterations is always the same.
4852      The problem arises only if the memory access is in an inner-loop nested
4853      inside LOOP, which is now being vectorized using outer-loop vectorization.
4854      This is the only case when the misalignment of the memory access may not
4855      remain fixed throughout the iterations of the inner-loop (as explained in
4856      detail in vect_supportable_dr_alignment).  In this case, not only is the
4857      optimized realignment scheme not applicable, but also the misalignment
4858      computation (and generation of the realignment token that is passed to
4859      REALIGN_LOAD) have to be done inside the loop.
4860 
4861      In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
4862      or not, which in turn determines if the misalignment is computed inside
4863      the inner-loop, or outside LOOP.  */
4864 
4865   if (init_addr != NULL_TREE || !loop_vinfo)
4866     {
4867       compute_in_loop = true;
4868       gcc_assert (alignment_support_scheme == dr_explicit_realign);
4869     }
4870 
4871 
4872   /* 2. Determine where to generate the extra vector load.
4873 
4874      For the optimized realignment scheme, instead of generating two vector
4875      loads in each iteration, we generate a single extra vector load in the
4876      preheader of the loop, and in each iteration reuse the result of the
4877      vector load from the previous iteration.  In case the memory access is in
4878      an inner-loop nested inside LOOP, which is now being vectorized using
4879      outer-loop vectorization, we need to determine whether this initial vector
4880      load should be generated at the preheader of the inner-loop, or can be
4881      generated at the preheader of LOOP.  If the memory access has no evolution
4882      in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
4883      to be generated inside LOOP (in the preheader of the inner-loop).  */
4884 
4885   if (nested_in_vect_loop)
4886     {
4887       tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
4888       bool invariant_in_outerloop =
4889             (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
4890       loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
4891     }
4892   else
4893     loop_for_initial_load = loop;
4894   if (at_loop)
4895     *at_loop = loop_for_initial_load;
4896 
4897   if (loop_for_initial_load)
4898     pe = loop_preheader_edge (loop_for_initial_load);
4899 
4900   /* 3. For the case of the optimized realignment, create the first vector
4901       load at the loop preheader.  */
4902 
4903   if (alignment_support_scheme == dr_explicit_realign_optimized)
4904     {
4905       /* Create msq_init = *(floor(p1)) in the loop preheader  */
4906       gassign *new_stmt;
4907 
4908       gcc_assert (!compute_in_loop);
4909       vec_dest = vect_create_destination_var (scalar_dest, vectype);
4910       ptr = vect_create_data_ref_ptr (stmt, vectype, loop_for_initial_load,
4911 				      NULL_TREE, &init_addr, NULL, &inc,
4912 				      true, &inv_p);
4913       new_temp = copy_ssa_name (ptr);
4914       new_stmt = gimple_build_assign
4915 		   (new_temp, BIT_AND_EXPR, ptr,
4916 		    build_int_cst (TREE_TYPE (ptr),
4917 				   -(HOST_WIDE_INT)TYPE_ALIGN_UNIT (vectype)));
4918       new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
4919       gcc_assert (!new_bb);
4920       data_ref
4921 	= build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
4922 		  build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
4923       new_stmt = gimple_build_assign (vec_dest, data_ref);
4924       new_temp = make_ssa_name (vec_dest, new_stmt);
4925       gimple_assign_set_lhs (new_stmt, new_temp);
4926       if (pe)
4927         {
4928           new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
4929           gcc_assert (!new_bb);
4930         }
4931       else
4932          gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
4933 
4934       msq_init = gimple_assign_lhs (new_stmt);
4935     }
4936 
4937   /* 4. Create realignment token using a target builtin, if available.
4938       It is done either inside the containing loop, or before LOOP (as
4939       determined above).  */
4940 
4941   if (targetm.vectorize.builtin_mask_for_load)
4942     {
4943       gcall *new_stmt;
4944       tree builtin_decl;
4945 
4946       /* Compute INIT_ADDR - the initial addressed accessed by this memref.  */
4947       if (!init_addr)
4948 	{
4949 	  /* Generate the INIT_ADDR computation outside LOOP.  */
4950 	  init_addr = vect_create_addr_base_for_vector_ref (stmt, &stmts,
4951 							NULL_TREE, loop);
4952           if (loop)
4953             {
4954    	      pe = loop_preheader_edge (loop);
4955 	      new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
4956 	      gcc_assert (!new_bb);
4957             }
4958           else
4959              gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
4960 	}
4961 
4962       builtin_decl = targetm.vectorize.builtin_mask_for_load ();
4963       new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
4964       vec_dest =
4965 	vect_create_destination_var (scalar_dest,
4966 				     gimple_call_return_type (new_stmt));
4967       new_temp = make_ssa_name (vec_dest, new_stmt);
4968       gimple_call_set_lhs (new_stmt, new_temp);
4969 
4970       if (compute_in_loop)
4971 	gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
4972       else
4973 	{
4974 	  /* Generate the misalignment computation outside LOOP.  */
4975 	  pe = loop_preheader_edge (loop);
4976 	  new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
4977 	  gcc_assert (!new_bb);
4978 	}
4979 
4980       *realignment_token = gimple_call_lhs (new_stmt);
4981 
4982       /* The result of the CALL_EXPR to this builtin is determined from
4983          the value of the parameter and no global variables are touched
4984          which makes the builtin a "const" function.  Requiring the
4985          builtin to have the "const" attribute makes it unnecessary
4986          to call mark_call_clobbered.  */
4987       gcc_assert (TREE_READONLY (builtin_decl));
4988     }
4989 
4990   if (alignment_support_scheme == dr_explicit_realign)
4991     return msq;
4992 
4993   gcc_assert (!compute_in_loop);
4994   gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
4995 
4996 
4997   /* 5. Create msq = phi <msq_init, lsq> in loop  */
4998 
4999   pe = loop_preheader_edge (containing_loop);
5000   vec_dest = vect_create_destination_var (scalar_dest, vectype);
5001   msq = make_ssa_name (vec_dest);
5002   phi_stmt = create_phi_node (msq, containing_loop->header);
5003   add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);
5004 
5005   return msq;
5006 }
5007 
5008 
5009 /* Function vect_grouped_load_supported.
5010 
5011    Returns TRUE if even and odd permutations are supported,
5012    and FALSE otherwise.  */
5013 
5014 bool
5015 vect_grouped_load_supported (tree vectype, unsigned HOST_WIDE_INT count)
5016 {
5017   machine_mode mode = TYPE_MODE (vectype);
5018 
5019   /* vect_permute_load_chain requires the group size to be equal to 3 or
5020      be a power of two.  */
5021   if (count != 3 && exact_log2 (count) == -1)
5022     {
5023       if (dump_enabled_p ())
5024 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5025 			 "the size of the group of accesses"
5026 			 " is not a power of 2 or not equal to 3\n");
5027       return false;
5028     }
5029 
5030   /* Check that the permutation is supported.  */
5031   if (VECTOR_MODE_P (mode))
5032     {
5033       unsigned int i, j, nelt = GET_MODE_NUNITS (mode);
5034       unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
5035 
5036       if (count == 3)
5037 	{
5038 	  unsigned int k;
5039 	  for (k = 0; k < 3; k++)
5040 	    {
5041 	      for (i = 0; i < nelt; i++)
5042 		if (3 * i + k < 2 * nelt)
5043 		  sel[i] = 3 * i + k;
5044 		else
5045 		  sel[i] = 0;
5046 	      if (!can_vec_perm_p (mode, false, sel))
5047 		{
5048 		  if (dump_enabled_p ())
5049 		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5050 				     "shuffle of 3 loads is not supported by"
5051 				     " target\n");
5052 		    return false;
5053 		}
5054 	      for (i = 0, j = 0; i < nelt; i++)
5055 		if (3 * i + k < 2 * nelt)
5056 		  sel[i] = i;
5057 		else
5058 		  sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5059 	      if (!can_vec_perm_p (mode, false, sel))
5060 		{
5061 		  if (dump_enabled_p ())
5062 		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5063 				     "shuffle of 3 loads is not supported by"
5064 				     " target\n");
5065 		  return false;
5066 		}
5067 	    }
5068 	  return true;
5069 	}
5070       else
5071 	{
5072 	  /* If length is not equal to 3 then only power of 2 is supported.  */
5073 	  gcc_assert (exact_log2 (count) != -1);
5074 	  for (i = 0; i < nelt; i++)
5075 	    sel[i] = i * 2;
5076 	  if (can_vec_perm_p (mode, false, sel))
5077 	    {
5078 	      for (i = 0; i < nelt; i++)
5079 		sel[i] = i * 2 + 1;
5080 	      if (can_vec_perm_p (mode, false, sel))
5081 		return true;
5082 	    }
5083         }
5084     }
5085 
5086   if (dump_enabled_p ())
5087     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5088 		     "extract even/odd not supported by target\n");
5089   return false;
5090 }
5091 
5092 /* Return TRUE if vec_load_lanes is available for COUNT vectors of
5093    type VECTYPE.  */
5094 
5095 bool
5096 vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
5097 {
5098   return vect_lanes_optab_supported_p ("vec_load_lanes",
5099 				       vec_load_lanes_optab,
5100 				       vectype, count);
5101 }
5102 
5103 /* Function vect_permute_load_chain.
5104 
5105    Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
5106    a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
5107    the input data correctly.  Return the final references for loads in
5108    RESULT_CHAIN.
5109 
5110    E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5111    The input is 4 vectors each containing 8 elements. We assign a number to each
5112    element, the input sequence is:
5113 
5114    1st vec:   0  1  2  3  4  5  6  7
5115    2nd vec:   8  9 10 11 12 13 14 15
5116    3rd vec:  16 17 18 19 20 21 22 23
5117    4th vec:  24 25 26 27 28 29 30 31
5118 
5119    The output sequence should be:
5120 
5121    1st vec:  0 4  8 12 16 20 24 28
5122    2nd vec:  1 5  9 13 17 21 25 29
5123    3rd vec:  2 6 10 14 18 22 26 30
5124    4th vec:  3 7 11 15 19 23 27 31
5125 
5126    i.e., the first output vector should contain the first elements of each
5127    interleaving group, etc.
5128 
5129    We use extract_even/odd instructions to create such output.  The input of
5130    each extract_even/odd operation is two vectors
5131    1st vec    2nd vec
5132    0 1 2 3    4 5 6 7
5133 
5134    and the output is the vector of extracted even/odd elements.  The output of
5135    extract_even will be:   0 2 4 6
5136    and of extract_odd:     1 3 5 7
5137 
5138 
5139    The permutation is done in log LENGTH stages.  In each stage extract_even
5140    and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
5141    their order.  In our example,
5142 
5143    E1: extract_even (1st vec, 2nd vec)
5144    E2: extract_odd (1st vec, 2nd vec)
5145    E3: extract_even (3rd vec, 4th vec)
5146    E4: extract_odd (3rd vec, 4th vec)
5147 
5148    The output for the first stage will be:
5149 
5150    E1:  0  2  4  6  8 10 12 14
5151    E2:  1  3  5  7  9 11 13 15
5152    E3: 16 18 20 22 24 26 28 30
5153    E4: 17 19 21 23 25 27 29 31
5154 
5155    In order to proceed and create the correct sequence for the next stage (or
5156    for the correct output, if the second stage is the last one, as in our
5157    example), we first put the output of extract_even operation and then the
5158    output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
5159    The input for the second stage is:
5160 
5161    1st vec (E1):  0  2  4  6  8 10 12 14
5162    2nd vec (E3): 16 18 20 22 24 26 28 30
5163    3rd vec (E2):  1  3  5  7  9 11 13 15
5164    4th vec (E4): 17 19 21 23 25 27 29 31
5165 
5166    The output of the second stage:
5167 
5168    E1: 0 4  8 12 16 20 24 28
5169    E2: 2 6 10 14 18 22 26 30
5170    E3: 1 5  9 13 17 21 25 29
5171    E4: 3 7 11 15 19 23 27 31
5172 
5173    And RESULT_CHAIN after reordering:
5174 
5175    1st vec (E1):  0 4  8 12 16 20 24 28
5176    2nd vec (E3):  1 5  9 13 17 21 25 29
5177    3rd vec (E2):  2 6 10 14 18 22 26 30
5178    4th vec (E4):  3 7 11 15 19 23 27 31.  */
5179 
5180 static void
5181 vect_permute_load_chain (vec<tree> dr_chain,
5182 			 unsigned int length,
5183 			 gimple stmt,
5184 			 gimple_stmt_iterator *gsi,
5185 			 vec<tree> *result_chain)
5186 {
5187   tree data_ref, first_vect, second_vect;
5188   tree perm_mask_even, perm_mask_odd;
5189   tree perm3_mask_low, perm3_mask_high;
5190   gimple perm_stmt;
5191   tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5192   unsigned int i, j, log_length = exact_log2 (length);
5193   unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype);
5194   unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
5195 
5196   result_chain->quick_grow (length);
5197   memcpy (result_chain->address (), dr_chain.address (),
5198 	  length * sizeof (tree));
5199 
5200   if (length == 3)
5201     {
5202       unsigned int k;
5203 
5204       for (k = 0; k < 3; k++)
5205 	{
5206 	  for (i = 0; i < nelt; i++)
5207 	    if (3 * i + k < 2 * nelt)
5208 	      sel[i] = 3 * i + k;
5209 	    else
5210 	      sel[i] = 0;
5211 	  perm3_mask_low = vect_gen_perm_mask_checked (vectype, sel);
5212 
5213 	  for (i = 0, j = 0; i < nelt; i++)
5214 	    if (3 * i + k < 2 * nelt)
5215 	      sel[i] = i;
5216 	    else
5217 	      sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5218 
5219 	  perm3_mask_high = vect_gen_perm_mask_checked (vectype, sel);
5220 
5221 	  first_vect = dr_chain[0];
5222 	  second_vect = dr_chain[1];
5223 
5224 	  /* Create interleaving stmt (low part of):
5225 	     low = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
5226 							     ...}>  */
5227 	  data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5228 	  perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5229 					   second_vect, perm3_mask_low);
5230 	  vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5231 
5232 	  /* Create interleaving stmt (high part of):
5233 	     high = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
5234 							      ...}>  */
5235 	  first_vect = data_ref;
5236 	  second_vect = dr_chain[2];
5237 	  data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5238 	  perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5239 					   second_vect, perm3_mask_high);
5240 	  vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5241 	  (*result_chain)[k] = data_ref;
5242 	}
5243     }
5244   else
5245     {
5246       /* If length is not equal to 3 then only power of 2 is supported.  */
5247       gcc_assert (exact_log2 (length) != -1);
5248 
5249       for (i = 0; i < nelt; ++i)
5250 	sel[i] = i * 2;
5251       perm_mask_even = vect_gen_perm_mask_checked (vectype, sel);
5252 
5253       for (i = 0; i < nelt; ++i)
5254 	sel[i] = i * 2 + 1;
5255       perm_mask_odd = vect_gen_perm_mask_checked (vectype, sel);
5256 
5257       for (i = 0; i < log_length; i++)
5258 	{
5259 	  for (j = 0; j < length; j += 2)
5260 	    {
5261 	      first_vect = dr_chain[j];
5262 	      second_vect = dr_chain[j+1];
5263 
5264 	      /* data_ref = permute_even (first_data_ref, second_data_ref);  */
5265 	      data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
5266 	      perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5267 					       first_vect, second_vect,
5268 					       perm_mask_even);
5269 	      vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5270 	      (*result_chain)[j/2] = data_ref;
5271 
5272 	      /* data_ref = permute_odd (first_data_ref, second_data_ref);  */
5273 	      data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
5274 	      perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5275 					       first_vect, second_vect,
5276 					       perm_mask_odd);
5277 	      vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5278 	      (*result_chain)[j/2+length/2] = data_ref;
5279 	    }
5280 	  memcpy (dr_chain.address (), result_chain->address (),
5281 		  length * sizeof (tree));
5282 	}
5283     }
5284 }
5285 
5286 /* Function vect_shift_permute_load_chain.
5287 
5288    Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate
5289    sequence of stmts to reorder the input data accordingly.
5290    Return the final references for loads in RESULT_CHAIN.
5291    Return true if successed, false otherwise.
5292 
5293    E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
5294    The input is 3 vectors each containing 8 elements.  We assign a
5295    number to each element, the input sequence is:
5296 
5297    1st vec:   0  1  2  3  4  5  6  7
5298    2nd vec:   8  9 10 11 12 13 14 15
5299    3rd vec:  16 17 18 19 20 21 22 23
5300 
5301    The output sequence should be:
5302 
5303    1st vec:  0 3 6  9 12 15 18 21
5304    2nd vec:  1 4 7 10 13 16 19 22
5305    3rd vec:  2 5 8 11 14 17 20 23
5306 
5307    We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.
5308 
5309    First we shuffle all 3 vectors to get correct elements order:
5310 
5311    1st vec:  ( 0  3  6) ( 1  4  7) ( 2  5)
5312    2nd vec:  ( 8 11 14) ( 9 12 15) (10 13)
5313    3rd vec:  (16 19 22) (17 20 23) (18 21)
5314 
5315    Next we unite and shift vector 3 times:
5316 
5317    1st step:
5318      shift right by 6 the concatenation of:
5319      "1st vec" and  "2nd vec"
5320        ( 0  3  6) ( 1  4  7) |( 2  5) _ ( 8 11 14) ( 9 12 15)| (10 13)
5321      "2nd vec" and  "3rd vec"
5322        ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
5323      "3rd vec" and  "1st vec"
5324        (16 19 22) (17 20 23) |(18 21) _ ( 0  3  6) ( 1  4  7)| ( 2  5)
5325 			     | New vectors                   |
5326 
5327      So that now new vectors are:
5328 
5329      1st vec:  ( 2  5) ( 8 11 14) ( 9 12 15)
5330      2nd vec:  (10 13) (16 19 22) (17 20 23)
5331      3rd vec:  (18 21) ( 0  3  6) ( 1  4  7)
5332 
5333    2nd step:
5334      shift right by 5 the concatenation of:
5335      "1st vec" and  "3rd vec"
5336        ( 2  5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0  3  6)| ( 1  4  7)
5337      "2nd vec" and  "1st vec"
5338        (10 13) (16 19 22) |(17 20 23) _ ( 2  5) ( 8 11 14)| ( 9 12 15)
5339      "3rd vec" and  "2nd vec"
5340        (18 21) ( 0  3  6) |( 1  4  7) _ (10 13) (16 19 22)| (17 20 23)
5341 			  | New vectors                   |
5342 
5343      So that now new vectors are:
5344 
5345      1st vec:  ( 9 12 15) (18 21) ( 0  3  6)
5346      2nd vec:  (17 20 23) ( 2  5) ( 8 11 14)
5347      3rd vec:  ( 1  4  7) (10 13) (16 19 22) READY
5348 
5349    3rd step:
5350      shift right by 5 the concatenation of:
5351      "1st vec" and  "1st vec"
5352        ( 9 12 15) (18 21) |( 0  3  6) _ ( 9 12 15) (18 21)| ( 0  3  6)
5353      shift right by 3 the concatenation of:
5354      "2nd vec" and  "2nd vec"
5355                (17 20 23) |( 2  5) ( 8 11 14) _ (17 20 23)| ( 2  5) ( 8 11 14)
5356 			  | New vectors                   |
5357 
5358      So that now all vectors are READY:
5359      1st vec:  ( 0  3  6) ( 9 12 15) (18 21)
5360      2nd vec:  ( 2  5) ( 8 11 14) (17 20 23)
5361      3rd vec:  ( 1  4  7) (10 13) (16 19 22)
5362 
5363    This algorithm is faster than one in vect_permute_load_chain if:
5364      1.  "shift of a concatination" is faster than general permutation.
5365 	 This is usually so.
5366      2.  The TARGET machine can't execute vector instructions in parallel.
5367 	 This is because each step of the algorithm depends on previous.
5368 	 The algorithm in vect_permute_load_chain is much more parallel.
5369 
5370    The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.
5371 */
5372 
5373 static bool
5374 vect_shift_permute_load_chain (vec<tree> dr_chain,
5375 			       unsigned int length,
5376 			       gimple stmt,
5377 			       gimple_stmt_iterator *gsi,
5378 			       vec<tree> *result_chain)
5379 {
5380   tree vect[3], vect_shift[3], data_ref, first_vect, second_vect;
5381   tree perm2_mask1, perm2_mask2, perm3_mask;
5382   tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask;
5383   gimple perm_stmt;
5384 
5385   tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5386   unsigned int i;
5387   unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype);
5388   unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
5389   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5390   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5391 
5392   result_chain->quick_grow (length);
5393   memcpy (result_chain->address (), dr_chain.address (),
5394 	  length * sizeof (tree));
5395 
5396   if (exact_log2 (length) != -1 && LOOP_VINFO_VECT_FACTOR (loop_vinfo) > 4)
5397     {
5398       unsigned int j, log_length = exact_log2 (length);
5399       for (i = 0; i < nelt / 2; ++i)
5400 	sel[i] = i * 2;
5401       for (i = 0; i < nelt / 2; ++i)
5402 	sel[nelt / 2 + i] = i * 2 + 1;
5403       if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5404 	{
5405 	  if (dump_enabled_p ())
5406 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5407 			     "shuffle of 2 fields structure is not \
5408 			      supported by target\n");
5409 	  return false;
5410 	}
5411       perm2_mask1 = vect_gen_perm_mask_checked (vectype, sel);
5412 
5413       for (i = 0; i < nelt / 2; ++i)
5414 	sel[i] = i * 2 + 1;
5415       for (i = 0; i < nelt / 2; ++i)
5416 	sel[nelt / 2 + i] = i * 2;
5417       if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5418 	{
5419 	  if (dump_enabled_p ())
5420 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5421 			     "shuffle of 2 fields structure is not \
5422 			      supported by target\n");
5423 	  return false;
5424 	}
5425       perm2_mask2 = vect_gen_perm_mask_checked (vectype, sel);
5426 
5427       /* Generating permutation constant to shift all elements.
5428 	 For vector length 8 it is {4 5 6 7 8 9 10 11}.  */
5429       for (i = 0; i < nelt; i++)
5430 	sel[i] = nelt / 2 + i;
5431       if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5432 	{
5433 	  if (dump_enabled_p ())
5434 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5435 			     "shift permutation is not supported by target\n");
5436 	  return false;
5437 	}
5438       shift1_mask = vect_gen_perm_mask_checked (vectype, sel);
5439 
5440       /* Generating permutation constant to select vector from 2.
5441 	 For vector length 8 it is {0 1 2 3 12 13 14 15}.  */
5442       for (i = 0; i < nelt / 2; i++)
5443 	sel[i] = i;
5444       for (i = nelt / 2; i < nelt; i++)
5445 	sel[i] = nelt + i;
5446       if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5447 	{
5448 	  if (dump_enabled_p ())
5449 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5450 			     "select is not supported by target\n");
5451 	  return false;
5452 	}
5453       select_mask = vect_gen_perm_mask_checked (vectype, sel);
5454 
5455       for (i = 0; i < log_length; i++)
5456 	{
5457 	  for (j = 0; j < length; j += 2)
5458 	    {
5459 	      first_vect = dr_chain[j];
5460 	      second_vect = dr_chain[j + 1];
5461 
5462 	      data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
5463 	      perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5464 					       first_vect, first_vect,
5465 					       perm2_mask1);
5466 	      vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5467 	      vect[0] = data_ref;
5468 
5469 	      data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
5470 	      perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5471 					       second_vect, second_vect,
5472 					       perm2_mask2);
5473 	      vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5474 	      vect[1] = data_ref;
5475 
5476 	      data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift");
5477 	      perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5478 					       vect[0], vect[1], shift1_mask);
5479 	      vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5480 	      (*result_chain)[j/2 + length/2] = data_ref;
5481 
5482 	      data_ref = make_temp_ssa_name (vectype, NULL, "vect_select");
5483 	      perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5484 					       vect[0], vect[1], select_mask);
5485 	      vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5486 	      (*result_chain)[j/2] = data_ref;
5487 	    }
5488 	  memcpy (dr_chain.address (), result_chain->address (),
5489 		  length * sizeof (tree));
5490 	}
5491       return true;
5492     }
5493   if (length == 3 && LOOP_VINFO_VECT_FACTOR (loop_vinfo) > 2)
5494     {
5495       unsigned int k = 0, l = 0;
5496 
5497       /* Generating permutation constant to get all elements in rigth order.
5498 	 For vector length 8 it is {0 3 6 1 4 7 2 5}.  */
5499       for (i = 0; i < nelt; i++)
5500 	{
5501 	  if (3 * k + (l % 3) >= nelt)
5502 	    {
5503 	      k = 0;
5504 	      l += (3 - (nelt % 3));
5505 	    }
5506 	  sel[i] = 3 * k + (l % 3);
5507 	  k++;
5508 	}
5509       if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5510 	{
5511 	  if (dump_enabled_p ())
5512 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5513 			     "shuffle of 3 fields structure is not \
5514 			      supported by target\n");
5515 	  return false;
5516 	}
5517       perm3_mask = vect_gen_perm_mask_checked (vectype, sel);
5518 
5519       /* Generating permutation constant to shift all elements.
5520 	 For vector length 8 it is {6 7 8 9 10 11 12 13}.  */
5521       for (i = 0; i < nelt; i++)
5522 	sel[i] = 2 * (nelt / 3) + (nelt % 3) + i;
5523       if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5524 	{
5525 	  if (dump_enabled_p ())
5526 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5527 			     "shift permutation is not supported by target\n");
5528 	  return false;
5529 	}
5530       shift1_mask = vect_gen_perm_mask_checked (vectype, sel);
5531 
5532       /* Generating permutation constant to shift all elements.
5533 	 For vector length 8 it is {5 6 7 8 9 10 11 12}.  */
5534       for (i = 0; i < nelt; i++)
5535 	sel[i] = 2 * (nelt / 3) + 1 + i;
5536       if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5537 	{
5538 	  if (dump_enabled_p ())
5539 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5540 			     "shift permutation is not supported by target\n");
5541 	  return false;
5542 	}
5543       shift2_mask = vect_gen_perm_mask_checked (vectype, sel);
5544 
5545       /* Generating permutation constant to shift all elements.
5546 	 For vector length 8 it is {3 4 5 6 7 8 9 10}.  */
5547       for (i = 0; i < nelt; i++)
5548 	sel[i] = (nelt / 3) + (nelt % 3) / 2 + i;
5549       if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5550 	{
5551 	  if (dump_enabled_p ())
5552 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5553 			     "shift permutation is not supported by target\n");
5554 	  return false;
5555 	}
5556       shift3_mask = vect_gen_perm_mask_checked (vectype, sel);
5557 
5558       /* Generating permutation constant to shift all elements.
5559 	 For vector length 8 it is {5 6 7 8 9 10 11 12}.  */
5560       for (i = 0; i < nelt; i++)
5561 	sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i;
5562       if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5563 	{
5564 	  if (dump_enabled_p ())
5565 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5566 			     "shift permutation is not supported by target\n");
5567 	  return false;
5568 	}
5569       shift4_mask = vect_gen_perm_mask_checked (vectype, sel);
5570 
5571       for (k = 0; k < 3; k++)
5572 	{
5573 	  data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3");
5574 	  perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5575 					   dr_chain[k], dr_chain[k],
5576 					   perm3_mask);
5577 	  vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5578 	  vect[k] = data_ref;
5579 	}
5580 
5581       for (k = 0; k < 3; k++)
5582 	{
5583 	  data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1");
5584 	  perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5585 					   vect[k % 3], vect[(k + 1) % 3],
5586 					   shift1_mask);
5587 	  vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5588 	  vect_shift[k] = data_ref;
5589 	}
5590 
5591       for (k = 0; k < 3; k++)
5592 	{
5593 	  data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2");
5594 	  perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5595 					   vect_shift[(4 - k) % 3],
5596 					   vect_shift[(3 - k) % 3],
5597 					   shift2_mask);
5598 	  vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5599 	  vect[k] = data_ref;
5600 	}
5601 
5602       (*result_chain)[3 - (nelt % 3)] = vect[2];
5603 
5604       data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3");
5605       perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[0],
5606 				       vect[0], shift3_mask);
5607       vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5608       (*result_chain)[nelt % 3] = data_ref;
5609 
5610       data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4");
5611       perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[1],
5612 				       vect[1], shift4_mask);
5613       vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5614       (*result_chain)[0] = data_ref;
5615       return true;
5616     }
5617   return false;
5618 }
5619 
5620 /* Function vect_transform_grouped_load.
5621 
5622    Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
5623    to perform their permutation and ascribe the result vectorized statements to
5624    the scalar statements.
5625 */
5626 
5627 void
5628 vect_transform_grouped_load (gimple stmt, vec<tree> dr_chain, int size,
5629 			     gimple_stmt_iterator *gsi)
5630 {
5631   machine_mode mode;
5632   vec<tree> result_chain = vNULL;
5633 
5634   /* DR_CHAIN contains input data-refs that are a part of the interleaving.
5635      RESULT_CHAIN is the output of vect_permute_load_chain, it contains permuted
5636      vectors, that are ready for vector computation.  */
5637   result_chain.create (size);
5638 
5639   /* If reassociation width for vector type is 2 or greater target machine can
5640      execute 2 or more vector instructions in parallel.  Otherwise try to
5641      get chain for loads group using vect_shift_permute_load_chain.  */
5642   mode = TYPE_MODE (STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt)));
5643   if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1
5644       || exact_log2 (size) != -1
5645       || !vect_shift_permute_load_chain (dr_chain, size, stmt,
5646 					 gsi, &result_chain))
5647     vect_permute_load_chain (dr_chain, size, stmt, gsi, &result_chain);
5648   vect_record_grouped_load_vectors (stmt, result_chain);
5649   result_chain.release ();
5650 }
5651 
5652 /* RESULT_CHAIN contains the output of a group of grouped loads that were
5653    generated as part of the vectorization of STMT.  Assign the statement
5654    for each vector to the associated scalar statement.  */
5655 
5656 void
5657 vect_record_grouped_load_vectors (gimple stmt, vec<tree> result_chain)
5658 {
5659   gimple first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt));
5660   gimple next_stmt, new_stmt;
5661   unsigned int i, gap_count;
5662   tree tmp_data_ref;
5663 
5664   /* Put a permuted data-ref in the VECTORIZED_STMT field.
5665      Since we scan the chain starting from it's first node, their order
5666      corresponds the order of data-refs in RESULT_CHAIN.  */
5667   next_stmt = first_stmt;
5668   gap_count = 1;
5669   FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
5670     {
5671       if (!next_stmt)
5672 	break;
5673 
5674       /* Skip the gaps.  Loads created for the gaps will be removed by dead
5675        code elimination pass later.  No need to check for the first stmt in
5676        the group, since it always exists.
5677        GROUP_GAP is the number of steps in elements from the previous
5678        access (if there is no gap GROUP_GAP is 1).  We skip loads that
5679        correspond to the gaps.  */
5680       if (next_stmt != first_stmt
5681           && gap_count < GROUP_GAP (vinfo_for_stmt (next_stmt)))
5682       {
5683         gap_count++;
5684         continue;
5685       }
5686 
5687       while (next_stmt)
5688         {
5689 	  new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
5690 	  /* We assume that if VEC_STMT is not NULL, this is a case of multiple
5691 	     copies, and we put the new vector statement in the first available
5692 	     RELATED_STMT.  */
5693 	  if (!STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)))
5694 	    STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)) = new_stmt;
5695 	  else
5696             {
5697               if (!GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
5698                 {
5699  	          gimple prev_stmt =
5700 		    STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt));
5701 	          gimple rel_stmt =
5702 		    STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt));
5703 	          while (rel_stmt)
5704 		    {
5705 		      prev_stmt = rel_stmt;
5706 		      rel_stmt =
5707                         STMT_VINFO_RELATED_STMT (vinfo_for_stmt (rel_stmt));
5708 		    }
5709 
5710   	          STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt)) =
5711                     new_stmt;
5712                 }
5713             }
5714 
5715 	  next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
5716 	  gap_count = 1;
5717 	  /* If NEXT_STMT accesses the same DR as the previous statement,
5718 	     put the same TMP_DATA_REF as its vectorized statement; otherwise
5719 	     get the next data-ref from RESULT_CHAIN.  */
5720 	  if (!next_stmt || !GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
5721 	    break;
5722         }
5723     }
5724 }
5725 
5726 /* Function vect_force_dr_alignment_p.
5727 
5728    Returns whether the alignment of a DECL can be forced to be aligned
5729    on ALIGNMENT bit boundary.  */
5730 
5731 bool
5732 vect_can_force_dr_alignment_p (const_tree decl, unsigned int alignment)
5733 {
5734   if (TREE_CODE (decl) != VAR_DECL)
5735     return false;
5736 
5737   if (decl_in_symtab_p (decl)
5738       && !symtab_node::get (decl)->can_increase_alignment_p ())
5739     return false;
5740 
5741   if (TREE_STATIC (decl))
5742     return (alignment <= MAX_OFILE_ALIGNMENT);
5743   else
5744     return (alignment <= MAX_STACK_ALIGNMENT);
5745 }
5746 
5747 
5748 /* Return whether the data reference DR is supported with respect to its
5749    alignment.
5750    If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even
5751    it is aligned, i.e., check if it is possible to vectorize it with different
5752    alignment.  */
5753 
5754 enum dr_alignment_support
5755 vect_supportable_dr_alignment (struct data_reference *dr,
5756                                bool check_aligned_accesses)
5757 {
5758   gimple stmt = DR_STMT (dr);
5759   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5760   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5761   machine_mode mode = TYPE_MODE (vectype);
5762   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5763   struct loop *vect_loop = NULL;
5764   bool nested_in_vect_loop = false;
5765 
5766   if (aligned_access_p (dr) && !check_aligned_accesses)
5767     return dr_aligned;
5768 
5769   /* For now assume all conditional loads/stores support unaligned
5770      access without any special code.  */
5771   if (is_gimple_call (stmt)
5772       && gimple_call_internal_p (stmt)
5773       && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
5774 	  || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
5775     return dr_unaligned_supported;
5776 
5777   if (loop_vinfo)
5778     {
5779       vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
5780       nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt);
5781     }
5782 
5783   /* Possibly unaligned access.  */
5784 
5785   /* We can choose between using the implicit realignment scheme (generating
5786      a misaligned_move stmt) and the explicit realignment scheme (generating
5787      aligned loads with a REALIGN_LOAD).  There are two variants to the
5788      explicit realignment scheme: optimized, and unoptimized.
5789      We can optimize the realignment only if the step between consecutive
5790      vector loads is equal to the vector size.  Since the vector memory
5791      accesses advance in steps of VS (Vector Size) in the vectorized loop, it
5792      is guaranteed that the misalignment amount remains the same throughout the
5793      execution of the vectorized loop.  Therefore, we can create the
5794      "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
5795      at the loop preheader.
5796 
5797      However, in the case of outer-loop vectorization, when vectorizing a
5798      memory access in the inner-loop nested within the LOOP that is now being
5799      vectorized, while it is guaranteed that the misalignment of the
5800      vectorized memory access will remain the same in different outer-loop
5801      iterations, it is *not* guaranteed that is will remain the same throughout
5802      the execution of the inner-loop.  This is because the inner-loop advances
5803      with the original scalar step (and not in steps of VS).  If the inner-loop
5804      step happens to be a multiple of VS, then the misalignment remains fixed
5805      and we can use the optimized realignment scheme.  For example:
5806 
5807       for (i=0; i<N; i++)
5808         for (j=0; j<M; j++)
5809           s += a[i+j];
5810 
5811      When vectorizing the i-loop in the above example, the step between
5812      consecutive vector loads is 1, and so the misalignment does not remain
5813      fixed across the execution of the inner-loop, and the realignment cannot
5814      be optimized (as illustrated in the following pseudo vectorized loop):
5815 
5816       for (i=0; i<N; i+=4)
5817         for (j=0; j<M; j++){
5818           vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
5819                          // when j is {0,1,2,3,4,5,6,7,...} respectively.
5820                          // (assuming that we start from an aligned address).
5821           }
5822 
5823      We therefore have to use the unoptimized realignment scheme:
5824 
5825       for (i=0; i<N; i+=4)
5826           for (j=k; j<M; j+=4)
5827           vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
5828                            // that the misalignment of the initial address is
5829                            // 0).
5830 
5831      The loop can then be vectorized as follows:
5832 
5833       for (k=0; k<4; k++){
5834         rt = get_realignment_token (&vp[k]);
5835         for (i=0; i<N; i+=4){
5836           v1 = vp[i+k];
5837           for (j=k; j<M; j+=4){
5838             v2 = vp[i+j+VS-1];
5839             va = REALIGN_LOAD <v1,v2,rt>;
5840             vs += va;
5841             v1 = v2;
5842           }
5843         }
5844     } */
5845 
5846   if (DR_IS_READ (dr))
5847     {
5848       bool is_packed = false;
5849       tree type = (TREE_TYPE (DR_REF (dr)));
5850 
5851       if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
5852 	  && (!targetm.vectorize.builtin_mask_for_load
5853 	      || targetm.vectorize.builtin_mask_for_load ()))
5854 	{
5855 	  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5856 	  if ((nested_in_vect_loop
5857 	       && (TREE_INT_CST_LOW (DR_STEP (dr))
5858 	 	   != GET_MODE_SIZE (TYPE_MODE (vectype))))
5859               || !loop_vinfo)
5860 	    return dr_explicit_realign;
5861 	  else
5862 	    return dr_explicit_realign_optimized;
5863 	}
5864       if (!known_alignment_for_access_p (dr))
5865 	is_packed = not_size_aligned (DR_REF (dr));
5866 
5867       if ((TYPE_USER_ALIGN (type) && !is_packed)
5868 	  || targetm.vectorize.
5869 	       support_vector_misalignment (mode, type,
5870 					    DR_MISALIGNMENT (dr), is_packed))
5871 	/* Can't software pipeline the loads, but can at least do them.  */
5872 	return dr_unaligned_supported;
5873     }
5874   else
5875     {
5876       bool is_packed = false;
5877       tree type = (TREE_TYPE (DR_REF (dr)));
5878 
5879       if (!known_alignment_for_access_p (dr))
5880 	is_packed = not_size_aligned (DR_REF (dr));
5881 
5882      if ((TYPE_USER_ALIGN (type) && !is_packed)
5883 	 || targetm.vectorize.
5884 	      support_vector_misalignment (mode, type,
5885 					   DR_MISALIGNMENT (dr), is_packed))
5886        return dr_unaligned_supported;
5887     }
5888 
5889   /* Unsupported.  */
5890   return dr_unaligned_unsupported;
5891 }
5892