1 /* Data References Analysis and Manipulation Utilities for Vectorization.
2    Copyright (C) 2003-2022 Free Software Foundation, Inc.
3    Contributed by Dorit Naishlos <dorit@il.ibm.com>
4    and Ira Rosen <irar@il.ibm.com>
5 
6 This file is part of GCC.
7 
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
12 
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
16 for more details.
17 
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3.  If not see
20 <http://www.gnu.org/licenses/>.  */
21 
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "predict.h"
31 #include "memmodel.h"
32 #include "tm_p.h"
33 #include "ssa.h"
34 #include "optabs-tree.h"
35 #include "cgraph.h"
36 #include "dumpfile.h"
37 #include "alias.h"
38 #include "fold-const.h"
39 #include "stor-layout.h"
40 #include "tree-eh.h"
41 #include "gimplify.h"
42 #include "gimple-iterator.h"
43 #include "gimplify-me.h"
44 #include "tree-ssa-loop-ivopts.h"
45 #include "tree-ssa-loop-manip.h"
46 #include "tree-ssa-loop.h"
47 #include "cfgloop.h"
48 #include "tree-scalar-evolution.h"
49 #include "tree-vectorizer.h"
50 #include "expr.h"
51 #include "builtins.h"
52 #include "tree-cfg.h"
53 #include "tree-hash-traits.h"
54 #include "vec-perm-indices.h"
55 #include "internal-fn.h"
56 #include "gimple-fold.h"
57 
58 /* Return true if load- or store-lanes optab OPTAB is implemented for
59    COUNT vectors of type VECTYPE.  NAME is the name of OPTAB.  */
60 
61 static bool
62 vect_lanes_optab_supported_p (const char *name, convert_optab optab,
63 			      tree vectype, unsigned HOST_WIDE_INT count)
64 {
65   machine_mode mode, array_mode;
66   bool limit_p;
67 
68   mode = TYPE_MODE (vectype);
69   if (!targetm.array_mode (mode, count).exists (&array_mode))
70     {
71       poly_uint64 bits = count * GET_MODE_BITSIZE (mode);
72       limit_p = !targetm.array_mode_supported_p (mode, count);
73       if (!int_mode_for_size (bits, limit_p).exists (&array_mode))
74 	{
75 	  if (dump_enabled_p ())
76 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
77 			     "no array mode for %s[%wu]\n",
78 			     GET_MODE_NAME (mode), count);
79 	  return false;
80 	}
81     }
82 
83   if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
84     {
85       if (dump_enabled_p ())
86 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
87                          "cannot use %s<%s><%s>\n", name,
88                          GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
89       return false;
90     }
91 
92   if (dump_enabled_p ())
93     dump_printf_loc (MSG_NOTE, vect_location,
94                      "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode),
95                      GET_MODE_NAME (mode));
96 
97   return true;
98 }
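
/* As an illustration, a query such as
     vect_lanes_optab_supported_p ("vec_load_lanes", vec_load_lanes_optab,
				   <V4SI vectype>, 4)
   asks whether four V4SI vectors can be accessed as a single array;
   targets with load-lanes instructions (e.g. AArch64 "ld4") provide the
   corresponding vec_load_lanes pattern, so the convert optab handler
   exists and the function returns true.  */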
99 
100 
101 /* Return the smallest scalar part of STMT_INFO.
102    This is used to determine the vectype of the stmt.  We generally set the
103    vectype according to the type of the result (lhs).  For stmts whose
104    result-type is different than the type of the arguments (e.g., demotion,
105    promotion), vectype will be reset appropriately (later).  Note that we have
106    to visit the smallest datatype in this function, because that determines the
107    VF.  If the smallest datatype in the loop is present only as the rhs of a
108    promotion operation - we'd miss it.
109    Such a case, where a variable of this datatype does not appear in the lhs
110    anywhere in the loop, can only occur if it's an invariant: e.g.:
111    'int_x = (int) short_inv', which we'd expect to have been optimized away by
112    invariant motion.  However, we cannot rely on invariant motion to always
113    take invariants out of the loop, and so in the case of promotion we also
114    have to check the rhs.
115    LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
116    types.  */
117 
118 tree
119 vect_get_smallest_scalar_type (stmt_vec_info stmt_info, tree scalar_type)
120 {
121   HOST_WIDE_INT lhs, rhs;
122 
123   /* During the analysis phase, this function is called on arbitrary
124      statements that might not have scalar results.  */
125   if (!tree_fits_uhwi_p (TYPE_SIZE_UNIT (scalar_type)))
126     return scalar_type;
127 
128   lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
129 
130   gassign *assign = dyn_cast <gassign *> (stmt_info->stmt);
131   if (assign)
132     {
133       scalar_type = TREE_TYPE (gimple_assign_lhs (assign));
134       if (gimple_assign_cast_p (assign)
135 	  || gimple_assign_rhs_code (assign) == DOT_PROD_EXPR
136 	  || gimple_assign_rhs_code (assign) == WIDEN_SUM_EXPR
137 	  || gimple_assign_rhs_code (assign) == WIDEN_MULT_EXPR
138 	  || gimple_assign_rhs_code (assign) == WIDEN_LSHIFT_EXPR
139 	  || gimple_assign_rhs_code (assign) == WIDEN_PLUS_EXPR
140 	  || gimple_assign_rhs_code (assign) == WIDEN_MINUS_EXPR
141 	  || gimple_assign_rhs_code (assign) == FLOAT_EXPR)
142 	{
143 	  tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (assign));
144 
145 	  rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
146 	  if (rhs < lhs)
147 	    scalar_type = rhs_type;
148 	}
149     }
150   else if (gcall *call = dyn_cast <gcall *> (stmt_info->stmt))
151     {
152       unsigned int i = 0;
153       if (gimple_call_internal_p (call))
154 	{
155 	  internal_fn ifn = gimple_call_internal_fn (call);
156 	  if (internal_load_fn_p (ifn))
157 	    /* For loads the LHS type does the trick.  */
158 	    i = ~0U;
159 	  else if (internal_store_fn_p (ifn))
160 	    {
161 	      /* For stores use the type of the stored value.  */
162 	      i = internal_fn_stored_value_index (ifn);
163 	      scalar_type = TREE_TYPE (gimple_call_arg (call, i));
164 	      i = ~0U;
165 	    }
166 	  else if (internal_fn_mask_index (ifn) == 0)
167 	    i = 1;
168 	}
169       if (i < gimple_call_num_args (call))
170 	{
171 	  tree rhs_type = TREE_TYPE (gimple_call_arg (call, i));
172 	  if (tree_fits_uhwi_p (TYPE_SIZE_UNIT (rhs_type)))
173 	    {
174 	      rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
175 	      if (rhs < lhs)
176 		scalar_type = rhs_type;
177 	    }
178 	}
179     }
180 
181   return scalar_type;
182 }
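
/* As an illustration, for a widening statement such as
     int_y = (int) short_x;
   the lhs type is 4 bytes wide but the rhs type is only 2 bytes, so the
   function returns the 2-byte type; the vectorization factor is then
   based on how many shorts fit in a vector rather than how many ints.  */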
183 
184 
185 /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
186    tested at run-time.  Return TRUE if DDR was successfully inserted.
187    Return false if versioning is not supported.  */
188 
189 static opt_result
190 vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
191 {
192   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
193 
194   if ((unsigned) param_vect_max_version_for_alias_checks == 0)
195     return opt_result::failure_at (vect_location,
196 				   "will not create alias checks, as"
197 				   " --param vect-max-version-for-alias-checks"
198 				   " == 0\n");
199 
200   opt_result res
201     = runtime_alias_check_p (ddr, loop,
202 			     optimize_loop_nest_for_speed_p (loop));
203   if (!res)
204     return res;
205 
206   LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
207   return opt_result::success ();
208 }
209 
210 /* Record that loop LOOP_VINFO needs to check that VALUE is nonzero.  */
211 
212 static void
213 vect_check_nonzero_value (loop_vec_info loop_vinfo, tree value)
214 {
215   const vec<tree> &checks = LOOP_VINFO_CHECK_NONZERO (loop_vinfo);
216   for (unsigned int i = 0; i < checks.length(); ++i)
217     if (checks[i] == value)
218       return;
219 
220   if (dump_enabled_p ())
221     dump_printf_loc (MSG_NOTE, vect_location,
222 		     "need run-time check that %T is nonzero\n",
223 		     value);
224   LOOP_VINFO_CHECK_NONZERO (loop_vinfo).safe_push (value);
225 }
226 
227 /* Return true if we know that the order of vectorized DR_INFO_A and
228    vectorized DR_INFO_B will be the same as the order of DR_INFO_A and
229    DR_INFO_B.  At least one of the accesses is a write.  */
230 
231 static bool
232 vect_preserves_scalar_order_p (dr_vec_info *dr_info_a, dr_vec_info *dr_info_b)
233 {
234   stmt_vec_info stmtinfo_a = dr_info_a->stmt;
235   stmt_vec_info stmtinfo_b = dr_info_b->stmt;
236 
237   /* Single statements are always kept in their original order.  */
238   if (!STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
239       && !STMT_VINFO_GROUPED_ACCESS (stmtinfo_b))
240     return true;
241 
242   /* STMT_A and STMT_B belong to overlapping groups.  All loads are
243      emitted at the position of the first scalar load.
244      Stores in a group are emitted at the position of the last scalar store.
245      Compute that position and check whether the resulting order matches
246      the current one.  */
247   stmt_vec_info il_a = DR_GROUP_FIRST_ELEMENT (stmtinfo_a);
248   if (il_a)
249     {
250       if (DR_IS_WRITE (STMT_VINFO_DATA_REF (stmtinfo_a)))
251 	for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_a); s;
252 	     s = DR_GROUP_NEXT_ELEMENT (s))
253 	  il_a = get_later_stmt (il_a, s);
254       else /* DR_IS_READ */
255 	for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_a); s;
256 	     s = DR_GROUP_NEXT_ELEMENT (s))
257 	  if (get_later_stmt (il_a, s) == il_a)
258 	    il_a = s;
259     }
260   else
261     il_a = stmtinfo_a;
262   stmt_vec_info il_b = DR_GROUP_FIRST_ELEMENT (stmtinfo_b);
263   if (il_b)
264     {
265       if (DR_IS_WRITE (STMT_VINFO_DATA_REF (stmtinfo_b)))
266 	for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_b); s;
267 	     s = DR_GROUP_NEXT_ELEMENT (s))
268 	  il_b = get_later_stmt (il_b, s);
269       else /* DR_IS_READ */
270 	for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_b); s;
271 	     s = DR_GROUP_NEXT_ELEMENT (s))
272 	  if (get_later_stmt (il_b, s) == il_b)
273 	    il_b = s;
274     }
275   else
276     il_b = stmtinfo_b;
277   bool a_after_b = (get_later_stmt (stmtinfo_a, stmtinfo_b) == stmtinfo_a);
278   return (get_later_stmt (il_a, il_b) == il_a) == a_after_b;
279 }
280 
281 /* A subroutine of vect_analyze_data_ref_dependence.  Handle
282    DDR_COULD_BE_INDEPENDENT_P ddr DDR that has a known set of dependence
283    distances.  These distances are conservatively correct but they don't
284    reflect a guaranteed dependence.
285 
286    Return true if this function does all the work necessary to avoid
287    an alias or false if the caller should use the dependence distances
288    to limit the vectorization factor in the usual way.  LOOP_DEPTH is
289    the depth of the loop described by LOOP_VINFO and the other arguments
290    are as for vect_analyze_data_ref_dependence.  */
291 
292 static bool
293 vect_analyze_possibly_independent_ddr (data_dependence_relation *ddr,
294 				       loop_vec_info loop_vinfo,
295 				       int loop_depth, unsigned int *max_vf)
296 {
297   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
298   for (lambda_vector &dist_v : DDR_DIST_VECTS (ddr))
299     {
300       int dist = dist_v[loop_depth];
301       if (dist != 0 && !(dist > 0 && DDR_REVERSED_P (ddr)))
302 	{
303 	  /* If the user asserted safelen >= DIST consecutive iterations
304 	     can be executed concurrently, assume independence.
305 
306 	     ??? An alternative would be to add the alias check even
307 	     in this case, and vectorize the fallback loop with the
308 	     maximum VF set to safelen.  However, if the user has
309 	     explicitly given a length, it's less likely that that
310 	     would be a win.  */
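	  /* For instance, assuming the loop was annotated with
	        #pragma omp simd safelen(8)
	     loop->safelen is 8, so a dependence distance of up to 8 is
	     ignored here and *max_vf is capped at 8.  */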
311 	  if (loop->safelen >= 2 && abs_hwi (dist) <= loop->safelen)
312 	    {
313 	      if ((unsigned int) loop->safelen < *max_vf)
314 		*max_vf = loop->safelen;
315 	      LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
316 	      continue;
317 	    }
318 
319 	  /* For dependence distances of 2 or more, we have the option
320 	     of limiting VF or checking for an alias at runtime.
321 	     Prefer to check at runtime if we can, to avoid limiting
322 	     the VF unnecessarily when the bases are in fact independent.
323 
324 	     Note that the alias checks will be removed if the VF ends up
325 	     being small enough.  */
326 	  dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (DDR_A (ddr));
327 	  dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (DDR_B (ddr));
328 	  return (!STMT_VINFO_GATHER_SCATTER_P (dr_info_a->stmt)
329 		  && !STMT_VINFO_GATHER_SCATTER_P (dr_info_b->stmt)
330 		  && vect_mark_for_runtime_alias_test (ddr, loop_vinfo));
331 	}
332     }
333   return true;
334 }
335 
336 
337 /* Function vect_analyze_data_ref_dependence.
338 
339    FIXME: I needed to change the sense of the returned flag.
340 
341    Return FALSE if there (might) exist a dependence between a memory-reference
342    DRA and a memory-reference DRB.  Return TRUE when versioning for alias
343    can check the dependence at run-time.  Adjust *MAX_VF according to
344    the data dependence.  */
345 
346 static opt_result
347 vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
348 				  loop_vec_info loop_vinfo,
349 				  unsigned int *max_vf)
350 {
351   unsigned int i;
352   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
353   struct data_reference *dra = DDR_A (ddr);
354   struct data_reference *drb = DDR_B (ddr);
355   dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (dra);
356   dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (drb);
357   stmt_vec_info stmtinfo_a = dr_info_a->stmt;
358   stmt_vec_info stmtinfo_b = dr_info_b->stmt;
359   lambda_vector dist_v;
360   unsigned int loop_depth;
361 
362   /* If user asserted safelen consecutive iterations can be
363      executed concurrently, assume independence.  */
364   auto apply_safelen = [&]()
365     {
366       if (loop->safelen >= 2)
367 	{
368 	  if ((unsigned int) loop->safelen < *max_vf)
369 	    *max_vf = loop->safelen;
370 	  LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
371 	  return true;
372 	}
373       return false;
374     };
375 
376   /* In loop analysis all data references should be vectorizable.  */
377   if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
378       || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
379     gcc_unreachable ();
380 
381   /* Independent data accesses.  */
382   if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
383     return opt_result::success ();
384 
385   if (dra == drb
386       || (DR_IS_READ (dra) && DR_IS_READ (drb)))
387     return opt_result::success ();
388 
389   /* We do not have to consider dependences between accesses that belong
390      to the same group, unless the stride could be smaller than the
391      group size.  */
392   if (DR_GROUP_FIRST_ELEMENT (stmtinfo_a)
393       && (DR_GROUP_FIRST_ELEMENT (stmtinfo_a)
394 	  == DR_GROUP_FIRST_ELEMENT (stmtinfo_b))
395       && !STMT_VINFO_STRIDED_P (stmtinfo_a))
396     return opt_result::success ();
397 
398   /* Even if we have an anti-dependence then, as the vectorized loop covers at
399      least two scalar iterations, there is always also a true dependence.
400      As the vectorizer does not re-order loads and stores we can ignore
401      the anti-dependence if TBAA can disambiguate both DRs similar to the
402      case with known negative distance anti-dependences (positive
403      distance anti-dependences would violate TBAA constraints).  */
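  /* For example, when DRA is a read from an "int" array and DRB a store
     to a "float" array, their alias sets cannot conflict under TBAA, so
     the accesses are treated as independent here.  */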
404   if (((DR_IS_READ (dra) && DR_IS_WRITE (drb))
405        || (DR_IS_WRITE (dra) && DR_IS_READ (drb)))
406       && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)),
407 				 get_alias_set (DR_REF (drb))))
408     return opt_result::success ();
409 
410   if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
411       || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
412     {
413       if (apply_safelen ())
414 	return opt_result::success ();
415 
416       return opt_result::failure_at
417 	(stmtinfo_a->stmt,
418 	 "possible alias involving gather/scatter between %T and %T\n",
419 	 DR_REF (dra), DR_REF (drb));
420     }
421 
422   /* Unknown data dependence.  */
423   if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
424     {
425       if (apply_safelen ())
426 	return opt_result::success ();
427 
428       if (dump_enabled_p ())
429 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, stmtinfo_a->stmt,
430 			 "versioning for alias required: "
431 			 "can't determine dependence between %T and %T\n",
432 			 DR_REF (dra), DR_REF (drb));
433 
434       /* Add to list of ddrs that need to be tested at run-time.  */
435       return vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
436     }
437 
438   /* Known data dependence.  */
439   if (DDR_NUM_DIST_VECTS (ddr) == 0)
440     {
441       if (apply_safelen ())
442 	return opt_result::success ();
443 
444       if (dump_enabled_p ())
445 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, stmtinfo_a->stmt,
446 			 "versioning for alias required: "
447 			 "bad dist vector for %T and %T\n",
448 			 DR_REF (dra), DR_REF (drb));
449       /* Add to list of ddrs that need to be tested at run-time.  */
450       return vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
451     }
452 
453   loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
454 
455   if (DDR_COULD_BE_INDEPENDENT_P (ddr)
456       && vect_analyze_possibly_independent_ddr (ddr, loop_vinfo,
457 						loop_depth, max_vf))
458     return opt_result::success ();
459 
460   FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
461     {
462       int dist = dist_v[loop_depth];
463 
464       if (dump_enabled_p ())
465 	dump_printf_loc (MSG_NOTE, vect_location,
466                          "dependence distance  = %d.\n", dist);
467 
468       if (dist == 0)
469 	{
470 	  if (dump_enabled_p ())
471 	    dump_printf_loc (MSG_NOTE, vect_location,
472 			     "dependence distance == 0 between %T and %T\n",
473 			     DR_REF (dra), DR_REF (drb));
474 
475 	  /* When we perform grouped accesses and perform implicit CSE
476 	     by detecting equal accesses and doing disambiguation with
477 	     runtime alias tests like for
478 	        .. = a[i];
479 		.. = a[i+1];
480 		a[i] = ..;
481 		a[i+1] = ..;
482 		*p = ..;
483 		.. = a[i];
484 		.. = a[i+1];
485 	     where we will end up loading { a[i], a[i+1] } once, make
486 	     sure that inserting group loads before the first load and
487 	     stores after the last store will do the right thing.
488 	     Similar for groups like
489 	        a[i] = ...;
490 		... = a[i];
491 		a[i+1] = ...;
492 	     where loads from the group interleave with the store.  */
493 	  if (!vect_preserves_scalar_order_p (dr_info_a, dr_info_b))
494 	    return opt_result::failure_at (stmtinfo_a->stmt,
495 					   "READ_WRITE dependence"
496 					   " in interleaving.\n");
497 
498 	  if (loop->safelen < 2)
499 	    {
500 	      tree indicator = dr_zero_step_indicator (dra);
501 	      if (!indicator || integer_zerop (indicator))
502 		return opt_result::failure_at (stmtinfo_a->stmt,
503 					       "access also has a zero step\n");
504 	      else if (TREE_CODE (indicator) != INTEGER_CST)
505 		vect_check_nonzero_value (loop_vinfo, indicator);
506 	    }
507 	  continue;
508 	}
509 
510       if (dist > 0 && DDR_REVERSED_P (ddr))
511 	{
512 	  /* If DDR_REVERSED_P the order of the data-refs in DDR was
513 	     reversed (to make distance vector positive), and the actual
514 	     distance is negative.  */
515 	  if (dump_enabled_p ())
516 	    dump_printf_loc (MSG_NOTE, vect_location,
517 	                     "dependence distance negative.\n");
518 	  /* When doing outer loop vectorization, we need to check if there is
519 	     a backward dependence at the inner loop level if the dependence
520 	     at the outer loop is reversed.  See PR81740.  */
521 	  if (nested_in_vect_loop_p (loop, stmtinfo_a)
522 	      || nested_in_vect_loop_p (loop, stmtinfo_b))
523 	    {
524 	      unsigned inner_depth = index_in_loop_nest (loop->inner->num,
525 							 DDR_LOOP_NEST (ddr));
526 	      if (dist_v[inner_depth] < 0)
527 		return opt_result::failure_at (stmtinfo_a->stmt,
528 					       "not vectorized, dependence "
529 					       "between data-refs %T and %T\n",
530 					       DR_REF (dra), DR_REF (drb));
531 	    }
532 	  /* Record a negative dependence distance to later limit the
533 	     amount of stmt copying / unrolling we can perform.
534 	     Only need to handle read-after-write dependence.  */
535 	  if (DR_IS_READ (drb)
536 	      && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0
537 		  || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist))
538 	    STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist;
539 	  continue;
540 	}
541 
542       unsigned int abs_dist = abs (dist);
543       if (abs_dist >= 2 && abs_dist < *max_vf)
544 	{
545 	  /* The dependence distance requires reduction of the maximal
546 	     vectorization factor.  */
547 	  *max_vf = abs_dist;
548 	  if (dump_enabled_p ())
549 	    dump_printf_loc (MSG_NOTE, vect_location,
550 	                     "adjusting maximal vectorization factor to %i\n",
551 	                     *max_vf);
552 	}
553 
554       if (abs_dist >= *max_vf)
555 	{
556 	  /* Dependence distance does not create dependence, as far as
557 	     vectorization is concerned, in this case.  */
558 	  if (dump_enabled_p ())
559 	    dump_printf_loc (MSG_NOTE, vect_location,
560 	                     "dependence distance >= VF.\n");
561 	  continue;
562 	}
563 
564       return opt_result::failure_at (stmtinfo_a->stmt,
565 				     "not vectorized, possible dependence "
566 				     "between data-refs %T and %T\n",
567 				     DR_REF (dra), DR_REF (drb));
568     }
569 
570   return opt_result::success ();
571 }
572 
573 /* Function vect_analyze_data_ref_dependences.
574 
575    Examine all the data references in the loop, and make sure there do not
576    exist any data dependences between them.  Set *MAX_VF according to
577    the maximum vectorization factor the data dependences allow.  */
578 
579 opt_result
580 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo,
581 				   unsigned int *max_vf)
582 {
583   unsigned int i;
584   struct data_dependence_relation *ddr;
585 
586   DUMP_VECT_SCOPE ("vect_analyze_data_ref_dependences");
587 
588   if (!LOOP_VINFO_DDRS (loop_vinfo).exists ())
589     {
590       LOOP_VINFO_DDRS (loop_vinfo)
591 	.create (LOOP_VINFO_DATAREFS (loop_vinfo).length ()
592 		 * LOOP_VINFO_DATAREFS (loop_vinfo).length ());
593       /* We do not need read-read dependences.  */
594       bool res = compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
595 					  &LOOP_VINFO_DDRS (loop_vinfo),
596 					  LOOP_VINFO_LOOP_NEST (loop_vinfo),
597 					  false);
598       gcc_assert (res);
599     }
600 
601   LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true;
602 
603   /* For epilogues we either have no aliases or alias versioning
604      was applied to original loop.  Therefore we may just get max_vf
605      using VF of original loop.  */
606   if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
607     *max_vf = LOOP_VINFO_ORIG_MAX_VECT_FACTOR (loop_vinfo);
608   else
609     FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
610       {
611 	opt_result res
612 	  = vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf);
613 	if (!res)
614 	  return res;
615       }
616 
617   return opt_result::success ();
618 }
619 
620 
621 /* Function vect_slp_analyze_data_ref_dependence.
622 
623    Return TRUE if there (might) exist a dependence between a memory-reference
624    DRA and a memory-reference DRB for VINFO that would prevent vectorization.
625    Return FALSE if the accesses are known to be independent or the
626    dependence can be ignored.  */
627 
628 static bool
629 vect_slp_analyze_data_ref_dependence (vec_info *vinfo,
630 				      struct data_dependence_relation *ddr)
631 {
632   struct data_reference *dra = DDR_A (ddr);
633   struct data_reference *drb = DDR_B (ddr);
634   dr_vec_info *dr_info_a = vinfo->lookup_dr (dra);
635   dr_vec_info *dr_info_b = vinfo->lookup_dr (drb);
636 
637   /* We need to check dependences of statements marked as unvectorizable
638      as well, they still can prohibit vectorization.  */
639 
640   /* Independent data accesses.  */
641   if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
642     return false;
643 
644   if (dra == drb)
645     return false;
646 
647   /* Read-read is OK.  */
648   if (DR_IS_READ (dra) && DR_IS_READ (drb))
649     return false;
650 
651   /* If dra and drb are part of the same interleaving chain consider
652      them independent.  */
653   if (STMT_VINFO_GROUPED_ACCESS (dr_info_a->stmt)
654       && (DR_GROUP_FIRST_ELEMENT (dr_info_a->stmt)
655 	  == DR_GROUP_FIRST_ELEMENT (dr_info_b->stmt)))
656     return false;
657 
658   /* Unknown data dependence.  */
659   if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
660     {
661       if  (dump_enabled_p ())
662 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
663 			 "can't determine dependence between %T and %T\n",
664 			 DR_REF (dra), DR_REF (drb));
665     }
666   else if (dump_enabled_p ())
667     dump_printf_loc (MSG_NOTE, vect_location,
668 		     "determined dependence between %T and %T\n",
669 		     DR_REF (dra), DR_REF (drb));
670 
671   return true;
672 }
673 
674 
675 /* Analyze dependences involved in the transform of a store SLP NODE.  */
676 
677 static bool
678 vect_slp_analyze_store_dependences (vec_info *vinfo, slp_tree node)
679 {
680   /* This walks over all stmts involved in the SLP store done
681      in NODE verifying we can sink them up to the last stmt in the
682      group.  */
683   stmt_vec_info last_access_info = vect_find_last_scalar_stmt_in_slp (node);
684   gcc_assert (DR_IS_WRITE (STMT_VINFO_DATA_REF (last_access_info)));
685 
686   for (unsigned k = 0; k < SLP_TREE_SCALAR_STMTS (node).length (); ++k)
687     {
688       stmt_vec_info access_info
689 	= vect_orig_stmt (SLP_TREE_SCALAR_STMTS (node)[k]);
690       if (access_info == last_access_info)
691 	continue;
692       data_reference *dr_a = STMT_VINFO_DATA_REF (access_info);
693       ao_ref ref;
694       bool ref_initialized_p = false;
695       for (gimple_stmt_iterator gsi = gsi_for_stmt (access_info->stmt);
696 	   gsi_stmt (gsi) != last_access_info->stmt; gsi_next (&gsi))
697 	{
698 	  gimple *stmt = gsi_stmt (gsi);
699 	  if (! gimple_vuse (stmt))
700 	    continue;
701 
702 	  /* If we couldn't record a (single) data reference for this
703 	     stmt we have to resort to the alias oracle.  */
704 	  stmt_vec_info stmt_info = vinfo->lookup_stmt (stmt);
705 	  data_reference *dr_b = STMT_VINFO_DATA_REF (stmt_info);
706 	  if (!dr_b)
707 	    {
708 	      /* We are moving a store - this means
709 		 we cannot use TBAA for disambiguation.  */
710 	      if (!ref_initialized_p)
711 		ao_ref_init (&ref, DR_REF (dr_a));
712 	      if (stmt_may_clobber_ref_p_1 (stmt, &ref, false)
713 		  || ref_maybe_used_by_stmt_p (stmt, &ref, false))
714 		return false;
715 	      continue;
716 	    }
717 
718 	  gcc_assert (!gimple_visited_p (stmt));
719 
720 	  ddr_p ddr = initialize_data_dependence_relation (dr_a,
721 							   dr_b, vNULL);
722 	  bool dependent = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
723 	  free_dependence_relation (ddr);
724 	  if (dependent)
725 	    return false;
726 	}
727     }
728   return true;
729 }
730 
731 /* Analyze dependences involved in the transform of a load SLP NODE.  STORES
732    contain the vector of scalar stores of this instance if we are
733    disambiguating the loads.  */
734 
735 static bool
736 vect_slp_analyze_load_dependences (vec_info *vinfo, slp_tree node,
737 				   vec<stmt_vec_info> stores,
738 				   stmt_vec_info last_store_info)
739 {
740   /* This walks over all stmts involved in the SLP load done
741      in NODE verifying we can hoist them up to the first stmt in the
742      group.  */
743   stmt_vec_info first_access_info = vect_find_first_scalar_stmt_in_slp (node);
744   gcc_assert (DR_IS_READ (STMT_VINFO_DATA_REF (first_access_info)));
745 
746   for (unsigned k = 0; k < SLP_TREE_SCALAR_STMTS (node).length (); ++k)
747     {
748       stmt_vec_info access_info
749 	= vect_orig_stmt (SLP_TREE_SCALAR_STMTS (node)[k]);
750       if (access_info == first_access_info)
751 	continue;
752       data_reference *dr_a = STMT_VINFO_DATA_REF (access_info);
753       ao_ref ref;
754       bool ref_initialized_p = false;
755       hash_set<stmt_vec_info> grp_visited;
756       for (gimple_stmt_iterator gsi = gsi_for_stmt (access_info->stmt);
757 	   gsi_stmt (gsi) != first_access_info->stmt; gsi_prev (&gsi))
758 	{
759 	  gimple *stmt = gsi_stmt (gsi);
760 	  if (! gimple_vdef (stmt))
761 	    continue;
762 
763 	  stmt_vec_info stmt_info = vinfo->lookup_stmt (stmt);
764 
765 	  /* If we run into a store of this same instance (we've just
766 	     marked those) then delay dependence checking until we run
767 	     into the last store because this is where it will have
768 	     been sunk to (and we verified that we can do that already).  */
769 	  if (gimple_visited_p (stmt))
770 	    {
771 	      if (stmt_info != last_store_info)
772 		continue;
773 
774 	      for (stmt_vec_info &store_info : stores)
775 		{
776 		  data_reference *store_dr = STMT_VINFO_DATA_REF (store_info);
777 		  ddr_p ddr = initialize_data_dependence_relation
778 				(dr_a, store_dr, vNULL);
779 		  bool dependent
780 		    = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
781 		  free_dependence_relation (ddr);
782 		  if (dependent)
783 		    return false;
784 		}
785 	      continue;
786 	    }
787 
788 	  auto check_hoist = [&] (stmt_vec_info stmt_info) -> bool
789 	    {
790 	      /* We are hoisting a load - this means we can use TBAA for
791 		 disambiguation.  */
792 	      if (!ref_initialized_p)
793 		ao_ref_init (&ref, DR_REF (dr_a));
794 	      if (stmt_may_clobber_ref_p_1 (stmt_info->stmt, &ref, true))
795 		{
796 		  /* If we couldn't record a (single) data reference for this
797 		     stmt we have to give up now.  */
798 		  data_reference *dr_b = STMT_VINFO_DATA_REF (stmt_info);
799 		  if (!dr_b)
800 		    return false;
801 		  ddr_p ddr = initialize_data_dependence_relation (dr_a,
802 								   dr_b, vNULL);
803 		  bool dependent
804 		    = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
805 		  free_dependence_relation (ddr);
806 		  if (dependent)
807 		    return false;
808 		}
809 	      /* No dependence.  */
810 	      return true;
811 	    };
812 	  if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
813 	    {
814 	      /* When we run into a store group we have to honor
815 		 that earlier stores might be moved here.  We don't
816 		 know exactly which and where to since we lack a
817 		 back-mapping from DR to SLP node, so assume all
818 		 earlier stores are sunk here.  It's enough to
819 		 consider the last stmt of a group for this.
820 		 ???  Both this and the fact that we disregard that
821 		 the conflicting instance might be removed later
822 		 is overly conservative.  */
823 	      if (!grp_visited.add (DR_GROUP_FIRST_ELEMENT (stmt_info)))
824 		for (auto store_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
825 		     store_info != NULL;
826 		     store_info = DR_GROUP_NEXT_ELEMENT (store_info))
827 		  if ((store_info == stmt_info
828 		       || get_later_stmt (store_info, stmt_info) == stmt_info)
829 		      && !check_hoist (store_info))
830 		    return false;
831 	    }
832 	  else
833 	    {
834 	      if (!check_hoist (stmt_info))
835 		return false;
836 	    }
837 	}
838     }
839   return true;
840 }
841 
842 
843 /* Function vect_slp_analyze_instance_dependence.
844 
845    Examine all the data references in the SLP instance INSTANCE, and make
846    sure there do not exist any data dependences between them that would
847    prevent vectorization.  Return true if that is the case.  */
848 
849 bool
850 vect_slp_analyze_instance_dependence (vec_info *vinfo, slp_instance instance)
851 {
852   DUMP_VECT_SCOPE ("vect_slp_analyze_instance_dependence");
853 
854   /* The stores of this instance are at the root of the SLP tree.  */
855   slp_tree store = NULL;
856   if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_store)
857     store = SLP_INSTANCE_TREE (instance);
858 
859   /* Verify we can sink stores to the vectorized stmt insert location.  */
860   stmt_vec_info last_store_info = NULL;
861   if (store)
862     {
863       if (! vect_slp_analyze_store_dependences (vinfo, store))
864 	return false;
865 
866       /* Mark stores in this instance and remember the last one.  */
867       last_store_info = vect_find_last_scalar_stmt_in_slp (store);
868       for (unsigned k = 0; k < SLP_TREE_SCALAR_STMTS (store).length (); ++k)
869 	gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k]->stmt, true);
870     }
871 
872   bool res = true;
873 
874   /* Verify we can sink loads to the vectorized stmt insert location,
875      special-casing stores of this instance.  */
876   for (slp_tree &load : SLP_INSTANCE_LOADS (instance))
877     if (! vect_slp_analyze_load_dependences (vinfo, load,
878 					     store
879 					     ? SLP_TREE_SCALAR_STMTS (store)
880 					     : vNULL, last_store_info))
881       {
882 	res = false;
883 	break;
884       }
885 
886   /* Unset the visited flag.  */
887   if (store)
888     for (unsigned k = 0; k < SLP_TREE_SCALAR_STMTS (store).length (); ++k)
889       gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k]->stmt, false);
890 
891   return res;
892 }
893 
894 /* Return the misalignment of DR_INFO accessed in VECTYPE with OFFSET
895    applied.  */
896 
897 int
898 dr_misalignment (dr_vec_info *dr_info, tree vectype, poly_int64 offset)
899 {
900   HOST_WIDE_INT diff = 0;
901   /* Alignment is only analyzed for the first element of a DR group,
902      use that but adjust misalignment by the offset of the access.  */
903   if (STMT_VINFO_GROUPED_ACCESS (dr_info->stmt))
904     {
905       dr_vec_info *first_dr
906 	= STMT_VINFO_DR_INFO (DR_GROUP_FIRST_ELEMENT (dr_info->stmt));
907       /* vect_analyze_data_ref_accesses guarantees that DR_INIT are
908 	 INTEGER_CSTs and the first element in the group has the lowest
909 	 address.  */
910       diff = (TREE_INT_CST_LOW (DR_INIT (dr_info->dr))
911 	      - TREE_INT_CST_LOW (DR_INIT (first_dr->dr)));
912       gcc_assert (diff >= 0);
913       dr_info = first_dr;
914     }
915 
916   int misalign = dr_info->misalignment;
917   gcc_assert (misalign != DR_MISALIGNMENT_UNINITIALIZED);
918   if (misalign == DR_MISALIGNMENT_UNKNOWN)
919     return misalign;
920 
921   /* If the access is only aligned for a vector type with smaller alignment
922      requirement the access has unknown misalignment.  */
923   if (maybe_lt (dr_info->target_alignment * BITS_PER_UNIT,
924 		targetm.vectorize.preferred_vector_alignment (vectype)))
925     return DR_MISALIGNMENT_UNKNOWN;
926 
927   /* Apply the offset from the DR group start and the externally supplied
928      offset which can for example result from a negative stride access.  */
929   poly_int64 misalignment = misalign + diff + offset;
930 
931   /* vect_compute_data_ref_alignment will have ensured that target_alignment
932      is constant and otherwise set misalign to DR_MISALIGNMENT_UNKNOWN.  */
933   unsigned HOST_WIDE_INT target_alignment_c
934     = dr_info->target_alignment.to_constant ();
935   if (!known_misalignment (misalignment, target_alignment_c, &misalign))
936     return DR_MISALIGNMENT_UNKNOWN;
937   return misalign;
938 }
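
/* For example, for a group of contiguous 4-byte accesses whose first
   element is misaligned by 4 bytes against a 16-byte target alignment,
   the third element of the group has a DR_INIT difference of 8 bytes,
   so dr_misalignment reports (4 + 8) % 16 == 12 for it (assuming a zero
   OFFSET).  */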
939 
940 /* Record the base alignment guarantee given by DRB, which occurs
941    in STMT_INFO.  */
942 
943 static void
944 vect_record_base_alignment (vec_info *vinfo, stmt_vec_info stmt_info,
945 			    innermost_loop_behavior *drb)
946 {
947   bool existed;
948   std::pair<stmt_vec_info, innermost_loop_behavior *> &entry
949     = vinfo->base_alignments.get_or_insert (drb->base_address, &existed);
950   if (!existed || entry.second->base_alignment < drb->base_alignment)
951     {
952       entry = std::make_pair (stmt_info, drb);
953       if (dump_enabled_p ())
954 	dump_printf_loc (MSG_NOTE, vect_location,
955 			 "recording new base alignment for %T\n"
956 			 "  alignment:    %d\n"
957 			 "  misalignment: %d\n"
958 			 "  based on:     %G",
959 			 drb->base_address,
960 			 drb->base_alignment,
961 			 drb->base_misalignment,
962 			 stmt_info->stmt);
963     }
964 }
965 
966 /* If the region we're going to vectorize is reached, all unconditional
967    data references occur at least once.  We can therefore pool the base
968    alignment guarantees from each unconditional reference.  Do this by
969    going through all the data references in VINFO and checking whether
970    the containing statement makes the reference unconditionally.  If so,
971    record the alignment of the base address in VINFO so that it can be
972    used for all other references with the same base.  */
973 
974 void
975 vect_record_base_alignments (vec_info *vinfo)
976 {
977   loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
978   class loop *loop = loop_vinfo ? LOOP_VINFO_LOOP (loop_vinfo) : NULL;
979   for (data_reference *dr : vinfo->shared->datarefs)
980     {
981       dr_vec_info *dr_info = vinfo->lookup_dr (dr);
982       stmt_vec_info stmt_info = dr_info->stmt;
983       if (!DR_IS_CONDITIONAL_IN_STMT (dr)
984 	  && STMT_VINFO_VECTORIZABLE (stmt_info)
985 	  && !STMT_VINFO_GATHER_SCATTER_P (stmt_info))
986 	{
987 	  vect_record_base_alignment (vinfo, stmt_info, &DR_INNERMOST (dr));
988 
989 	  /* If DR is nested in the loop that is being vectorized, we can also
990 	     record the alignment of the base wrt the outer loop.  */
991 	  if (loop && nested_in_vect_loop_p (loop, stmt_info))
992 	    vect_record_base_alignment
993 	      (vinfo, stmt_info, &STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info));
994 	}
995     }
996 }
997 
998 /* Function vect_compute_data_ref_alignment
999 
1000    Compute the misalignment of the data reference DR_INFO when vectorizing
1001    with VECTYPE.
1002 
1003    Output:
1004    1. initialized misalignment info for DR_INFO
1005 
1006    FOR NOW: No analysis is actually performed. Misalignment is calculated
1007    only for trivial cases. TODO.  */
1008 
1009 static void
1010 vect_compute_data_ref_alignment (vec_info *vinfo, dr_vec_info *dr_info,
1011 				 tree vectype)
1012 {
1013   stmt_vec_info stmt_info = dr_info->stmt;
1014   vec_base_alignments *base_alignments = &vinfo->base_alignments;
1015   loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
1016   class loop *loop = NULL;
1017   tree ref = DR_REF (dr_info->dr);
1018 
1019   if (dump_enabled_p ())
1020     dump_printf_loc (MSG_NOTE, vect_location,
1021                      "vect_compute_data_ref_alignment:\n");
1022 
1023   if (loop_vinfo)
1024     loop = LOOP_VINFO_LOOP (loop_vinfo);
1025 
1026   /* Initialize misalignment to unknown.  */
1027   SET_DR_MISALIGNMENT (dr_info, DR_MISALIGNMENT_UNKNOWN);
1028 
1029   if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
1030     return;
1031 
1032   innermost_loop_behavior *drb = vect_dr_behavior (vinfo, dr_info);
1033   bool step_preserves_misalignment_p;
1034 
1035   poly_uint64 vector_alignment
1036     = exact_div (targetm.vectorize.preferred_vector_alignment (vectype),
1037 		 BITS_PER_UNIT);
1038   SET_DR_TARGET_ALIGNMENT (dr_info, vector_alignment);
1039 
1040   /* If the main loop has peeled for alignment we have no way of knowing
1041      whether the data accesses in the epilogues are aligned.  We can't at
1042      compile time answer the question whether we have entered the main loop or
1043      not.  Fixes PR 92351.  */
1044   if (loop_vinfo)
1045     {
1046       loop_vec_info orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
1047       if (orig_loop_vinfo
1048 	  && LOOP_VINFO_PEELING_FOR_ALIGNMENT (orig_loop_vinfo) != 0)
1049 	return;
1050     }
1051 
1052   unsigned HOST_WIDE_INT vect_align_c;
1053   if (!vector_alignment.is_constant (&vect_align_c))
1054     return;
1055 
1056   /* No step for BB vectorization.  */
1057   if (!loop)
1058     {
1059       gcc_assert (integer_zerop (drb->step));
1060       step_preserves_misalignment_p = true;
1061     }
1062 
1063   /* In case the dataref is in an inner-loop of the loop that is being
1064      vectorized (LOOP), we use the base and misalignment information
1065      relative to the outer-loop (LOOP).  This is ok only if the misalignment
1066      stays the same throughout the execution of the inner-loop, which is why
1067      we have to check that the stride of the dataref in the inner-loop evenly
1068      divides by the vector alignment.  */
1069   else if (nested_in_vect_loop_p (loop, stmt_info))
1070     {
1071       step_preserves_misalignment_p
1072 	= (DR_STEP_ALIGNMENT (dr_info->dr) % vect_align_c) == 0;
1073 
1074       if (dump_enabled_p ())
1075 	{
1076 	  if (step_preserves_misalignment_p)
1077 	    dump_printf_loc (MSG_NOTE, vect_location,
1078 			     "inner step divides the vector alignment.\n");
1079 	  else
1080 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1081 			     "inner step doesn't divide the vector"
1082 			     " alignment.\n");
1083 	}
1084     }
1085 
1086   /* Similarly we can only use base and misalignment information relative to
1087      an innermost loop if the misalignment stays the same throughout the
1088      execution of the loop.  As above, this is the case if the stride of
1089      the dataref evenly divides by the alignment.  */
1090   else
1091     {
1092       poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1093       step_preserves_misalignment_p
1094 	= multiple_p (DR_STEP_ALIGNMENT (dr_info->dr) * vf, vect_align_c);
1095 
1096       if (!step_preserves_misalignment_p && dump_enabled_p ())
1097 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1098 			 "step doesn't divide the vector alignment.\n");
1099     }
1100 
1101   unsigned int base_alignment = drb->base_alignment;
1102   unsigned int base_misalignment = drb->base_misalignment;
1103 
1104   /* Calculate the maximum of the pooled base address alignment and the
1105      alignment that we can compute for DR itself.  */
1106   std::pair<stmt_vec_info, innermost_loop_behavior *> *entry
1107     = base_alignments->get (drb->base_address);
1108   if (entry
1109       && base_alignment < (*entry).second->base_alignment
1110       && (loop_vinfo
1111 	  || (dominated_by_p (CDI_DOMINATORS, gimple_bb (stmt_info->stmt),
1112 			      gimple_bb (entry->first->stmt))
1113 	      && (gimple_bb (stmt_info->stmt) != gimple_bb (entry->first->stmt)
1114 		  || (entry->first->dr_aux.group <= dr_info->group)))))
1115     {
1116       base_alignment = entry->second->base_alignment;
1117       base_misalignment = entry->second->base_misalignment;
1118     }
1119 
1120   if (drb->offset_alignment < vect_align_c
1121       || !step_preserves_misalignment_p
1122       /* We need to know whether the step wrt the vectorized loop is
1123 	 negative when computing the starting misalignment below.  */
1124       || TREE_CODE (drb->step) != INTEGER_CST)
1125     {
1126       if (dump_enabled_p ())
1127 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1128 			 "Unknown alignment for access: %T\n", ref);
1129       return;
1130     }
1131 
1132   if (base_alignment < vect_align_c)
1133     {
1134       unsigned int max_alignment;
1135       tree base = get_base_for_alignment (drb->base_address, &max_alignment);
1136       if (max_alignment < vect_align_c
1137 	  || !vect_can_force_dr_alignment_p (base,
1138 					     vect_align_c * BITS_PER_UNIT))
1139 	{
1140 	  if (dump_enabled_p ())
1141 	    dump_printf_loc (MSG_NOTE, vect_location,
1142 			     "can't force alignment of ref: %T\n", ref);
1143 	  return;
1144 	}
1145 
1146       /* Force the alignment of the decl.
1147 	 NOTE: This is the only change to the code we make during
1148 	 the analysis phase, before deciding to vectorize the loop.  */
1149       if (dump_enabled_p ())
1150 	dump_printf_loc (MSG_NOTE, vect_location,
1151 			 "force alignment of %T\n", ref);
1152 
1153       dr_info->base_decl = base;
1154       dr_info->base_misaligned = true;
1155       base_misalignment = 0;
1156     }
1157   poly_int64 misalignment
1158     = base_misalignment + wi::to_poly_offset (drb->init).force_shwi ();
1159 
1160   unsigned int const_misalignment;
1161   if (!known_misalignment (misalignment, vect_align_c, &const_misalignment))
1162     {
1163       if (dump_enabled_p ())
1164 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1165 			 "Non-constant misalignment for access: %T\n", ref);
1166       return;
1167     }
1168 
1169   SET_DR_MISALIGNMENT (dr_info, const_misalignment);
1170 
1171   if (dump_enabled_p ())
1172     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1173 		     "misalign = %d bytes of ref %T\n",
1174 		     const_misalignment, ref);
1175 
1176   return;
1177 }
1178 
1179 /* Return whether DR_INFO, which is related to DR_PEEL_INFO in
1180    that it only differs in DR_INIT, is aligned if DR_PEEL_INFO
1181    is made aligned via peeling.  */
1182 
1183 static bool
1184 vect_dr_aligned_if_related_peeled_dr_is (dr_vec_info *dr_info,
1185 					 dr_vec_info *dr_peel_info)
1186 {
1187   if (multiple_p (DR_TARGET_ALIGNMENT (dr_peel_info),
1188 		  DR_TARGET_ALIGNMENT (dr_info)))
1189     {
1190       poly_offset_int diff
1191 	= (wi::to_poly_offset (DR_INIT (dr_peel_info->dr))
1192 	   - wi::to_poly_offset (DR_INIT (dr_info->dr)));
1193       if (known_eq (diff, 0)
1194 	  || multiple_p (diff, DR_TARGET_ALIGNMENT (dr_info)))
1195 	return true;
1196     }
1197   return false;
1198 }
1199 
1200 /* Return whether DR_INFO is aligned if DR_PEEL_INFO is made
1201    aligned via peeling.  */
1202 
1203 static bool
1204 vect_dr_aligned_if_peeled_dr_is (dr_vec_info *dr_info,
1205 				 dr_vec_info *dr_peel_info)
1206 {
1207   if (!operand_equal_p (DR_BASE_ADDRESS (dr_info->dr),
1208 			DR_BASE_ADDRESS (dr_peel_info->dr), 0)
1209       || !operand_equal_p (DR_OFFSET (dr_info->dr),
1210 			   DR_OFFSET (dr_peel_info->dr), 0)
1211       || !operand_equal_p (DR_STEP (dr_info->dr),
1212 			   DR_STEP (dr_peel_info->dr), 0))
1213     return false;
1214 
1215   return vect_dr_aligned_if_related_peeled_dr_is (dr_info, dr_peel_info);
1216 }
1217 
1218 /* Compute the value for dr_info->misalign so that the access appears
1219    aligned.  This is used by peeling to compensate for dr_misalignment
1220    applying the offset for negative step.  */
1221 
1222 int
1223 vect_dr_misalign_for_aligned_access (dr_vec_info *dr_info)
1224 {
1225   if (tree_int_cst_sgn (DR_STEP (dr_info->dr)) >= 0)
1226     return 0;
1227 
1228   tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
1229   poly_int64 misalignment
1230     = ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
1231        * TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype))));
1232 
1233   unsigned HOST_WIDE_INT target_alignment_c;
1234   int misalign;
1235   if (!dr_info->target_alignment.is_constant (&target_alignment_c)
1236       || !known_misalignment (misalignment, target_alignment_c, &misalign))
1237     return DR_MISALIGNMENT_UNKNOWN;
1238   return misalign;
1239 }
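
/* For instance, for a negative-step V4SI access with a 16-byte target
   alignment, the offset applied by dr_misalignment is
   (4 - 1) * 4 == 12 bytes, so this function returns 12: that is the
   misalignment value for which such an access appears aligned.  */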
1240 
1241 /* Function vect_update_misalignment_for_peel.
1242    Sets DR_INFO's misalignment
1243    - to 0 if it has the same alignment as DR_PEEL_INFO,
1244    - to the misalignment computed using NPEEL if DR_INFO's misalignment is known,
1245    - to -1 (unknown) otherwise.
1246 
1247    DR_INFO - the data reference whose misalignment is to be adjusted.
1248    DR_PEEL_INFO - the data reference whose misalignment is being made
1249 		  zero in the vector loop by the peel.
1250    NPEEL - the number of iterations in the peel loop if the misalignment
1251            of DR_PEEL_INFO is known at compile time.  */
1252 
1253 static void
1254 vect_update_misalignment_for_peel (dr_vec_info *dr_info,
1255 				   dr_vec_info *dr_peel_info, int npeel)
1256 {
1257   /* If dr_info is aligned whenever dr_peel_info is, then mark it so.  */
1258   if (vect_dr_aligned_if_peeled_dr_is (dr_info, dr_peel_info))
1259     {
1260       SET_DR_MISALIGNMENT (dr_info,
1261 			   vect_dr_misalign_for_aligned_access (dr_peel_info));
1262       return;
1263     }
1264 
1265   unsigned HOST_WIDE_INT alignment;
1266   if (DR_TARGET_ALIGNMENT (dr_info).is_constant (&alignment)
1267       && known_alignment_for_access_p (dr_info,
1268 				       STMT_VINFO_VECTYPE (dr_info->stmt))
1269       && known_alignment_for_access_p (dr_peel_info,
1270 				       STMT_VINFO_VECTYPE (dr_peel_info->stmt)))
1271     {
1272       int misal = dr_info->misalignment;
1273       misal += npeel * TREE_INT_CST_LOW (DR_STEP (dr_info->dr));
1274       misal &= alignment - 1;
1275       set_dr_misalignment (dr_info, misal);
1276       return;
1277     }
1278 
1279   if (dump_enabled_p ())
1280     dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment " \
1281 		     "to unknown (-1).\n");
1282   SET_DR_MISALIGNMENT (dr_info, DR_MISALIGNMENT_UNKNOWN);
1283 }
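
/* For example, with a 16-byte target alignment, a 4-byte step and a known
   misalignment of 8 bytes, peeling NPEEL == 2 iterations updates the
   misalignment to (8 + 2 * 4) & 15 == 0.  */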
1284 
1285 /* Return true if alignment is relevant for DR_INFO.  */
1286 
1287 static bool
1288 vect_relevant_for_alignment_p (dr_vec_info *dr_info)
1289 {
1290   stmt_vec_info stmt_info = dr_info->stmt;
1291 
1292   if (!STMT_VINFO_RELEVANT_P (stmt_info))
1293     return false;
1294 
1295   /* For interleaving, only the alignment of the first access matters.  */
1296   if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1297       && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info)
1298     return false;
1299 
1300   /* Scatter-gather and invariant accesses continue to address individual
1301      scalars, so vector-level alignment is irrelevant.  */
1302   if (STMT_VINFO_GATHER_SCATTER_P (stmt_info)
1303       || integer_zerop (DR_STEP (dr_info->dr)))
1304     return false;
1305 
1306   /* Strided accesses perform only component accesses, alignment is
1307      irrelevant for them.  */
1308   if (STMT_VINFO_STRIDED_P (stmt_info)
1309       && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1310     return false;
1311 
1312   return true;
1313 }
1314 
1315 /* Given a memory reference EXP, return whether its alignment is less
1316    than its size.  */
1317 
1318 static bool
1319 not_size_aligned (tree exp)
1320 {
1321   if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp))))
1322     return true;
1323 
1324   return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp)))
1325 	  > get_object_alignment (exp));
1326 }
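
/* For example, an "int" member of a struct declared with
   __attribute__((packed)) typically has an object alignment of just one
   byte, so its 32-bit size exceeds its alignment and the reference is
   considered not size-aligned.  */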
1327 
1328 /* Function vector_alignment_reachable_p
1329 
1330    Return true if vector alignment for DR_INFO is reachable by peeling
1331    a few loop iterations.  Return false otherwise.  */
1332 
1333 static bool
1334 vector_alignment_reachable_p (dr_vec_info *dr_info)
1335 {
1336   stmt_vec_info stmt_info = dr_info->stmt;
1337   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1338 
1339   if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1340     {
1341       /* For interleaved access we peel only if number of iterations in
1342 	 the prolog loop ({VF - misalignment}), is a multiple of the
1343 	 number of the interleaved accesses.  */
1344       int elem_size, mis_in_elements;
1345 
1346       /* FORNOW: handle only known alignment.  */
1347       if (!known_alignment_for_access_p (dr_info, vectype))
1348 	return false;
1349 
1350       poly_uint64 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1351       poly_uint64 vector_size = GET_MODE_SIZE (TYPE_MODE (vectype));
1352       elem_size = vector_element_size (vector_size, nelements);
1353       mis_in_elements = dr_misalignment (dr_info, vectype) / elem_size;
1354 
1355       if (!multiple_p (nelements - mis_in_elements, DR_GROUP_SIZE (stmt_info)))
1356 	return false;
1357     }
1358 
1359   /* If misalignment is known at the compile time then allow peeling
1360      only if natural alignment is reachable through peeling.  */
1361   if (known_alignment_for_access_p (dr_info, vectype)
1362       && !aligned_access_p (dr_info, vectype))
1363     {
1364       HOST_WIDE_INT elmsize =
1365 		int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1366       if (dump_enabled_p ())
1367 	{
1368 	  dump_printf_loc (MSG_NOTE, vect_location,
1369 	                   "data size = %wd. misalignment = %d.\n", elmsize,
1370 			   dr_misalignment (dr_info, vectype));
1371 	}
1372       if (dr_misalignment (dr_info, vectype) % elmsize)
1373 	{
1374 	  if (dump_enabled_p ())
1375 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1376 	                     "data size does not divide the misalignment.\n");
1377 	  return false;
1378 	}
1379     }
1380 
1381   if (!known_alignment_for_access_p (dr_info, vectype))
1382     {
1383       tree type = TREE_TYPE (DR_REF (dr_info->dr));
1384       bool is_packed = not_size_aligned (DR_REF (dr_info->dr));
1385       if (dump_enabled_p ())
1386 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1387 	                 "Unknown misalignment, %snaturally aligned\n",
1388 			 is_packed ? "not " : "");
1389       return targetm.vectorize.vector_alignment_reachable (type, is_packed);
1390     }
1391 
1392   return true;
1393 }
1394 
1395 
1396 /* Calculate the cost of the memory access represented by DR_INFO.  */
1397 
1398 static void
1399 vect_get_data_access_cost (vec_info *vinfo, dr_vec_info *dr_info,
1400 			   dr_alignment_support alignment_support_scheme,
1401 			   int misalignment,
1402 			   unsigned int *inside_cost,
1403                            unsigned int *outside_cost,
1404 			   stmt_vector_for_cost *body_cost_vec,
1405 			   stmt_vector_for_cost *prologue_cost_vec)
1406 {
1407   stmt_vec_info stmt_info = dr_info->stmt;
1408   loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
1409   int ncopies;
1410 
1411   if (PURE_SLP_STMT (stmt_info))
1412     ncopies = 1;
1413   else
1414     ncopies = vect_get_num_copies (loop_vinfo, STMT_VINFO_VECTYPE (stmt_info));
1415 
1416   if (DR_IS_READ (dr_info->dr))
1417     vect_get_load_cost (vinfo, stmt_info, ncopies, alignment_support_scheme,
1418 			misalignment, true, inside_cost,
1419 			outside_cost, prologue_cost_vec, body_cost_vec, false);
1420   else
1421     vect_get_store_cost (vinfo, stmt_info, ncopies, alignment_support_scheme,
1422 			 misalignment, inside_cost, body_cost_vec);
1423 
1424   if (dump_enabled_p ())
1425     dump_printf_loc (MSG_NOTE, vect_location,
1426                      "vect_get_data_access_cost: inside_cost = %d, "
1427                      "outside_cost = %d.\n", *inside_cost, *outside_cost);
1428 }
1429 
1430 
1431 typedef struct _vect_peel_info
1432 {
1433   dr_vec_info *dr_info;
1434   int npeel;
1435   unsigned int count;
1436 } *vect_peel_info;
1437 
1438 typedef struct _vect_peel_extended_info
1439 {
1440   vec_info *vinfo;
1441   struct _vect_peel_info peel_info;
1442   unsigned int inside_cost;
1443   unsigned int outside_cost;
1444 } *vect_peel_extended_info;
1445 
1446 
1447 /* Peeling hashtable helpers.  */
1448 
1449 struct peel_info_hasher : free_ptr_hash <_vect_peel_info>
1450 {
1451   static inline hashval_t hash (const _vect_peel_info *);
1452   static inline bool equal (const _vect_peel_info *, const _vect_peel_info *);
1453 };
1454 
1455 inline hashval_t
1456 peel_info_hasher::hash (const _vect_peel_info *peel_info)
1457 {
1458   return (hashval_t) peel_info->npeel;
1459 }
1460 
1461 inline bool
1462 peel_info_hasher::equal (const _vect_peel_info *a, const _vect_peel_info *b)
1463 {
1464   return (a->npeel == b->npeel);
1465 }
1466 
1467 
1468 /* Insert DR_INFO into peeling hash table with NPEEL as key.  */
1469 
1470 static void
1471 vect_peeling_hash_insert (hash_table<peel_info_hasher> *peeling_htab,
1472 			  loop_vec_info loop_vinfo, dr_vec_info *dr_info,
1473 			  int npeel, bool supportable_if_not_aligned)
1474 {
1475   struct _vect_peel_info elem, *slot;
1476   _vect_peel_info **new_slot;
1477 
1478   elem.npeel = npeel;
1479   slot = peeling_htab->find (&elem);
1480   if (slot)
1481     slot->count++;
1482   else
1483     {
1484       slot = XNEW (struct _vect_peel_info);
1485       slot->npeel = npeel;
1486       slot->dr_info = dr_info;
1487       slot->count = 1;
1488       new_slot = peeling_htab->find_slot (slot, INSERT);
1489       *new_slot = slot;
1490     }
1491 
1492   /* If this DR is not supported with unknown misalignment then bias
1493      this slot when the cost model is disabled.  */
1494   if (!supportable_if_not_aligned
1495       && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1496     slot->count += VECT_MAX_COST;
1497 }
1498 
1499 
1500 /* Traverse the peeling hash table to find the peeling option that aligns
1501    the maximum number of data accesses.  */
1502 
1503 int
1504 vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
1505 				     _vect_peel_extended_info *max)
1506 {
1507   vect_peel_info elem = *slot;
1508 
1509   if (elem->count > max->peel_info.count
1510       || (elem->count == max->peel_info.count
1511           && max->peel_info.npeel > elem->npeel))
1512     {
1513       max->peel_info.npeel = elem->npeel;
1514       max->peel_info.count = elem->count;
1515       max->peel_info.dr_info = elem->dr_info;
1516     }
1517 
1518   return 1;
1519 }
1520 
1521 /* Get the costs of peeling NPEEL iterations for LOOP_VINFO, checking
1522    data access costs for all data refs.  If DR0_INFO's misalignment is
1523    unknown, NPEEL is only a compile-time estimate of the runtime peel count,
1524    but DR0_INFO's misalignment is still treated as zero after peeling.  */
1525 
1526 static void
1527 vect_get_peeling_costs_all_drs (loop_vec_info loop_vinfo,
1528 				dr_vec_info *dr0_info,
1529 				unsigned int *inside_cost,
1530 				unsigned int *outside_cost,
1531 				stmt_vector_for_cost *body_cost_vec,
1532 				stmt_vector_for_cost *prologue_cost_vec,
1533 				unsigned int npeel)
1534 {
1535   vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1536 
1537   bool dr0_alignment_known_p
1538     = (dr0_info
1539        && known_alignment_for_access_p (dr0_info,
1540 					STMT_VINFO_VECTYPE (dr0_info->stmt)));
1541 
1542   for (data_reference *dr : datarefs)
1543     {
1544       dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1545       if (!vect_relevant_for_alignment_p (dr_info))
1546 	continue;
1547 
1548       tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
1549       dr_alignment_support alignment_support_scheme;
1550       int misalignment;
1551       unsigned HOST_WIDE_INT alignment;
1552 
1553       bool negative = tree_int_cst_compare (DR_STEP (dr_info->dr),
1554 					    size_zero_node) < 0;
1555       poly_int64 off = 0;
1556       if (negative)
1557 	off = ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
1558 	       * -TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype))));
1559 
1560       if (npeel == 0)
1561 	misalignment = dr_misalignment (dr_info, vectype, off);
1562       else if (dr_info == dr0_info
1563 	       || vect_dr_aligned_if_peeled_dr_is (dr_info, dr0_info))
1564 	misalignment = 0;
1565       else if (!dr0_alignment_known_p
1566 	       || !known_alignment_for_access_p (dr_info, vectype)
1567 	       || !DR_TARGET_ALIGNMENT (dr_info).is_constant (&alignment))
1568 	misalignment = DR_MISALIGNMENT_UNKNOWN;
1569       else
1570 	{
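	  /* E.g. with a 16-byte target alignment, a 4-byte step and a current
	     misalignment of 8 bytes, peeling NPEEL = 2 iterations gives
	     8 + 2 * 4 = 16, which masked by 15 is a misalignment of 0.  */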
1571 	  misalignment = dr_misalignment (dr_info, vectype, off);
1572 	  misalignment += npeel * TREE_INT_CST_LOW (DR_STEP (dr_info->dr));
1573 	  misalignment &= alignment - 1;
1574 	}
1575       alignment_support_scheme
1576 	= vect_supportable_dr_alignment (loop_vinfo, dr_info, vectype,
1577 					 misalignment);
1578 
1579       vect_get_data_access_cost (loop_vinfo, dr_info,
1580 				 alignment_support_scheme, misalignment,
1581 				 inside_cost, outside_cost,
1582 				 body_cost_vec, prologue_cost_vec);
1583     }
1584 }
1585 
1586 /* Traverse the peeling hash table and calculate the cost of each peeling
1587    option.  Find the one with the lowest cost.  */
1588 
1589 int
1590 vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
1591 				   _vect_peel_extended_info *min)
1592 {
1593   vect_peel_info elem = *slot;
1594   int dummy;
1595   unsigned int inside_cost = 0, outside_cost = 0;
1596   loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (min->vinfo);
1597   stmt_vector_for_cost prologue_cost_vec, body_cost_vec,
1598 		       epilogue_cost_vec;
1599 
1600   prologue_cost_vec.create (2);
1601   body_cost_vec.create (2);
1602   epilogue_cost_vec.create (2);
1603 
1604   vect_get_peeling_costs_all_drs (loop_vinfo, elem->dr_info, &inside_cost,
1605 				  &outside_cost, &body_cost_vec,
1606 				  &prologue_cost_vec, elem->npeel);
1607 
1608   body_cost_vec.release ();
1609 
1610   outside_cost += vect_get_known_peeling_cost
1611     (loop_vinfo, elem->npeel, &dummy,
1612      &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1613      &prologue_cost_vec, &epilogue_cost_vec);
1614 
1615   /* Prologue and epilogue costs are added to the target model later.
1616      These costs depend only on the scalar iteration cost, the
1617      number of peeling iterations finally chosen, and the number of
1618      misaligned statements.  So discard the information found here.  */
1619   prologue_cost_vec.release ();
1620   epilogue_cost_vec.release ();
1621 
1622   if (inside_cost < min->inside_cost
1623       || (inside_cost == min->inside_cost
1624 	  && outside_cost < min->outside_cost))
1625     {
1626       min->inside_cost = inside_cost;
1627       min->outside_cost = outside_cost;
1628       min->peel_info.dr_info = elem->dr_info;
1629       min->peel_info.npeel = elem->npeel;
1630       min->peel_info.count = elem->count;
1631     }
1632 
1633   return 1;
1634 }
1635 
1636 
1637 /* Choose best peeling option by traversing peeling hash table and either
1638    choosing an option with the lowest cost (if cost model is enabled) or the
1639    option that aligns as many accesses as possible.  */
1640 
1641 static struct _vect_peel_extended_info
1642 vect_peeling_hash_choose_best_peeling (hash_table<peel_info_hasher> *peeling_htab,
1643 				       loop_vec_info loop_vinfo)
1644 {
1645    struct _vect_peel_extended_info res;
1646 
1647    res.peel_info.dr_info = NULL;
1648    res.vinfo = loop_vinfo;
1649 
1650    if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1651      {
1652        res.inside_cost = INT_MAX;
1653        res.outside_cost = INT_MAX;
1654        peeling_htab->traverse <_vect_peel_extended_info *,
1655 	   		       vect_peeling_hash_get_lowest_cost> (&res);
1656      }
1657    else
1658      {
1659        res.peel_info.count = 0;
1660        peeling_htab->traverse <_vect_peel_extended_info *,
1661 	   		       vect_peeling_hash_get_most_frequent> (&res);
1662        res.inside_cost = 0;
1663        res.outside_cost = 0;
1664      }
1665 
1666    return res;
1667 }
1668 
1669 /* Return true if peeling NPEEL iterations for DR0_INFO is supported.  */
1670 
1671 static bool
1672 vect_peeling_supportable (loop_vec_info loop_vinfo, dr_vec_info *dr0_info,
1673 			  unsigned npeel)
1674 {
1675   vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1676   enum dr_alignment_support supportable_dr_alignment;
1677 
1678   bool dr0_alignment_known_p
1679     = known_alignment_for_access_p (dr0_info,
1680 				    STMT_VINFO_VECTYPE (dr0_info->stmt));
1681 
1682   /* Ensure that all data refs can be vectorized after the peel.  */
1683   for (data_reference *dr : datarefs)
1684     {
1685       if (dr == dr0_info->dr)
1686 	continue;
1687 
1688       dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1689       if (!vect_relevant_for_alignment_p (dr_info)
1690 	  || vect_dr_aligned_if_peeled_dr_is (dr_info, dr0_info))
1691 	continue;
1692 
1693       tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
1694       int misalignment;
1695       unsigned HOST_WIDE_INT alignment;
1696       if (!dr0_alignment_known_p
1697 	  || !known_alignment_for_access_p (dr_info, vectype)
1698 	  || !DR_TARGET_ALIGNMENT (dr_info).is_constant (&alignment))
1699 	misalignment = DR_MISALIGNMENT_UNKNOWN;
1700       else
1701 	{
1702 	  misalignment = dr_misalignment (dr_info, vectype);
1703 	  misalignment += npeel * TREE_INT_CST_LOW (DR_STEP (dr_info->dr));
1704 	  misalignment &= alignment - 1;
1705 	}
1706       supportable_dr_alignment
1707 	= vect_supportable_dr_alignment (loop_vinfo, dr_info, vectype,
1708 					 misalignment);
1709       if (supportable_dr_alignment == dr_unaligned_unsupported)
1710 	return false;
1711     }
1712 
1713   return true;
1714 }
1715 
1716 /* Compare two data-references DRA and DRB to group them into chunks
1717    with related alignment.  */
1718 
1719 static int
1720 dr_align_group_sort_cmp (const void *dra_, const void *drb_)
1721 {
1722   data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_);
1723   data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_);
1724   int cmp;
1725 
1726   /* Stabilize sort.  */
1727   if (dra == drb)
1728     return 0;
1729 
1730   /* Ordering of DRs according to base.  */
1731   cmp = data_ref_compare_tree (DR_BASE_ADDRESS (dra),
1732 			       DR_BASE_ADDRESS (drb));
1733   if (cmp != 0)
1734     return cmp;
1735 
1736   /* And according to DR_OFFSET.  */
1737   cmp = data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
1738   if (cmp != 0)
1739     return cmp;
1740 
1741   /* And according to DR_STEP.  */
1742   cmp = data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb));
1743   if (cmp != 0)
1744     return cmp;
1745 
1746   /* Then sort according to DR_INIT.  In case of identical DRs sort by stmt UID.  */
1747   cmp = data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb));
1748   if (cmp == 0)
1749     return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
1750   return cmp;
1751 }
1752 
1753 /* Function vect_enhance_data_refs_alignment
1754 
1755    This pass will use loop versioning and loop peeling in order to enhance
1756    the alignment of data references in the loop.
1757 
1758    FOR NOW: we assume that whatever versioning/peeling takes place, only the
1759    original loop is to be vectorized.  Any other loops that are created by
1760    the transformations performed in this pass are not supposed to be
1761    vectorized.  This restriction will be relaxed.
1762 
1763    This pass will require a cost model to guide it on whether to apply peeling
1764    or versioning or a combination of the two.  For example, the scheme that
1765    Intel uses when given a loop with several memory accesses is as follows:
1766    choose one memory access ('p') whose alignment you want to force by doing
1767    peeling.  Then, either (1) generate a loop in which 'p' is aligned and all
1768    other accesses are not necessarily aligned, or (2) use loop versioning to
1769    generate one loop in which all accesses are aligned, and another loop in
1770    which only 'p' is necessarily aligned.
1771 
1772    ("Automatic Intra-Register Vectorization for the Intel Architecture",
1773    Aart J.C. Bik, Milind Girkar, Paul M. Grey and Xinmin Tian, International
1774    Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
1775 
1776    Devising a cost model is the most critical aspect of this work.  It will
1777    guide us on which access to peel for, whether to use loop versioning, how
1778    many versions to create, etc.  The cost model will probably consist of
1779    generic considerations as well as target specific considerations (on
1780    powerpc for example, misaligned stores are more painful than misaligned
1781    loads).
1782 
1783    Here are the general steps involved in alignment enhancements:
1784 
1785      -- original loop, before alignment analysis:
1786 	for (i=0; i<N; i++){
1787 	  x = q[i];			# DR_MISALIGNMENT(q) = unknown
1788 	  p[i] = y;			# DR_MISALIGNMENT(p) = unknown
1789 	}
1790 
1791      -- After vect_compute_data_refs_alignment:
1792 	for (i=0; i<N; i++){
1793 	  x = q[i];			# DR_MISALIGNMENT(q) = 3
1794 	  p[i] = y;			# DR_MISALIGNMENT(p) = unknown
1795 	}
1796 
1797      -- Possibility 1: we do loop versioning:
1798      if (p is aligned) {
1799 	for (i=0; i<N; i++){	# loop 1A
1800 	  x = q[i];			# DR_MISALIGNMENT(q) = 3
1801 	  p[i] = y;			# DR_MISALIGNMENT(p) = 0
1802 	}
1803      }
1804      else {
1805 	for (i=0; i<N; i++){	# loop 1B
1806 	  x = q[i];			# DR_MISALIGNMENT(q) = 3
1807 	  p[i] = y;			# DR_MISALIGNMENT(p) = unaligned
1808 	}
1809      }
1810 
1811      -- Possibility 2: we do loop peeling:
1812      for (i = 0; i < 3; i++){	# (scalar loop, not to be vectorized).
1813 	x = q[i];
1814 	p[i] = y;
1815      }
1816      for (i = 3; i < N; i++){	# loop 2A
1817 	x = q[i];			# DR_MISALIGNMENT(q) = 0
1818 	p[i] = y;			# DR_MISALIGNMENT(p) = unknown
1819      }
1820 
1821      -- Possibility 3: combination of loop peeling and versioning:
1822      for (i = 0; i < 3; i++){	# (scalar loop, not to be vectorized).
1823 	x = q[i];
1824 	p[i] = y;
1825      }
1826      if (p is aligned) {
1827 	for (i = 3; i<N; i++){	# loop 3A
1828 	  x = q[i];			# DR_MISALIGNMENT(q) = 0
1829 	  p[i] = y;			# DR_MISALIGNMENT(p) = 0
1830 	}
1831      }
1832      else {
1833 	for (i = 3; i<N; i++){	# loop 3B
1834 	  x = q[i];			# DR_MISALIGNMENT(q) = 0
1835 	  p[i] = y;			# DR_MISALIGNMENT(p) = unaligned
1836 	}
1837      }
1838 
1839      These loops are later passed to loop_transform to be vectorized.  The
1840      vectorizer will use the alignment information to guide the transformation
1841      (whether to generate regular loads/stores, or with special handling for
1842      misalignment).  */
1843 
1844 opt_result
1845 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
1846 {
1847   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1848   dr_vec_info *first_store = NULL;
1849   dr_vec_info *dr0_info = NULL;
1850   struct data_reference *dr;
1851   unsigned int i;
1852   bool do_peeling = false;
1853   bool do_versioning = false;
1854   unsigned int npeel = 0;
1855   bool one_misalignment_known = false;
1856   bool one_misalignment_unknown = false;
1857   bool one_dr_unsupportable = false;
1858   dr_vec_info *unsupportable_dr_info = NULL;
1859   unsigned int dr0_same_align_drs = 0, first_store_same_align_drs = 0;
1860   hash_table<peel_info_hasher> peeling_htab (1);
1861 
1862   DUMP_VECT_SCOPE ("vect_enhance_data_refs_alignment");
1863 
1864   /* Reset data so we can safely be called multiple times.  */
1865   LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
1866   LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = 0;
1867 
1868   if (LOOP_VINFO_DATAREFS (loop_vinfo).is_empty ())
1869     return opt_result::success ();
1870 
1871   /* Sort the vector of datarefs so DRs that have the same or dependent
1872      alignment are next to each other.  */
1873   auto_vec<data_reference_p> datarefs
1874     = LOOP_VINFO_DATAREFS (loop_vinfo).copy ();
1875   datarefs.qsort (dr_align_group_sort_cmp);
1876 
1877   /* For each dataref, compute the number of other DRs that become aligned
1878      when we peel to align that dataref.  */
1879   auto_vec<unsigned> n_same_align_refs (datarefs.length ());
1880   n_same_align_refs.quick_grow_cleared (datarefs.length ());
1881   unsigned i0;
1882   for (i0 = 0; i0 < datarefs.length (); ++i0)
1883     if (DR_BASE_ADDRESS (datarefs[i0]))
1884       break;
1885   for (i = i0 + 1; i <= datarefs.length (); ++i)
1886     {
1887       if (i == datarefs.length ()
1888 	  || !operand_equal_p (DR_BASE_ADDRESS (datarefs[i0]),
1889 			       DR_BASE_ADDRESS (datarefs[i]), 0)
1890 	  || !operand_equal_p (DR_OFFSET (datarefs[i0]),
1891 			       DR_OFFSET (datarefs[i]), 0)
1892 	  || !operand_equal_p (DR_STEP (datarefs[i0]),
1893 			       DR_STEP (datarefs[i]), 0))
1894 	{
1895 	  /* The subgroup [i0, i-1] now only differs in DR_INIT and
1896 	     possibly DR_TARGET_ALIGNMENT.  Still the whole subgroup
1897 	     will get known misalignment if we align one of the refs
1898 	     with the largest DR_TARGET_ALIGNMENT.  */
1899 	  for (unsigned j = i0; j < i; ++j)
1900 	    {
1901 	      dr_vec_info *dr_infoj = loop_vinfo->lookup_dr (datarefs[j]);
1902 	      for (unsigned k = i0; k < i; ++k)
1903 		{
1904 		  if (k == j)
1905 		    continue;
1906 		  dr_vec_info *dr_infok = loop_vinfo->lookup_dr (datarefs[k]);
1907 		  if (vect_dr_aligned_if_related_peeled_dr_is (dr_infok,
1908 							       dr_infoj))
1909 		    n_same_align_refs[j]++;
1910 		}
1911 	    }
1912 	  i0 = i;
1913 	}
1914     }
1915 
1916   /* While cost model enhancements are expected in the future, the high level
1917      view of the code at this time is as follows:
1918 
1919      A) If there is a misaligned access then see if peeling to align
1920         this access can make all data references satisfy
1921         vect_supportable_dr_alignment.  If so, update data structures
1922         as needed and return true.
1923 
1924      B) If peeling wasn't possible and there is a data reference with an
1925         unknown misalignment that does not satisfy vect_supportable_dr_alignment
1926         then see if loop versioning checks can be used to make all data
1927         references satisfy vect_supportable_dr_alignment.  If so, update
1928         data structures as needed and return true.
1929 
1930      C) If neither peeling nor versioning were successful then return false if
1931         any data reference does not satisfy vect_supportable_dr_alignment.
1932 
1933      D) Return true (all data references satisfy vect_supportable_dr_alignment).
1934 
1935      Note, Possibility 3 above (which is peeling and versioning together) is not
1936      being done at this time.  */
1937 
1938   /* (1) Peeling to force alignment.  */
1939 
1940   /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
1941      Considerations:
1942      + How many accesses will become aligned due to the peeling
1943      - How many accesses will become unaligned due to the peeling,
1944        and the cost of misaligned accesses.
1945      - The cost of peeling (the extra runtime checks, the increase
1946        in code size).  */
1947 
1948   FOR_EACH_VEC_ELT (datarefs, i, dr)
1949     {
1950       dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1951       if (!vect_relevant_for_alignment_p (dr_info))
1952 	continue;
1953 
1954       stmt_vec_info stmt_info = dr_info->stmt;
1955       tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1956       do_peeling = vector_alignment_reachable_p (dr_info);
1957       if (do_peeling)
1958         {
1959 	  if (known_alignment_for_access_p (dr_info, vectype))
1960             {
1961 	      unsigned int npeel_tmp = 0;
1962 	      bool negative = tree_int_cst_compare (DR_STEP (dr),
1963 						    size_zero_node) < 0;
1964 
1965 	      /* If known_alignment_for_access_p then we have set
1966 	         DR_MISALIGNMENT, which is only done if we know it at compile
1967 	         time, so it is safe to assume the target alignment is constant.
1968 	       */
1969 	      unsigned int target_align =
1970 		DR_TARGET_ALIGNMENT (dr_info).to_constant ();
1971 	      unsigned HOST_WIDE_INT dr_size = vect_get_scalar_dr_size (dr_info);
1972 	      poly_int64 off = 0;
1973 	      if (negative)
1974 		off = (TYPE_VECTOR_SUBPARTS (vectype) - 1) * -dr_size;
1975 	      unsigned int mis = dr_misalignment (dr_info, vectype, off);
1976 	      mis = negative ? mis : -mis;
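	      /* MIS is negated for a forward step so that the computation
		 below yields the number of elements needed to reach the next
		 target-alignment boundary; e.g. with a misalignment of 8
		 bytes, a 16-byte target alignment and 4-byte elements,
		 (-8 & 15) / 4 = 2 scalar iterations have to be peeled.  */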
1977 	      if (mis != 0)
1978 		npeel_tmp = (mis & (target_align - 1)) / dr_size;
1979 
1980               /* For multiple types, it is possible that the bigger type access
1981                  will have more than one peeling option.  E.g., a loop with two
1982                  types: one of size (vector size / 4), and the other one of
1983                  size (vector size / 8).  The vectorization factor will be 8.
1984                  If both accesses are misaligned by 3, the first one needs one
1985                  scalar iteration to be aligned, and the second one needs 5.
1986                  But the first one will also be aligned by peeling 5 scalar
1987                  iterations, and in that case both accesses will be aligned.
1988                  Hence, except for the immediate peeling amount, we also want
1989                  to try adding the full vector size, as long as we don't exceed
1990                  the vectorization factor.
1991                  We do this automatically for the cost model, since we calculate
1992                  the cost for every peeling option.  */
1993 	      poly_uint64 nscalars = npeel_tmp;
1994               if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1995 		{
1996 		  poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1997 		  nscalars = (STMT_SLP_TYPE (stmt_info)
1998 			      ? vf * DR_GROUP_SIZE (stmt_info) : vf);
1999 		}
2000 
2001 	      /* Save info about DR in the hash table.  Also include peeling
2002 		 amounts according to the explanation above.  Indicate
2003 		 the alignment status when the ref is not aligned.
2004 		 ???  Rather than using unknown alignment here we should
2005 		 prune all entries from the peeling hashtable which cause
2006 		 DRs to be not supported.  */
2007 	      bool supportable_if_not_aligned
2008 		= vect_supportable_dr_alignment
2009 		    (loop_vinfo, dr_info, vectype, DR_MISALIGNMENT_UNKNOWN);
2010 	      while (known_le (npeel_tmp, nscalars))
2011                 {
2012                   vect_peeling_hash_insert (&peeling_htab, loop_vinfo,
2013 					    dr_info, npeel_tmp,
2014 					    supportable_if_not_aligned);
2015 		  npeel_tmp += MAX (1, target_align / dr_size);
2016                 }
2017 
2018 	      one_misalignment_known = true;
2019             }
2020           else
2021             {
2022               /* If we don't know any misalignment values, we prefer
2023                  peeling for the data-ref that has the maximum number of data-refs
2024                  with the same alignment, unless the target prefers to align
2025                  stores over loads.  */
2026 	      unsigned same_align_drs = n_same_align_refs[i];
2027 	      if (!dr0_info
2028 		  || dr0_same_align_drs < same_align_drs)
2029 		{
2030 		  dr0_same_align_drs = same_align_drs;
2031 		  dr0_info = dr_info;
2032 		}
2033 	      /* For data-refs with the same number of related
2034 		 accesses prefer the one where the misalignment
2035 		 computation will be invariant in the outermost loop.  */
2036 	      else if (dr0_same_align_drs == same_align_drs)
2037 		{
2038 		  class loop *ivloop0, *ivloop;
2039 		  ivloop0 = outermost_invariant_loop_for_expr
2040 		    (loop, DR_BASE_ADDRESS (dr0_info->dr));
2041 		  ivloop = outermost_invariant_loop_for_expr
2042 		    (loop, DR_BASE_ADDRESS (dr));
2043 		  if ((ivloop && !ivloop0)
2044 		      || (ivloop && ivloop0
2045 			  && flow_loop_nested_p (ivloop, ivloop0)))
2046 		    dr0_info = dr_info;
2047 		}
2048 
2049 	      one_misalignment_unknown = true;
2050 
2051 	      /* Check for data refs with unsupportable alignment that can be
2052 	         aligned by peeling.  */
2053 	      enum dr_alignment_support supportable_dr_alignment
2054 		= vect_supportable_dr_alignment (loop_vinfo, dr_info, vectype,
2055 						 DR_MISALIGNMENT_UNKNOWN);
2056 	      if (supportable_dr_alignment == dr_unaligned_unsupported)
2057 		{
2058 		  one_dr_unsupportable = true;
2059 		  unsupportable_dr_info = dr_info;
2060 		}
2061 
2062 	      if (!first_store && DR_IS_WRITE (dr))
2063 		{
2064 		  first_store = dr_info;
2065 		  first_store_same_align_drs = same_align_drs;
2066 		}
2067             }
2068         }
2069       else
2070         {
2071 	  if (!aligned_access_p (dr_info, vectype))
2072             {
2073               if (dump_enabled_p ())
2074                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2075                                  "vector alignment may not be reachable\n");
2076               break;
2077             }
2078         }
2079     }
2080 
2081   /* Check if we can possibly peel the loop.  */
2082   if (!vect_can_advance_ivs_p (loop_vinfo)
2083       || !slpeel_can_duplicate_loop_p (loop, single_exit (loop))
2084       || loop->inner)
2085     do_peeling = false;
2086 
2087   struct _vect_peel_extended_info peel_for_known_alignment;
2088   struct _vect_peel_extended_info peel_for_unknown_alignment;
2089   struct _vect_peel_extended_info best_peel;
2090 
2091   peel_for_unknown_alignment.inside_cost = INT_MAX;
2092   peel_for_unknown_alignment.outside_cost = INT_MAX;
2093   peel_for_unknown_alignment.peel_info.count = 0;
2094 
2095   if (do_peeling
2096       && one_misalignment_unknown)
2097     {
2098       /* Check if the target prefers stores over loads, i.e., if misaligned
2099          stores are more expensive than misaligned loads (taking DRs with the
2100          same alignment into account).  */
2101       unsigned int load_inside_cost = 0;
2102       unsigned int load_outside_cost = 0;
2103       unsigned int store_inside_cost = 0;
2104       unsigned int store_outside_cost = 0;
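      /* With unknown misalignment the number of peeled iterations is only
	 known at runtime and can be anywhere from 0 to VF - 1, so cost the
	 average case of VF / 2 iterations.  */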
2105       unsigned int estimated_npeels = vect_vf_for_cost (loop_vinfo) / 2;
2106 
2107       stmt_vector_for_cost dummy;
2108       dummy.create (2);
2109       vect_get_peeling_costs_all_drs (loop_vinfo, dr0_info,
2110 				      &load_inside_cost,
2111 				      &load_outside_cost,
2112 				      &dummy, &dummy, estimated_npeels);
2113       dummy.release ();
2114 
2115       if (first_store)
2116 	{
2117 	  dummy.create (2);
2118 	  vect_get_peeling_costs_all_drs (loop_vinfo, first_store,
2119 					  &store_inside_cost,
2120 					  &store_outside_cost,
2121 					  &dummy, &dummy,
2122 					  estimated_npeels);
2123 	  dummy.release ();
2124 	}
2125       else
2126 	{
2127 	  store_inside_cost = INT_MAX;
2128 	  store_outside_cost = INT_MAX;
2129 	}
2130 
2131       if (load_inside_cost > store_inside_cost
2132 	  || (load_inside_cost == store_inside_cost
2133 	      && load_outside_cost > store_outside_cost))
2134 	{
2135 	  dr0_info = first_store;
2136 	  dr0_same_align_drs = first_store_same_align_drs;
2137 	  peel_for_unknown_alignment.inside_cost = store_inside_cost;
2138 	  peel_for_unknown_alignment.outside_cost = store_outside_cost;
2139 	}
2140       else
2141 	{
2142 	  peel_for_unknown_alignment.inside_cost = load_inside_cost;
2143 	  peel_for_unknown_alignment.outside_cost = load_outside_cost;
2144 	}
2145 
2146       stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
2147       prologue_cost_vec.create (2);
2148       epilogue_cost_vec.create (2);
2149 
2150       int dummy2;
2151       peel_for_unknown_alignment.outside_cost += vect_get_known_peeling_cost
2152 	(loop_vinfo, estimated_npeels, &dummy2,
2153 	 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
2154 	 &prologue_cost_vec, &epilogue_cost_vec);
2155 
2156       prologue_cost_vec.release ();
2157       epilogue_cost_vec.release ();
2158 
2159       peel_for_unknown_alignment.peel_info.count = dr0_same_align_drs + 1;
2160     }
2161 
2162   peel_for_unknown_alignment.peel_info.npeel = 0;
2163   peel_for_unknown_alignment.peel_info.dr_info = dr0_info;
2164 
2165   best_peel = peel_for_unknown_alignment;
2166 
2167   peel_for_known_alignment.inside_cost = INT_MAX;
2168   peel_for_known_alignment.outside_cost = INT_MAX;
2169   peel_for_known_alignment.peel_info.count = 0;
2170   peel_for_known_alignment.peel_info.dr_info = NULL;
2171 
2172   if (do_peeling && one_misalignment_known)
2173     {
2174       /* Peeling is possible, but no data access requires alignment in order
2175          to be supported.  So we try to choose the best possible peeling from
2176 	 the hash table.  */
2177       peel_for_known_alignment = vect_peeling_hash_choose_best_peeling
2178 	(&peeling_htab, loop_vinfo);
2179     }
2180 
2181   /* Compare costs of peeling for known and unknown alignment. */
2182   if (peel_for_known_alignment.peel_info.dr_info != NULL
2183       && peel_for_unknown_alignment.inside_cost
2184       >= peel_for_known_alignment.inside_cost)
2185     {
2186       best_peel = peel_for_known_alignment;
2187 
2188       /* If the best peeling for known alignment has NPEEL == 0, perform no
2189          peeling at all unless there is an unsupportable dr that we can
2190          align.  */
2191       if (best_peel.peel_info.npeel == 0 && !one_dr_unsupportable)
2192 	do_peeling = false;
2193     }
2194 
2195   /* If there is an unsupportable data ref, prefer this over all choices so far
2196      since we'd have to discard a chosen peeling unless it happened to
2197      align the unsupportable data ref.  */
2198   if (one_dr_unsupportable)
2199     dr0_info = unsupportable_dr_info;
2200   else if (do_peeling)
2201     {
2202       /* Calculate the penalty for no peeling, i.e. leaving everything as-is.
2203 	 TODO: Use nopeel_outside_cost or get rid of it?  */
2204       unsigned nopeel_inside_cost = 0;
2205       unsigned nopeel_outside_cost = 0;
2206 
2207       stmt_vector_for_cost dummy;
2208       dummy.create (2);
2209       vect_get_peeling_costs_all_drs (loop_vinfo, NULL, &nopeel_inside_cost,
2210 				      &nopeel_outside_cost, &dummy, &dummy, 0);
2211       dummy.release ();
2212 
2213       /* Add epilogue costs.  As we do not peel for alignment here, no prologue
2214 	 costs will be recorded.  */
2215       stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
2216       prologue_cost_vec.create (2);
2217       epilogue_cost_vec.create (2);
2218 
2219       int dummy2;
2220       nopeel_outside_cost += vect_get_known_peeling_cost
2221 	(loop_vinfo, 0, &dummy2,
2222 	 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
2223 	 &prologue_cost_vec, &epilogue_cost_vec);
2224 
2225       prologue_cost_vec.release ();
2226       epilogue_cost_vec.release ();
2227 
2228       npeel = best_peel.peel_info.npeel;
2229       dr0_info = best_peel.peel_info.dr_info;
2230 
2231       /* If not peeling is no more expensive than the best peeling we
2232 	 have found so far, don't perform any peeling.  */
2233       if (nopeel_inside_cost <= best_peel.inside_cost)
2234 	do_peeling = false;
2235     }
2236 
2237   if (do_peeling)
2238     {
2239       stmt_vec_info stmt_info = dr0_info->stmt;
2240       if (known_alignment_for_access_p (dr0_info,
2241 					STMT_VINFO_VECTYPE (stmt_info)))
2242         {
2243 	  bool negative = tree_int_cst_compare (DR_STEP (dr0_info->dr),
2244 						size_zero_node) < 0;
2245           if (!npeel)
2246             {
2247               /* Since it's known at compile time, compute the number of
2248                  iterations in the peeled loop (the peeling factor) for use in
2249                  updating DR_MISALIGNMENT values.  The peeling factor is the
2250                  vectorization factor minus the misalignment as an element
2251                  count.  */
2252 	      tree vectype = STMT_VINFO_VECTYPE (stmt_info);
2253 	      poly_int64 off = 0;
2254 	      if (negative)
2255 		off = ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
2256 		       * -TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype))));
2257 	      unsigned int mis
2258 		= dr_misalignment (dr0_info, vectype, off);
2259 	      mis = negative ? mis : -mis;
2260 	      /* If known_alignment_for_access_p then we have set
2261 	         DR_MISALIGNMENT, which is only done if we know it at compile
2262 	         time, so it is safe to assume the target alignment is constant.
2263 	       */
2264 	      unsigned int target_align =
2265 		DR_TARGET_ALIGNMENT (dr0_info).to_constant ();
2266 	      npeel = ((mis & (target_align - 1))
2267 		       / vect_get_scalar_dr_size (dr0_info));
2268             }
2269 
2270 	  /* For interleaved data access every iteration accesses all the
2271 	     members of the group, therefore we divide the element count by the
2272 	     group size to get the number of peeled iterations.  */
2273 	  if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2274 	    npeel /= DR_GROUP_SIZE (stmt_info);
2275 
2276           if (dump_enabled_p ())
2277             dump_printf_loc (MSG_NOTE, vect_location,
2278                              "Try peeling by %d\n", npeel);
2279         }
2280 
2281       /* Ensure that all datarefs can be vectorized after the peel.  */
2282       if (!vect_peeling_supportable (loop_vinfo, dr0_info, npeel))
2283 	do_peeling = false;
2284 
2285       /* Check if all datarefs are supportable and log.  */
2286       if (do_peeling
2287 	  && npeel == 0
2288 	  && known_alignment_for_access_p (dr0_info,
2289 					   STMT_VINFO_VECTYPE (stmt_info)))
2290 	return opt_result::success ();
2291 
2292       /* Cost model #1 - honor --param vect-max-peeling-for-alignment.  */
2293       if (do_peeling)
2294         {
2295           unsigned max_allowed_peel
2296 	    = param_vect_max_peeling_for_alignment;
2297 	  if (loop_cost_model (loop) <= VECT_COST_MODEL_CHEAP)
2298 	    max_allowed_peel = 0;
2299           if (max_allowed_peel != (unsigned)-1)
2300             {
2301               unsigned max_peel = npeel;
2302               if (max_peel == 0)
2303                 {
2304 		  poly_uint64 target_align = DR_TARGET_ALIGNMENT (dr0_info);
2305 		  unsigned HOST_WIDE_INT target_align_c;
2306 		  if (target_align.is_constant (&target_align_c))
2307 		    max_peel =
2308 		      target_align_c / vect_get_scalar_dr_size (dr0_info) - 1;
2309 		  else
2310 		    {
2311 		      do_peeling = false;
2312 		      if (dump_enabled_p ())
2313 			dump_printf_loc (MSG_NOTE, vect_location,
2314 			  "Disable peeling, max peels set and vector"
2315 			  " alignment unknown\n");
2316 		    }
2317                 }
2318               if (max_peel > max_allowed_peel)
2319                 {
2320                   do_peeling = false;
2321                   if (dump_enabled_p ())
2322                     dump_printf_loc (MSG_NOTE, vect_location,
2323                         "Disable peeling, max peels reached: %d\n", max_peel);
2324                 }
2325             }
2326         }
2327 
2328       /* Cost model #2 - if peeling may result in a remaining loop not
2329 	 iterating enough to be vectorized then do not peel.  Since this
2330 	 is a cost heuristic rather than a correctness decision, use the
2331 	 most likely runtime value for variable vectorization factors.  */
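      /* E.g. with an assumed VF of 4 and up to 3 peeled iterations, at least
	 4 + 3 = 7 known iterations are required so that one full vector
	 iteration remains even in the worst case.  */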
2332       if (do_peeling
2333 	  && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2334 	{
2335 	  unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
2336 	  unsigned int max_peel = npeel == 0 ? assumed_vf - 1 : npeel;
2337 	  if ((unsigned HOST_WIDE_INT) LOOP_VINFO_INT_NITERS (loop_vinfo)
2338 	      < assumed_vf + max_peel)
2339 	    do_peeling = false;
2340 	}
2341 
2342       if (do_peeling)
2343         {
2344           /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
2345              If the misalignment of DR_i is identical to that of dr0 then set
2346              DR_MISALIGNMENT (DR_i) to zero.  If the misalignment of DR_i and
2347              dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
2348              by the peeling factor times the element size of DR_i (MOD the
2349              vectorization factor times the size).  Otherwise, the
2350              misalignment of DR_i must be set to unknown.  */
2351 	  FOR_EACH_VEC_ELT (datarefs, i, dr)
2352 	    if (dr != dr0_info->dr)
2353 	      {
2354 		dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
2355 		if (!vect_relevant_for_alignment_p (dr_info))
2356 		  continue;
2357 
2358 		vect_update_misalignment_for_peel (dr_info, dr0_info, npeel);
2359 	      }
2360 
2361           LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0_info;
2362           if (npeel)
2363             LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
2364           else
2365 	    LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = -1;
2366 	  SET_DR_MISALIGNMENT (dr0_info,
2367 			       vect_dr_misalign_for_aligned_access (dr0_info));
2368 	  if (dump_enabled_p ())
2369             {
2370               dump_printf_loc (MSG_NOTE, vect_location,
2371                                "Alignment of access forced using peeling.\n");
2372               dump_printf_loc (MSG_NOTE, vect_location,
2373                                "Peeling for alignment will be applied.\n");
2374             }
2375 
2376 	  /* The inside-loop cost will be accounted for in vectorizable_load
2377 	     and vectorizable_store correctly with adjusted alignments.
2378 	     Drop the body_cost_vec on the floor here.  */
2379 	  return opt_result::success ();
2380         }
2381     }
2382 
2383   /* (2) Versioning to force alignment.  */
2384 
2385   /* Try versioning if:
2386      1) the loop is optimized for speed and the cost model is not cheap, and
2387      2) there is at least one unsupported misaligned data ref with an unknown
2388         misalignment, and
2389      3) all misaligned data refs with a known misalignment are supported, and
2390      4) the number of runtime alignment checks is within reason.  */
2391 
2392   do_versioning
2393     = (optimize_loop_nest_for_speed_p (loop)
2394        && !loop->inner /* FORNOW */
2395        && loop_cost_model (loop) > VECT_COST_MODEL_CHEAP);
2396 
2397   if (do_versioning)
2398     {
2399       FOR_EACH_VEC_ELT (datarefs, i, dr)
2400         {
2401 	  dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
2402 	  if (!vect_relevant_for_alignment_p (dr_info))
2403 	    continue;
2404 
2405 	  stmt_vec_info stmt_info = dr_info->stmt;
2406 	  if (STMT_VINFO_STRIDED_P (stmt_info))
2407 	    {
2408 	      do_versioning = false;
2409 	      break;
2410 	    }
2411 
2412 	  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
2413 	  bool negative = tree_int_cst_compare (DR_STEP (dr),
2414 						size_zero_node) < 0;
2415 	  poly_int64 off = 0;
2416 	  if (negative)
2417 	    off = ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
2418 		   * -TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype))));
2419 	  int misalignment;
2420 	  if ((misalignment = dr_misalignment (dr_info, vectype, off)) == 0)
2421 	    continue;
2422 
2423 	  enum dr_alignment_support supportable_dr_alignment
2424 	    = vect_supportable_dr_alignment (loop_vinfo, dr_info, vectype,
2425 					     misalignment);
2426 	  if (supportable_dr_alignment == dr_unaligned_unsupported)
2427             {
2428 	      if (misalignment != DR_MISALIGNMENT_UNKNOWN
2429 		  || (LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
2430 		      >= (unsigned) param_vect_max_version_for_alignment_checks))
2431                 {
2432                   do_versioning = false;
2433                   break;
2434                 }
2435 
2436 	      /* At present we don't support versioning for alignment
2437 		 with variable VF, since there's no guarantee that the
2438 		 VF is a power of two.  We could relax this if we added
2439 		 a way of enforcing a power-of-two size.  */
2440 	      unsigned HOST_WIDE_INT size;
2441 	      if (!GET_MODE_SIZE (TYPE_MODE (vectype)).is_constant (&size))
2442 		{
2443 		  do_versioning = false;
2444 		  break;
2445 		}
2446 
2447 	      /* Forcing alignment in the first iteration is no good if
2448 		 we don't keep it across iterations.  For now, just disable
2449 		 versioning in this case.
2450 		 ?? We could actually unroll the loop to achieve the required
2451 		 overall step alignment, and forcing the alignment could be
2452 		 done by doing some iterations of the non-vectorized loop.  */
2453 	      if (!multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
2454 			       * DR_STEP_ALIGNMENT (dr),
2455 			       DR_TARGET_ALIGNMENT (dr_info)))
2456 		{
2457 		  do_versioning = false;
2458 		  break;
2459 		}
2460 
2461               /* The rightmost bits of an aligned address must be zeros.
2462                  Construct the mask needed for this test.  For example,
2463                  GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
2464                  mask must be 15 = 0xf. */
2465 	      int mask = size - 1;
2466 
2467 	      /* FORNOW: use the same mask to test all potentially unaligned
2468 		 references in the loop.  */
2469 	      if (LOOP_VINFO_PTR_MASK (loop_vinfo)
2470 		  && LOOP_VINFO_PTR_MASK (loop_vinfo) != mask)
2471 		{
2472 		  do_versioning = false;
2473 		  break;
2474 		}
2475 
2476               LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
2477 	      LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (stmt_info);
2478             }
2479         }
2480 
2481       /* Versioning requires at least one misaligned data reference.  */
2482       if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2483         do_versioning = false;
2484       else if (!do_versioning)
2485         LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
2486     }
2487 
2488   if (do_versioning)
2489     {
2490       const vec<stmt_vec_info> &may_misalign_stmts
2491 	= LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
2492       stmt_vec_info stmt_info;
2493 
2494       /* It can now be assumed that the data references in the statements
2495          in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
2496          of the loop being vectorized.  */
2497       FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt_info)
2498         {
2499 	  dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
2500 	  SET_DR_MISALIGNMENT (dr_info,
2501 			       vect_dr_misalign_for_aligned_access (dr_info));
2502 	  if (dump_enabled_p ())
2503             dump_printf_loc (MSG_NOTE, vect_location,
2504                              "Alignment of access forced using versioning.\n");
2505         }
2506 
2507       if (dump_enabled_p ())
2508         dump_printf_loc (MSG_NOTE, vect_location,
2509                          "Versioning for alignment will be applied.\n");
2510 
2511       /* Peeling and versioning can't be done together at this time.  */
2512       gcc_assert (! (do_peeling && do_versioning));
2513 
2514       return opt_result::success ();
2515     }
2516 
2517   /* This point is reached if neither peeling nor versioning is being done.  */
2518   gcc_assert (! (do_peeling || do_versioning));
2519 
2520   return opt_result::success ();
2521 }
2522 
2523 
2524 /* Function vect_analyze_data_refs_alignment
2525 
2526    Analyze the alignment of the data-references in the loop.
2527    Return FALSE if a data reference is found that cannot be vectorized.  */
2528 
2529 opt_result
2530 vect_analyze_data_refs_alignment (loop_vec_info loop_vinfo)
2531 {
2532   DUMP_VECT_SCOPE ("vect_analyze_data_refs_alignment");
2533 
2534   vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
2535   struct data_reference *dr;
2536   unsigned int i;
2537 
2538   vect_record_base_alignments (loop_vinfo);
2539   FOR_EACH_VEC_ELT (datarefs, i, dr)
2540     {
2541       dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
2542       if (STMT_VINFO_VECTORIZABLE (dr_info->stmt))
2543 	{
2544 	  if (STMT_VINFO_GROUPED_ACCESS (dr_info->stmt)
2545 	      && DR_GROUP_FIRST_ELEMENT (dr_info->stmt) != dr_info->stmt)
2546 	    continue;
2547 	  vect_compute_data_ref_alignment (loop_vinfo, dr_info,
2548 					   STMT_VINFO_VECTYPE (dr_info->stmt));
2549 	}
2550     }
2551 
2552   return opt_result::success ();
2553 }
2554 
2555 
2556 /* Analyze alignment of DRs of stmts in NODE.  */
2557 
2558 static bool
2559 vect_slp_analyze_node_alignment (vec_info *vinfo, slp_tree node)
2560 {
2561   /* Alignment is maintained in the first element of the group.  */
2562   stmt_vec_info first_stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
2563   first_stmt_info = DR_GROUP_FIRST_ELEMENT (first_stmt_info);
2564   dr_vec_info *dr_info = STMT_VINFO_DR_INFO (first_stmt_info);
2565   tree vectype = SLP_TREE_VECTYPE (node);
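  /* The target's preferred vector alignment is given in bits; convert it to
     bytes, e.g. a 128-bit preference becomes a 16-byte alignment.  */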
2566   poly_uint64 vector_alignment
2567     = exact_div (targetm.vectorize.preferred_vector_alignment (vectype),
2568 		 BITS_PER_UNIT);
2569   if (dr_info->misalignment == DR_MISALIGNMENT_UNINITIALIZED)
2570     vect_compute_data_ref_alignment (vinfo, dr_info, SLP_TREE_VECTYPE (node));
2571   /* Re-analyze alignment when we're facing a vectorization with a bigger
2572      alignment requirement.  */
2573   else if (known_lt (dr_info->target_alignment, vector_alignment))
2574     {
2575       poly_uint64 old_target_alignment = dr_info->target_alignment;
2576       int old_misalignment = dr_info->misalignment;
2577       vect_compute_data_ref_alignment (vinfo, dr_info, SLP_TREE_VECTYPE (node));
2578       /* But keep knowledge about a smaller alignment.  */
2579       if (old_misalignment != DR_MISALIGNMENT_UNKNOWN
2580 	  && dr_info->misalignment == DR_MISALIGNMENT_UNKNOWN)
2581 	{
2582 	  dr_info->target_alignment = old_target_alignment;
2583 	  dr_info->misalignment = old_misalignment;
2584 	}
2585     }
2586   /* If we ever face unordered target alignments, the first one wins in terms
2587      of analysis and the other will become unknown in dr_misalignment.  */
2588   return true;
2589 }
2590 
2591 /* Function vect_slp_analyze_instance_alignment
2592 
2593    Analyze the alignment of the data-references in the SLP instance.
2594    Return FALSE if a data reference is found that cannot be vectorized.  */
2595 
2596 bool
2597 vect_slp_analyze_instance_alignment (vec_info *vinfo,
2598 						slp_instance instance)
2599 {
2600   DUMP_VECT_SCOPE ("vect_slp_analyze_instance_alignment");
2601 
2602   slp_tree node;
2603   unsigned i;
2604   FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, node)
2605     if (! vect_slp_analyze_node_alignment (vinfo, node))
2606       return false;
2607 
2608   if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_store
2609       && ! vect_slp_analyze_node_alignment
2610 	     (vinfo, SLP_INSTANCE_TREE (instance)))
2611     return false;
2612 
2613   return true;
2614 }
2615 
2616 
2617 /* Analyze groups of accesses: check that DR_INFO belongs to a group of
2618    accesses of legal size, step, etc.  Detect gaps, single element
2619    interleaving, and other special cases. Set grouped access info.
2620    Collect groups of strided stores for further use in SLP analysis.
2621    Worker for vect_analyze_group_access.  */
2622 
2623 static bool
2624 vect_analyze_group_access_1 (vec_info *vinfo, dr_vec_info *dr_info)
2625 {
2626   data_reference *dr = dr_info->dr;
2627   tree step = DR_STEP (dr);
2628   tree scalar_type = TREE_TYPE (DR_REF (dr));
2629   HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
2630   stmt_vec_info stmt_info = dr_info->stmt;
2631   loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
2632   bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo);
2633   HOST_WIDE_INT dr_step = -1;
2634   HOST_WIDE_INT groupsize, last_accessed_element = 1;
2635   bool slp_impossible = false;
2636 
2637   /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2638      size of the interleaving group (including gaps).  */
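  /* For example, 4-byte elements accessed with a constant 32-byte step form
     a group of size 8, whether or not all eight slots are actually accessed.  */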
2639   if (tree_fits_shwi_p (step))
2640     {
2641       dr_step = tree_to_shwi (step);
2642       /* Check that STEP is a multiple of type size.  Otherwise there is
2643          a non-element-sized gap at the end of the group which we
2644 	 cannot represent in DR_GROUP_GAP or DR_GROUP_SIZE.
2645 	 ???  As we can handle non-constant step fine here we should
2646 	 simply remove uses of DR_GROUP_GAP between the last and first
2647 	 element and instead rely on DR_STEP.  DR_GROUP_SIZE then would
2648 	 simply not include that gap.  */
2649       if ((dr_step % type_size) != 0)
2650 	{
2651 	  if (dump_enabled_p ())
2652 	    dump_printf_loc (MSG_NOTE, vect_location,
2653 			     "Step %T is not a multiple of the element size"
2654 			     " for %T\n",
2655 			     step, DR_REF (dr));
2656 	  return false;
2657 	}
2658       groupsize = absu_hwi (dr_step) / type_size;
2659     }
2660   else
2661     groupsize = 0;
2662 
2663   /* Non-consecutive accesses are possible only as part of interleaving.  */
2664   if (!DR_GROUP_FIRST_ELEMENT (stmt_info))
2665     {
2666       /* Check if this DR is part of an interleaving group, and is a single
2667 	 element of the group that is accessed in the loop.  */
2668 
2669       /* Gaps are supported only for loads. STEP must be a multiple of the type
2670 	 size.  */
2671       if (DR_IS_READ (dr)
2672 	  && (dr_step % type_size) == 0
2673 	  && groupsize > 0
2674 	  /* This could be UINT_MAX but as we are generating code in a very
2675 	     inefficient way we have to cap earlier.
2676 	     See PR91403 for example.  */
2677 	  && groupsize <= 4096)
2678 	{
2679 	  DR_GROUP_FIRST_ELEMENT (stmt_info) = stmt_info;
2680 	  DR_GROUP_SIZE (stmt_info) = groupsize;
2681 	  DR_GROUP_GAP (stmt_info) = groupsize - 1;
2682 	  if (dump_enabled_p ())
2683 	    dump_printf_loc (MSG_NOTE, vect_location,
2684 			     "Detected single element interleaving %T"
2685 			     " step %T\n",
2686 			     DR_REF (dr), step);
2687 
2688 	  return true;
2689 	}
2690 
2691       if (dump_enabled_p ())
2692 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2693 			 "not consecutive access %G", stmt_info->stmt);
2694 
2695       if (bb_vinfo)
2696 	{
2697 	  /* Mark the statement as unvectorizable.  */
2698 	  STMT_VINFO_VECTORIZABLE (stmt_info) = false;
2699 	  return true;
2700 	}
2701 
2702       if (dump_enabled_p ())
2703 	dump_printf_loc (MSG_NOTE, vect_location, "using strided accesses\n");
2704       STMT_VINFO_STRIDED_P (stmt_info) = true;
2705       return true;
2706     }
2707 
2708   if (DR_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info)
2709     {
2710       /* First stmt in the interleaving chain. Check the chain.  */
2711       stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (stmt_info);
2712       struct data_reference *data_ref = dr;
2713       unsigned int count = 1;
2714       tree prev_init = DR_INIT (data_ref);
2715       HOST_WIDE_INT diff, gaps = 0;
2716 
2717       /* By construction, all group members have INTEGER_CST DR_INITs.  */
2718       while (next)
2719         {
2720           /* We never have the same DR multiple times.  */
2721           gcc_assert (tree_int_cst_compare (DR_INIT (data_ref),
2722 				DR_INIT (STMT_VINFO_DATA_REF (next))) != 0);
2723 
2724 	  data_ref = STMT_VINFO_DATA_REF (next);
2725 
2726 	  /* All group members have the same STEP by construction.  */
2727 	  gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0));
2728 
2729           /* Check that the distance between two accesses is equal to the type
2730              size. Otherwise, we have gaps.  */
2731           diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
2732 		  - TREE_INT_CST_LOW (prev_init)) / type_size;
2733 	  if (diff < 1 || diff > UINT_MAX)
2734 	    {
2735 	      /* For artificial testcases with array accesses with large
2736 		 constant indices we can run into overflow issues, which
2737 		 can end up fooling the groupsize constraint below, so
2738 		 check the individual gaps (which are represented as
2739 		 unsigned int) as well.  */
2740 	      if (dump_enabled_p ())
2741 		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2742 				 "interleaved access with gap larger "
2743 				 "than representable\n");
2744 	      return false;
2745 	    }
2746 	  if (diff != 1)
2747 	    {
2748 	      /* FORNOW: SLP of accesses with gaps is not supported.  */
2749 	      slp_impossible = true;
2750 	      if (DR_IS_WRITE (data_ref))
2751 		{
2752                   if (dump_enabled_p ())
2753                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2754                                      "interleaved store with gaps\n");
2755 		  return false;
2756 		}
2757 
2758               gaps += diff - 1;
2759 	    }
2760 
2761 	  last_accessed_element += diff;
2762 
2763           /* Store the gap from the previous member of the group. If there is no
2764              gap in the access, DR_GROUP_GAP is always 1.  */
2765 	  DR_GROUP_GAP (next) = diff;
2766 
2767 	  prev_init = DR_INIT (data_ref);
2768 	  next = DR_GROUP_NEXT_ELEMENT (next);
2769 	  /* Count the number of data-refs in the chain.  */
2770 	  count++;
2771         }
2772 
2773       if (groupsize == 0)
2774         groupsize = count + gaps;
2775 
2776       /* This could be UINT_MAX but as we are generating code in a very
2777          inefficient way we have to cap earlier.  See PR78699 for example.  */
2778       if (groupsize > 4096)
2779 	{
2780 	  if (dump_enabled_p ())
2781 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2782 			     "group is too large\n");
2783 	  return false;
2784 	}
2785 
2786       /* Check that the size of the interleaving is equal to count for stores,
2787          i.e., that there are no gaps.  */
2788       if (groupsize != count
2789 	  && !DR_IS_READ (dr))
2790         {
2791 	  groupsize = count;
2792 	  STMT_VINFO_STRIDED_P (stmt_info) = true;
2793 	}
2794 
2795       /* If there is a gap after the last load in the group it is the
2796 	 difference between the groupsize and the last accessed
2797 	 element.
2798 	 When there is no gap, this difference should be 0.  */
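      /* E.g. a group loading only a[4*i] and a[4*i + 1] has GROUPSIZE 4 and
	 LAST_ACCESSED_ELEMENT 2, leaving a trailing gap of 2 elements.  */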
2799       DR_GROUP_GAP (stmt_info) = groupsize - last_accessed_element;
2800 
2801       DR_GROUP_SIZE (stmt_info) = groupsize;
2802       if (dump_enabled_p ())
2803 	{
2804 	  dump_printf_loc (MSG_NOTE, vect_location,
2805 			   "Detected interleaving ");
2806 	  if (DR_IS_READ (dr))
2807 	    dump_printf (MSG_NOTE, "load ");
2808 	  else if (STMT_VINFO_STRIDED_P (stmt_info))
2809 	    dump_printf (MSG_NOTE, "strided store ");
2810 	  else
2811 	    dump_printf (MSG_NOTE, "store ");
2812 	  dump_printf (MSG_NOTE, "of size %u\n",
2813 		       (unsigned)groupsize);
2814 	  dump_printf_loc (MSG_NOTE, vect_location, "\t%G", stmt_info->stmt);
2815 	  next = DR_GROUP_NEXT_ELEMENT (stmt_info);
2816 	  while (next)
2817 	    {
2818 	      if (DR_GROUP_GAP (next) != 1)
2819 		dump_printf_loc (MSG_NOTE, vect_location,
2820 				 "\t<gap of %d elements>\n",
2821 				 DR_GROUP_GAP (next) - 1);
2822 	      dump_printf_loc (MSG_NOTE, vect_location, "\t%G", next->stmt);
2823 	      next = DR_GROUP_NEXT_ELEMENT (next);
2824 	    }
2825 	  if (DR_GROUP_GAP (stmt_info) != 0)
2826 	    dump_printf_loc (MSG_NOTE, vect_location,
2827 			     "\t<gap of %d elements>\n",
2828 			     DR_GROUP_GAP (stmt_info));
2829 	}
2830 
2831       /* SLP: create an SLP data structure for every interleaving group of
2832 	 stores for further analysis in vect_analyze_slp.  */
2833       if (DR_IS_WRITE (dr) && !slp_impossible)
2834 	{
2835 	  if (loop_vinfo)
2836 	    LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt_info);
2837 	  if (bb_vinfo)
2838 	    BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt_info);
2839 	}
2840     }
2841 
2842   return true;
2843 }
2844 
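/* Editorial illustration (not part of the original sources): for a loop like

     for (int i = 0; i < n; i++)
       sum += a[4*i] + a[4*i+1] + a[4*i+2];   // 4-byte elements assumed

   the three loads form one interleaving group with groupsize 4 (the 16-byte
   step divided by the element size).  The second and third members get
   DR_GROUP_GAP == 1 (no gap from the previous member) and the group head
   gets a trailing DR_GROUP_GAP of 4 - 3 == 1 because a[4*i+3] is never
   read.  A store group with an internal gap between members is rejected
   above ("interleaved store with gaps"), while a trailing gap in a store
   group only downgrades it to a strided store.  */
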
2845 /* Analyze groups of accesses: check that DR_INFO belongs to a group of
2846    accesses of legal size, step, etc.  Detect gaps, single element
2847    interleaving, and other special cases. Set grouped access info.
2848    Collect groups of strided stores for further use in SLP analysis.  */
2849 
2850 static bool
2851 vect_analyze_group_access (vec_info *vinfo, dr_vec_info *dr_info)
2852 {
2853   if (!vect_analyze_group_access_1 (vinfo, dr_info))
2854     {
2855       /* Dissolve the group if present.  */
2856       stmt_vec_info stmt_info = DR_GROUP_FIRST_ELEMENT (dr_info->stmt);
2857       while (stmt_info)
2858 	{
2859 	  stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (stmt_info);
2860 	  DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2861 	  DR_GROUP_NEXT_ELEMENT (stmt_info) = NULL;
2862 	  stmt_info = next;
2863 	}
2864       return false;
2865     }
2866   return true;
2867 }
2868 
2869 /* Analyze the access pattern of the data-reference DR_INFO.
2870    In case of non-consecutive accesses call vect_analyze_group_access() to
2871    analyze groups of accesses.  */
2872 
2873 static bool
2874 vect_analyze_data_ref_access (vec_info *vinfo, dr_vec_info *dr_info)
2875 {
2876   data_reference *dr = dr_info->dr;
2877   tree step = DR_STEP (dr);
2878   tree scalar_type = TREE_TYPE (DR_REF (dr));
2879   stmt_vec_info stmt_info = dr_info->stmt;
2880   loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
2881   class loop *loop = NULL;
2882 
2883   if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
2884     return true;
2885 
2886   if (loop_vinfo)
2887     loop = LOOP_VINFO_LOOP (loop_vinfo);
2888 
2889   if (loop_vinfo && !step)
2890     {
2891       if (dump_enabled_p ())
2892 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2893 	                 "bad data-ref access in loop\n");
2894       return false;
2895     }
2896 
2897   /* Allow loads with zero step in inner-loop vectorization.  */
2898   if (loop_vinfo && integer_zerop (step))
2899     {
2900       DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2901       if (!nested_in_vect_loop_p (loop, stmt_info))
2902 	return DR_IS_READ (dr);
2903       /* Allow references with zero step for outer loops marked
2904 	 with pragma omp simd only - it guarantees absence of
2905 	 loop-carried dependencies between inner loop iterations.  */
2906       if (loop->safelen < 2)
2907 	{
2908 	  if (dump_enabled_p ())
2909 	    dump_printf_loc (MSG_NOTE, vect_location,
2910 			     "zero step in inner loop of nest\n");
2911 	  return false;
2912 	}
2913     }
2914 
2915   if (loop && nested_in_vect_loop_p (loop, stmt_info))
2916     {
2917       /* Interleaved accesses are not yet supported within outer-loop
2918         vectorization for references in the inner-loop.  */
2919       DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2920 
2921       /* For the rest of the analysis we use the outer-loop step.  */
2922       step = STMT_VINFO_DR_STEP (stmt_info);
2923       if (integer_zerop (step))
2924 	{
2925 	  if (dump_enabled_p ())
2926 	    dump_printf_loc (MSG_NOTE, vect_location,
2927 	                     "zero step in outer loop.\n");
2928 	  return DR_IS_READ (dr);
2929 	}
2930     }
2931 
2932   /* Consecutive?  */
2933   if (TREE_CODE (step) == INTEGER_CST)
2934     {
2935       HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
2936       if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))
2937 	  || (dr_step < 0
2938 	      && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
2939 	{
2940 	  /* Mark that it is not interleaving.  */
2941 	  DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2942 	  return true;
2943 	}
2944     }
2945 
2946   if (loop && nested_in_vect_loop_p (loop, stmt_info))
2947     {
2948       if (dump_enabled_p ())
2949 	dump_printf_loc (MSG_NOTE, vect_location,
2950 	                 "grouped access in outer loop.\n");
2951       return false;
2952     }
2953 
2954 
2955   /* Assume this is a DR handled by the non-constant strided load case.  */
2956   if (TREE_CODE (step) != INTEGER_CST)
2957     return (STMT_VINFO_STRIDED_P (stmt_info)
2958 	    && (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
2959 		|| vect_analyze_group_access (vinfo, dr_info)));
2960 
2961   /* Not consecutive access - check if it's a part of interleaving group.  */
2962   return vect_analyze_group_access (vinfo, dr_info);
2963 }
2964 
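/* Editorial illustration (assumed 4-byte ints, names made up) of how the
   step classifies an access in vect_analyze_data_ref_access:

     for (int i = 0; i < n; i++)
       {
	 x += a[i];	// DR_STEP == 4 == TYPE_SIZE_UNIT: consecutive access
	 y += b[3*i];	// DR_STEP == 12: not consecutive, analyzed by
			// vect_analyze_group_access as an interleaving group
	 z += c[i*s];	// DR_STEP not an INTEGER_CST: only vectorizable when
			// the access was marked STMT_VINFO_STRIDED_P
       }
 */
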
2965 /* Compare two data-references DRA and DRB so that qsort orders them into
2966    chunks suitable for grouping.  */
2967 
2968 static int
2969 dr_group_sort_cmp (const void *dra_, const void *drb_)
2970 {
2971   dr_vec_info *dra_info = *(dr_vec_info **)const_cast<void *>(dra_);
2972   dr_vec_info *drb_info = *(dr_vec_info **)const_cast<void *>(drb_);
2973   data_reference_p dra = dra_info->dr;
2974   data_reference_p drb = drb_info->dr;
2975   int cmp;
2976 
2977   /* Stabilize sort.  */
2978   if (dra == drb)
2979     return 0;
2980 
2981 	  /* Different group IDs never belong to the same group.  */
2982   if (dra_info->group != drb_info->group)
2983     return dra_info->group < drb_info->group ? -1 : 1;
2984 
2985   /* Ordering of DRs according to base.  */
2986   cmp = data_ref_compare_tree (DR_BASE_ADDRESS (dra),
2987 			       DR_BASE_ADDRESS (drb));
2988   if (cmp != 0)
2989     return cmp;
2990 
2991   /* And according to DR_OFFSET.  */
2992   cmp = data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
2993   if (cmp != 0)
2994     return cmp;
2995 
2996   /* Put reads before writes.  */
2997   if (DR_IS_READ (dra) != DR_IS_READ (drb))
2998     return DR_IS_READ (dra) ? -1 : 1;
2999 
3000   /* Then sort by access size.  */
3001   cmp = data_ref_compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
3002 			       TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
3003   if (cmp != 0)
3004     return cmp;
3005 
3006   /* And by step.  */
3007   cmp = data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb));
3008   if (cmp != 0)
3009     return cmp;
3010 
3011   /* Then sort by DR_INIT.  In case of identical DRs, sort by stmt UID.  */
3012   cmp = data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb));
3013   if (cmp == 0)
3014     return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
3015   return cmp;
3016 }
3017 
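/* Editorial note, illustrative only: after sorting with dr_group_sort_cmp,
   data-refs that share a group id, base address, offset, size and step are
   adjacent and ordered by DR_INIT, e.g. accesses to p[2*i] and p[2*i+1]
   (4-byte elements assumed) end up next to each other with DR_INITs 0 and 4,
   so vect_analyze_data_ref_accesses can build each interleaving chain in a
   single linear walk over the sorted copy.  */
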
3018 /* If OP is the result of a conversion, return the unconverted value,
3019    otherwise return null.  */
3020 
3021 static tree
3022 strip_conversion (tree op)
3023 {
3024   if (TREE_CODE (op) != SSA_NAME)
3025     return NULL_TREE;
3026   gimple *stmt = SSA_NAME_DEF_STMT (op);
3027   if (!is_gimple_assign (stmt)
3028       || !CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (stmt)))
3029     return NULL_TREE;
3030   return gimple_assign_rhs1 (stmt);
3031 }
3032 
3033 /* Return true if vectorizable_* routines can handle statements STMT1_INFO
3034    and STMT2_INFO being in a single group.  When ALLOW_SLP_P, masked loads can
3035    be grouped in SLP mode.  */
3036 
3037 static bool
3038 can_group_stmts_p (stmt_vec_info stmt1_info, stmt_vec_info stmt2_info,
3039 		   bool allow_slp_p)
3040 {
3041   if (gimple_assign_single_p (stmt1_info->stmt))
3042     return gimple_assign_single_p (stmt2_info->stmt);
3043 
3044   gcall *call1 = dyn_cast <gcall *> (stmt1_info->stmt);
3045   if (call1 && gimple_call_internal_p (call1))
3046     {
3047       /* Check for two masked loads or two masked stores.  */
3048       gcall *call2 = dyn_cast <gcall *> (stmt2_info->stmt);
3049       if (!call2 || !gimple_call_internal_p (call2))
3050 	return false;
3051       internal_fn ifn = gimple_call_internal_fn (call1);
3052       if (ifn != IFN_MASK_LOAD && ifn != IFN_MASK_STORE)
3053 	return false;
3054       if (ifn != gimple_call_internal_fn (call2))
3055 	return false;
3056 
3057       /* Check that the masks are the same.  Cope with casts of masks,
3058 	 like those created by build_mask_conversion.  */
3059       tree mask1 = gimple_call_arg (call1, 2);
3060       tree mask2 = gimple_call_arg (call2, 2);
3061       if (!operand_equal_p (mask1, mask2, 0)
3062           && (ifn == IFN_MASK_STORE || !allow_slp_p))
3063 	{
3064 	  mask1 = strip_conversion (mask1);
3065 	  if (!mask1)
3066 	    return false;
3067 	  mask2 = strip_conversion (mask2);
3068 	  if (!mask2)
3069 	    return false;
3070 	  if (!operand_equal_p (mask1, mask2, 0))
3071 	    return false;
3072 	}
3073       return true;
3074     }
3075 
3076   return false;
3077 }
3078 
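/* Editorial example (hedged, names made up): two loads that were
   if-converted from

     if (c[i]) { x = a[2*i]; y = a[2*i+1]; }

   become IFN_MASK_LOAD calls whose mask operands are equal (or differ only
   by a conversion), so can_group_stmts_p accepts them as one group; a masked
   load paired with a masked store, or calls with unrelated masks, are not
   groupable.  */
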
3079 /* Function vect_analyze_data_ref_accesses.
3080 
3081    Analyze the access pattern of all the data references in the loop.
3082 
3083    FORNOW: the only access pattern that is considered vectorizable is a
3084 	   simple step 1 (consecutive) access.
3085 
3086    FORNOW: handle only arrays and pointer accesses.  */
3087 
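/* Editorial sketch (not from the original sources): for a block containing

     p[0] = x; p[1] = y; p[2] = z;   // 4-byte elements assumed

   all three stores share base, offset, size and step and have DR_INITs 0, 4
   and 8, so the loop below chains them into one group headed by the first
   store.  If the same location is written twice (equal DR_INIT), the
   duplicate is simply linked in and the group is queued in to_fixup, where
   it is later split at the first duplicate statement.  */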
3088 opt_result
3089 vect_analyze_data_ref_accesses (vec_info *vinfo,
3090 				vec<int> *dataref_groups)
3091 {
3092   unsigned int i;
3093   vec<data_reference_p> datarefs = vinfo->shared->datarefs;
3094 
3095   DUMP_VECT_SCOPE ("vect_analyze_data_ref_accesses");
3096 
3097   if (datarefs.is_empty ())
3098     return opt_result::success ();
3099 
3100   /* Sort the array of datarefs to make building the interleaving chains
3101      linear.  Don't modify the original vector's order; it is needed for
3102      determining what dependencies are reversed.  */
3103   vec<dr_vec_info *> datarefs_copy;
3104   datarefs_copy.create (datarefs.length ());
3105   for (unsigned i = 0; i < datarefs.length (); i++)
3106     {
3107       dr_vec_info *dr_info = vinfo->lookup_dr (datarefs[i]);
3108       /* If the caller computed DR grouping, use that; otherwise group by
3109 	 basic blocks.  */
3110       if (dataref_groups)
3111 	dr_info->group = (*dataref_groups)[i];
3112       else
3113 	dr_info->group = gimple_bb (DR_STMT (datarefs[i]))->index;
3114       datarefs_copy.quick_push (dr_info);
3115     }
3116   datarefs_copy.qsort (dr_group_sort_cmp);
3117   hash_set<stmt_vec_info> to_fixup;
3118 
3119   /* Build the interleaving chains.  */
3120   for (i = 0; i < datarefs_copy.length () - 1;)
3121     {
3122       dr_vec_info *dr_info_a = datarefs_copy[i];
3123       data_reference_p dra = dr_info_a->dr;
3124       int dra_group_id = dr_info_a->group;
3125       stmt_vec_info stmtinfo_a = dr_info_a->stmt;
3126       stmt_vec_info lastinfo = NULL;
3127       if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
3128 	  || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a))
3129 	{
3130 	  ++i;
3131 	  continue;
3132 	}
3133       for (i = i + 1; i < datarefs_copy.length (); ++i)
3134 	{
3135 	  dr_vec_info *dr_info_b = datarefs_copy[i];
3136 	  data_reference_p drb = dr_info_b->dr;
3137 	  int drb_group_id = dr_info_b->group;
3138 	  stmt_vec_info stmtinfo_b = dr_info_b->stmt;
3139 	  if (!STMT_VINFO_VECTORIZABLE (stmtinfo_b)
3140 	      || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
3141 	    break;
3142 
3143 	  /* ???  Imperfect sorting (non-compatible types, non-modulo
3144 	     accesses, same accesses) can lead to a group being artificially
3145 	     split here as we don't just skip over those.  If it really
3146 	     matters we can push those to a worklist and re-iterate
3147 	     over them.  Then we can just skip ahead to the next DR here.  */
3148 
3149 	  /* DRs in a different DR group should not be put into the same
3150 	     interleaving group.  */
3151 	  if (dra_group_id != drb_group_id)
3152 	    break;
3153 
3154 	  /* Check that the data-refs have same first location (except init)
3155 	     and they are both either store or load (not load and store,
3156 	     not masked loads or stores).  */
3157 	  if (DR_IS_READ (dra) != DR_IS_READ (drb)
3158 	      || data_ref_compare_tree (DR_BASE_ADDRESS (dra),
3159 					DR_BASE_ADDRESS (drb)) != 0
3160 	      || data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb)) != 0
3161 	      || !can_group_stmts_p (stmtinfo_a, stmtinfo_b, true))
3162 	    break;
3163 
3164 	  /* Check that the data-refs have the same constant size.  */
3165 	  tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
3166 	  tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
3167 	  if (!tree_fits_uhwi_p (sza)
3168 	      || !tree_fits_uhwi_p (szb)
3169 	      || !tree_int_cst_equal (sza, szb))
3170 	    break;
3171 
3172 	  /* Check that the data-refs have the same step.  */
3173 	  if (data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb)) != 0)
3174 	    break;
3175 
3176 	  /* Check the types are compatible.
3177 	     ???  We don't distinguish this during sorting.  */
3178 	  if (!types_compatible_p (TREE_TYPE (DR_REF (dra)),
3179 				   TREE_TYPE (DR_REF (drb))))
3180 	    break;
3181 
3182 	  /* Check that the DR_INITs are compile-time constants.  */
3183 	  if (!tree_fits_shwi_p (DR_INIT (dra))
3184 	      || !tree_fits_shwi_p (DR_INIT (drb)))
3185 	    break;
3186 
3187 	  /* Different .GOMP_SIMD_LANE calls still give the same lane,
3188 	     just hold extra information.  */
3189 	  if (STMT_VINFO_SIMD_LANE_ACCESS_P (stmtinfo_a)
3190 	      && STMT_VINFO_SIMD_LANE_ACCESS_P (stmtinfo_b)
3191 	      && data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb)) == 0)
3192 	    break;
3193 
3194 	  /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb).  */
3195 	  HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
3196 	  HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
3197 	  HOST_WIDE_INT init_prev
3198 	    = TREE_INT_CST_LOW (DR_INIT (datarefs_copy[i-1]->dr));
3199 	  gcc_assert (init_a <= init_b
3200 		      && init_a <= init_prev
3201 		      && init_prev <= init_b);
3202 
3203 	  /* Do not place the same access in the interleaving chain twice.  */
3204 	  if (init_b == init_prev)
3205 	    {
3206 	      gcc_assert (gimple_uid (DR_STMT (datarefs_copy[i-1]->dr))
3207 			  < gimple_uid (DR_STMT (drb)));
3208 	      /* Simply link in duplicates and fix up the chain below.  */
3209 	    }
3210 	  else
3211 	    {
3212 	      /* If init_b == init_a + the size of the type * k, we have an
3213 		 interleaving, and DRA is accessed before DRB.  */
3214 	      unsigned HOST_WIDE_INT type_size_a = tree_to_uhwi (sza);
3215 	      if (type_size_a == 0
3216 		  || (((unsigned HOST_WIDE_INT)init_b - init_a)
3217 		      % type_size_a != 0))
3218 		break;
3219 
3220 	      /* If we have a store, the accesses are adjacent.  This splits
3221 		 groups into chunks we support (we don't support vectorization
3222 		 of stores with gaps).  */
3223 	      if (!DR_IS_READ (dra)
3224 		  && (((unsigned HOST_WIDE_INT)init_b - init_prev)
3225 		      != type_size_a))
3226 		break;
3227 
3228 	      /* If the step (when nonzero and constant) is smaller than the
3229 		 difference between the data-refs' inits, this splits groups
3230 		 into suitable sizes.  */
3231 	      if (tree_fits_shwi_p (DR_STEP (dra)))
3232 		{
3233 		  unsigned HOST_WIDE_INT step
3234 		    = absu_hwi (tree_to_shwi (DR_STEP (dra)));
3235 		  if (step != 0
3236 		      && step <= ((unsigned HOST_WIDE_INT)init_b - init_a))
3237 		    break;
3238 		}
3239 	    }
3240 
3241 	  if (dump_enabled_p ())
3242 	    dump_printf_loc (MSG_NOTE, vect_location,
3243 			     DR_IS_READ (dra)
3244 			     ? "Detected interleaving load %T and %T\n"
3245 			     : "Detected interleaving store %T and %T\n",
3246 			     DR_REF (dra), DR_REF (drb));
3247 
3248 	  /* Link the found element into the group list.  */
3249 	  if (!DR_GROUP_FIRST_ELEMENT (stmtinfo_a))
3250 	    {
3251 	      DR_GROUP_FIRST_ELEMENT (stmtinfo_a) = stmtinfo_a;
3252 	      lastinfo = stmtinfo_a;
3253 	    }
3254 	  DR_GROUP_FIRST_ELEMENT (stmtinfo_b) = stmtinfo_a;
3255 	  DR_GROUP_NEXT_ELEMENT (lastinfo) = stmtinfo_b;
3256 	  lastinfo = stmtinfo_b;
3257 
3258 	  STMT_VINFO_SLP_VECT_ONLY (stmtinfo_a)
3259 	    = !can_group_stmts_p (stmtinfo_a, stmtinfo_b, false);
3260 
3261 	  if (dump_enabled_p () && STMT_VINFO_SLP_VECT_ONLY (stmtinfo_a))
3262 	    dump_printf_loc (MSG_NOTE, vect_location,
3263 			     "Load suitable for SLP vectorization only.\n");
3264 
3265 	  if (init_b == init_prev
3266 	      && !to_fixup.add (DR_GROUP_FIRST_ELEMENT (stmtinfo_a))
3267 	      && dump_enabled_p ())
3268 	    dump_printf_loc (MSG_NOTE, vect_location,
3269 			     "Queuing group with duplicate access for fixup\n");
3270 	}
3271     }
3272 
3273   /* Fixup groups with duplicate entries by splitting them.  */
3274   while (1)
3275     {
3276       hash_set<stmt_vec_info>::iterator it = to_fixup.begin ();
3277       if (!(it != to_fixup.end ()))
3278 	break;
3279       stmt_vec_info grp = *it;
3280       to_fixup.remove (grp);
3281 
3282       /* Find the earliest duplicate group member.  */
3283       unsigned first_duplicate = -1u;
3284       stmt_vec_info next, g = grp;
3285       while ((next = DR_GROUP_NEXT_ELEMENT (g)))
3286 	{
3287 	  if (tree_int_cst_equal (DR_INIT (STMT_VINFO_DR_INFO (next)->dr),
3288 				  DR_INIT (STMT_VINFO_DR_INFO (g)->dr))
3289 	      && gimple_uid (STMT_VINFO_STMT (next)) < first_duplicate)
3290 	    first_duplicate = gimple_uid (STMT_VINFO_STMT (next));
3291 	  g = next;
3292 	}
3293       if (first_duplicate == -1U)
3294 	continue;
3295 
3296       /* Then move all stmts after the first duplicate to a new group.
3297          Note this is a heuristic but one with the property that *it
3298 	 is fixed up completely.  */
3299       g = grp;
3300       stmt_vec_info newgroup = NULL, ng = grp;
3301       while ((next = DR_GROUP_NEXT_ELEMENT (g)))
3302 	{
3303 	  if (gimple_uid (STMT_VINFO_STMT (next)) >= first_duplicate)
3304 	    {
3305 	      DR_GROUP_NEXT_ELEMENT (g) = DR_GROUP_NEXT_ELEMENT (next);
3306 	      if (!newgroup)
3307 		newgroup = next;
3308 	      else
3309 		DR_GROUP_NEXT_ELEMENT (ng) = next;
3310 	      ng = next;
3311 	      DR_GROUP_FIRST_ELEMENT (ng) = newgroup;
3312 	    }
3313 	  else
3314 	    g = DR_GROUP_NEXT_ELEMENT (g);
3315 	}
3316       DR_GROUP_NEXT_ELEMENT (ng) = NULL;
3317 
3318       /* Fixup the new group, which may still contain duplicates.  */
3319       to_fixup.add (newgroup);
3320     }
3321 
3322   dr_vec_info *dr_info;
3323   FOR_EACH_VEC_ELT (datarefs_copy, i, dr_info)
3324     {
3325       if (STMT_VINFO_VECTORIZABLE (dr_info->stmt)
3326 	  && !vect_analyze_data_ref_access (vinfo, dr_info))
3327 	{
3328 	  if (dump_enabled_p ())
3329 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3330 			     "not vectorized: complicated access pattern.\n");
3331 
3332 	  if (is_a <bb_vec_info> (vinfo))
3333 	    {
3334 	      /* Mark the statement as not vectorizable.  */
3335 	      STMT_VINFO_VECTORIZABLE (dr_info->stmt) = false;
3336 	      continue;
3337 	    }
3338 	  else
3339 	    {
3340 	      datarefs_copy.release ();
3341 	      return opt_result::failure_at (dr_info->stmt->stmt,
3342 					     "not vectorized:"
3343 					     " complicated access pattern.\n");
3344 	    }
3345 	}
3346     }
3347 
3348   datarefs_copy.release ();
3349   return opt_result::success ();
3350 }
3351 
3352 /* Function vect_vfa_segment_size.
3353 
3354    Input:
3355      DR_INFO: The data reference.
3356      LENGTH_FACTOR: segment length to consider.
3357 
3358    Return a value suitable for the dr_with_seg_len::seg_len field.
3359    This is the "distance travelled" by the pointer from the first
3360    iteration in the segment to the last.  Note that it does not include
3361    the size of the access; in effect it only describes the first byte.  */
3362 
3363 static tree
3364 vect_vfa_segment_size (dr_vec_info *dr_info, tree length_factor)
3365 {
3366   length_factor = size_binop (MINUS_EXPR,
3367 			      fold_convert (sizetype, length_factor),
3368 			      size_one_node);
3369   return size_binop (MULT_EXPR, fold_convert (sizetype, DR_STEP (dr_info->dr)),
3370 		     length_factor);
3371 }
3372 
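/* Editorial example with assumed numbers: for DR_STEP == 4 bytes and
   LENGTH_FACTOR == 8 (say, the vectorization factor), the segment length
   computed above is 4 * (8 - 1) == 28 bytes, i.e. the distance from the
   first byte accessed in the first iteration of the segment to the first
   byte accessed in the last one; the size of the access itself is added
   separately via vect_vfa_access_size.  */
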
3373 /* Return a value that, when added to abs (vect_vfa_segment_size (DR_INFO)),
3374    gives the worst-case number of bytes covered by the segment.  */
3375 
3376 static unsigned HOST_WIDE_INT
3377 vect_vfa_access_size (vec_info *vinfo, dr_vec_info *dr_info)
3378 {
3379   stmt_vec_info stmt_vinfo = dr_info->stmt;
3380   tree ref_type = TREE_TYPE (DR_REF (dr_info->dr));
3381   unsigned HOST_WIDE_INT ref_size = tree_to_uhwi (TYPE_SIZE_UNIT (ref_type));
3382   unsigned HOST_WIDE_INT access_size = ref_size;
3383   if (DR_GROUP_FIRST_ELEMENT (stmt_vinfo))
3384     {
3385       gcc_assert (DR_GROUP_FIRST_ELEMENT (stmt_vinfo) == stmt_vinfo);
3386       access_size *= DR_GROUP_SIZE (stmt_vinfo) - DR_GROUP_GAP (stmt_vinfo);
3387     }
3388   tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
3389   int misalignment;
3390   if (STMT_VINFO_VEC_STMTS (stmt_vinfo).exists ()
3391       && ((misalignment = dr_misalignment (dr_info, vectype)), true)
3392       && (vect_supportable_dr_alignment (vinfo, dr_info, vectype, misalignment)
3393 	  == dr_explicit_realign_optimized))
3394     {
3395       /* We might access a full vector's worth.  */
3396       access_size += tree_to_uhwi (TYPE_SIZE_UNIT (vectype)) - ref_size;
3397     }
3398   return access_size;
3399 }
3400 
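/* Editorial example with assumed numbers: for the head of a group of 4-byte
   accesses with DR_GROUP_SIZE == 4 and DR_GROUP_GAP == 1, the access size
   above is 4 * (4 - 1) == 12 bytes per scalar iteration; if the access uses
   dr_explicit_realign_optimized, a whole vector may be read, so the
   difference up to the vector size is added on top.  */
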
3401 /* Get the minimum alignment for all the scalar accesses that DR_INFO
3402    describes.  */
3403 
3404 static unsigned int
3405 vect_vfa_align (dr_vec_info *dr_info)
3406 {
3407   return dr_alignment (dr_info->dr);
3408 }
3409 
3410 /* Function vect_no_alias_p.
3411 
3412    Given data references A and B with equal base and offset, see whether
3413    the alias relation can be decided at compilation time.  Return 1 if
3414    it can and the references alias, 0 if it can and the references do
3415    not alias, and -1 if we cannot decide at compile time.  SEGMENT_LENGTH_A,
3416    SEGMENT_LENGTH_B, ACCESS_SIZE_A and ACCESS_SIZE_B are the equivalent
3417    of dr_with_seg_len::{seg_len,access_size} for A and B.  */
3418 
3419 static int
3420 vect_compile_time_alias (dr_vec_info *a, dr_vec_info *b,
3421 			 tree segment_length_a, tree segment_length_b,
3422 			 unsigned HOST_WIDE_INT access_size_a,
3423 			 unsigned HOST_WIDE_INT access_size_b)
3424 {
3425   poly_offset_int offset_a = wi::to_poly_offset (DR_INIT (a->dr));
3426   poly_offset_int offset_b = wi::to_poly_offset (DR_INIT (b->dr));
3427   poly_uint64 const_length_a;
3428   poly_uint64 const_length_b;
3429 
3430   /* For negative step, we need to adjust address range by TYPE_SIZE_UNIT
3431      bytes, e.g., int a[3] -> a[1] range is [a+4, a+16) instead of
3432      [a, a+12) */
3433   if (tree_int_cst_compare (DR_STEP (a->dr), size_zero_node) < 0)
3434     {
3435       const_length_a = (-wi::to_poly_wide (segment_length_a)).force_uhwi ();
3436       offset_a -= const_length_a;
3437     }
3438   else
3439     const_length_a = tree_to_poly_uint64 (segment_length_a);
3440   if (tree_int_cst_compare (DR_STEP (b->dr), size_zero_node) < 0)
3441     {
3442       const_length_b = (-wi::to_poly_wide (segment_length_b)).force_uhwi ();
3443       offset_b -= const_length_b;
3444     }
3445   else
3446     const_length_b = tree_to_poly_uint64 (segment_length_b);
3447 
3448   const_length_a += access_size_a;
3449   const_length_b += access_size_b;
3450 
3451   if (ranges_known_overlap_p (offset_a, const_length_a,
3452 			      offset_b, const_length_b))
3453     return 1;
3454 
3455   if (!ranges_maybe_overlap_p (offset_a, const_length_a,
3456 			       offset_b, const_length_b))
3457     return 0;
3458 
3459   return -1;
3460 }
3461 
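/* Editorial example with assumed numbers: with DR_INITs 0 and 16, positive
   steps, segment lengths of 12 bytes and access sizes of 4 bytes, the ranges
   checked above are [0, 16) and [16, 32), which cannot overlap, so 0 is
   returned and no runtime alias check is needed.  With a second DR_INIT of
   8, the ranges [0, 16) and [8, 24) are known to overlap and 1 is returned,
   which makes vectorization fail with a compile-time alias.  */
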
3462 /* Return true if the minimum nonzero dependence distance for loop LOOP_DEPTH
3463    in DDR is >= VF.  */
3464 
3465 static bool
3466 dependence_distance_ge_vf (data_dependence_relation *ddr,
3467 			   unsigned int loop_depth, poly_uint64 vf)
3468 {
3469   if (DDR_ARE_DEPENDENT (ddr) != NULL_TREE
3470       || DDR_NUM_DIST_VECTS (ddr) == 0)
3471     return false;
3472 
3473   /* If the dependence is exact, we should have limited the VF instead.  */
3474   gcc_checking_assert (DDR_COULD_BE_INDEPENDENT_P (ddr));
3475 
3476   unsigned int i;
3477   lambda_vector dist_v;
3478   FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
3479     {
3480       HOST_WIDE_INT dist = dist_v[loop_depth];
3481       if (dist != 0
3482 	  && !(dist > 0 && DDR_REVERSED_P (ddr))
3483 	  && maybe_lt ((unsigned HOST_WIDE_INT) abs_hwi (dist), vf))
3484 	return false;
3485     }
3486 
3487   if (dump_enabled_p ())
3488     dump_printf_loc (MSG_NOTE, vect_location,
3489 		     "dependence distance between %T and %T is >= VF\n",
3490 		     DR_REF (DDR_A (ddr)), DR_REF (DDR_B (ddr)));
3491 
3492   return true;
3493 }
3494 
3495 /* Dump LOWER_BOUND using flags DUMP_KIND.  Dumps are known to be enabled.  */
3496 
3497 static void
3498 dump_lower_bound (dump_flags_t dump_kind, const vec_lower_bound &lower_bound)
3499 {
3500   dump_printf (dump_kind, "%s (%T) >= ",
3501 	       lower_bound.unsigned_p ? "unsigned" : "abs",
3502 	       lower_bound.expr);
3503   dump_dec (dump_kind, lower_bound.min_value);
3504 }
3505 
3506 /* Record that the vectorized loop requires the vec_lower_bound described
3507    by EXPR, UNSIGNED_P and MIN_VALUE.  */
3508 
3509 static void
3510 vect_check_lower_bound (loop_vec_info loop_vinfo, tree expr, bool unsigned_p,
3511 			poly_uint64 min_value)
3512 {
3513   vec<vec_lower_bound> &lower_bounds
3514     = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo);
3515   for (unsigned int i = 0; i < lower_bounds.length (); ++i)
3516     if (operand_equal_p (lower_bounds[i].expr, expr, 0))
3517       {
3518 	unsigned_p &= lower_bounds[i].unsigned_p;
3519 	min_value = upper_bound (lower_bounds[i].min_value, min_value);
3520 	if (lower_bounds[i].unsigned_p != unsigned_p
3521 	    || maybe_lt (lower_bounds[i].min_value, min_value))
3522 	  {
3523 	    lower_bounds[i].unsigned_p = unsigned_p;
3524 	    lower_bounds[i].min_value = min_value;
3525 	    if (dump_enabled_p ())
3526 	      {
3527 		dump_printf_loc (MSG_NOTE, vect_location,
3528 				 "updating run-time check to ");
3529 		dump_lower_bound (MSG_NOTE, lower_bounds[i]);
3530 		dump_printf (MSG_NOTE, "\n");
3531 	      }
3532 	  }
3533 	return;
3534       }
3535 
3536   vec_lower_bound lower_bound (expr, unsigned_p, min_value);
3537   if (dump_enabled_p ())
3538     {
3539       dump_printf_loc (MSG_NOTE, vect_location, "need a run-time check that ");
3540       dump_lower_bound (MSG_NOTE, lower_bound);
3541       dump_printf (MSG_NOTE, "\n");
3542     }
3543   LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).safe_push (lower_bound);
3544 }
3545 
3546 /* Return true if it's unlikely that the step of the vectorized form of DR_INFO
3547    will span fewer than GAP bytes.  */
3548 
3549 static bool
3550 vect_small_gap_p (loop_vec_info loop_vinfo, dr_vec_info *dr_info,
3551 		  poly_int64 gap)
3552 {
3553   stmt_vec_info stmt_info = dr_info->stmt;
3554   HOST_WIDE_INT count
3555     = estimated_poly_value (LOOP_VINFO_VECT_FACTOR (loop_vinfo));
3556   if (DR_GROUP_FIRST_ELEMENT (stmt_info))
3557     count *= DR_GROUP_SIZE (DR_GROUP_FIRST_ELEMENT (stmt_info));
3558   return (estimated_poly_value (gap)
3559 	  <= count * vect_get_scalar_dr_size (dr_info));
3560 }
3561 
3562 /* Return true if we know that there is no alias between DR_INFO_A and
3563    DR_INFO_B when abs (DR_STEP (DR_INFO_A->dr)) >= N for some N.
3564    When returning true, set *LOWER_BOUND_OUT to this N.  */
3565 
3566 static bool
3567 vectorizable_with_step_bound_p (dr_vec_info *dr_info_a, dr_vec_info *dr_info_b,
3568 				poly_uint64 *lower_bound_out)
3569 {
3570   /* Check that there is a constant gap of known sign between DR_A
3571      and DR_B.  */
3572   data_reference *dr_a = dr_info_a->dr;
3573   data_reference *dr_b = dr_info_b->dr;
3574   poly_int64 init_a, init_b;
3575   if (!operand_equal_p (DR_BASE_ADDRESS (dr_a), DR_BASE_ADDRESS (dr_b), 0)
3576       || !operand_equal_p (DR_OFFSET (dr_a), DR_OFFSET (dr_b), 0)
3577       || !operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0)
3578       || !poly_int_tree_p (DR_INIT (dr_a), &init_a)
3579       || !poly_int_tree_p (DR_INIT (dr_b), &init_b)
3580       || !ordered_p (init_a, init_b))
3581     return false;
3582 
3583   /* Sort DR_A and DR_B by the address they access.  */
3584   if (maybe_lt (init_b, init_a))
3585     {
3586       std::swap (init_a, init_b);
3587       std::swap (dr_info_a, dr_info_b);
3588       std::swap (dr_a, dr_b);
3589     }
3590 
3591   /* If the two accesses could be dependent within a scalar iteration,
3592      make sure that we'd retain their order.  */
3593   if (maybe_gt (init_a + vect_get_scalar_dr_size (dr_info_a), init_b)
3594       && !vect_preserves_scalar_order_p (dr_info_a, dr_info_b))
3595     return false;
3596 
3597   /* There is no alias if abs (DR_STEP) is greater than or equal to
3598      the bytes spanned by the combination of the two accesses.  */
3599   *lower_bound_out = init_b + vect_get_scalar_dr_size (dr_info_b) - init_a;
3600   return true;
3601 }
3602 
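/* Editorial example with assumed numbers: for two accesses with the same
   base, offset and step, DR_INITs 0 and 4 and 4-byte elements, the lower
   bound computed above is 4 + 4 - 0 == 8; if abs (DR_STEP) is at least 8
   bytes at run time the segments cannot alias, which is what
   vect_prune_runtime_alias_test_list turns into a cheap bound check on the
   step instead of a full segment-overlap test.  */
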
3603 /* Function vect_prune_runtime_alias_test_list.
3604 
3605    Prune a list of ddrs to be tested at run-time by versioning for alias.
3606    Merge several alias checks into one if possible.
3607    Return FALSE if the resulting list of ddrs is longer than allowed by
3608    PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE.  */
3609 
3610 opt_result
3611 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
3612 {
3613   typedef pair_hash <tree_operand_hash, tree_operand_hash> tree_pair_hash;
3614   hash_set <tree_pair_hash> compared_objects;
3615 
3616   const vec<ddr_p> &may_alias_ddrs = LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
3617   vec<dr_with_seg_len_pair_t> &comp_alias_ddrs
3618     = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
3619   const vec<vec_object_pair> &check_unequal_addrs
3620     = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo);
3621   poly_uint64 vect_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3622   tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
3623 
3624   ddr_p ddr;
3625   unsigned int i;
3626   tree length_factor;
3627 
3628   DUMP_VECT_SCOPE ("vect_prune_runtime_alias_test_list");
3629 
3630   /* Step values are irrelevant for aliasing if the number of vector
3631      iterations is equal to the number of scalar iterations (which can
3632      happen for fully-SLP loops).  */
3633   bool vf_one_p = known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U);
3634 
3635   if (!vf_one_p)
3636     {
3637       /* Convert the checks for nonzero steps into bound tests.  */
3638       tree value;
3639       FOR_EACH_VEC_ELT (LOOP_VINFO_CHECK_NONZERO (loop_vinfo), i, value)
3640 	vect_check_lower_bound (loop_vinfo, value, true, 1);
3641     }
3642 
3643   if (may_alias_ddrs.is_empty ())
3644     return opt_result::success ();
3645 
3646   comp_alias_ddrs.create (may_alias_ddrs.length ());
3647 
3648   unsigned int loop_depth
3649     = index_in_loop_nest (LOOP_VINFO_LOOP (loop_vinfo)->num,
3650 			  LOOP_VINFO_LOOP_NEST (loop_vinfo));
3651 
3652   /* First, we collect all data ref pairs for aliasing checks.  */
3653   FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr)
3654     {
3655       poly_uint64 lower_bound;
3656       tree segment_length_a, segment_length_b;
3657       unsigned HOST_WIDE_INT access_size_a, access_size_b;
3658       unsigned int align_a, align_b;
3659 
3660       /* Ignore the alias if the VF we chose ended up being no greater
3661 	 than the dependence distance.  */
3662       if (dependence_distance_ge_vf (ddr, loop_depth, vect_factor))
3663 	continue;
3664 
3665       if (DDR_OBJECT_A (ddr))
3666 	{
3667 	  vec_object_pair new_pair (DDR_OBJECT_A (ddr), DDR_OBJECT_B (ddr));
3668 	  if (!compared_objects.add (new_pair))
3669 	    {
3670 	      if (dump_enabled_p ())
3671 		dump_printf_loc (MSG_NOTE, vect_location,
3672 				 "checking that %T and %T"
3673 				 " have different addresses\n",
3674 				 new_pair.first, new_pair.second);
3675 	      LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).safe_push (new_pair);
3676 	    }
3677 	  continue;
3678 	}
3679 
3680       dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (DDR_A (ddr));
3681       stmt_vec_info stmt_info_a = dr_info_a->stmt;
3682 
3683       dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (DDR_B (ddr));
3684       stmt_vec_info stmt_info_b = dr_info_b->stmt;
3685 
3686       bool preserves_scalar_order_p
3687 	= vect_preserves_scalar_order_p (dr_info_a, dr_info_b);
3688       bool ignore_step_p
3689 	  = (vf_one_p
3690 	     && (preserves_scalar_order_p
3691 		 || operand_equal_p (DR_STEP (dr_info_a->dr),
3692 				     DR_STEP (dr_info_b->dr))));
3693 
3694       /* Skip the pair if inter-iteration dependencies are irrelevant
3695 	 and intra-iteration dependencies are guaranteed to be honored.  */
3696       if (ignore_step_p
3697 	  && (preserves_scalar_order_p
3698 	      || vectorizable_with_step_bound_p (dr_info_a, dr_info_b,
3699 						 &lower_bound)))
3700 	{
3701 	  if (dump_enabled_p ())
3702 	    dump_printf_loc (MSG_NOTE, vect_location,
3703 			     "no need for alias check between "
3704 			     "%T and %T when VF is 1\n",
3705 			     DR_REF (dr_info_a->dr), DR_REF (dr_info_b->dr));
3706 	  continue;
3707 	}
3708 
3709       /* See whether we can handle the alias using a bounds check on
3710 	 the step, and whether that's likely to be the best approach.
3711 	 (It might not be, for example, if the minimum step is much larger
3712 	 than the number of bytes handled by one vector iteration.)  */
3713       if (!ignore_step_p
3714 	  && TREE_CODE (DR_STEP (dr_info_a->dr)) != INTEGER_CST
3715 	  && vectorizable_with_step_bound_p (dr_info_a, dr_info_b,
3716 					     &lower_bound)
3717 	  && (vect_small_gap_p (loop_vinfo, dr_info_a, lower_bound)
3718 	      || vect_small_gap_p (loop_vinfo, dr_info_b, lower_bound)))
3719 	{
3720 	  bool unsigned_p = dr_known_forward_stride_p (dr_info_a->dr);
3721 	  if (dump_enabled_p ())
3722 	    {
3723 	      dump_printf_loc (MSG_NOTE, vect_location, "no alias between "
3724 			       "%T and %T when the step %T is outside ",
3725 			       DR_REF (dr_info_a->dr),
3726 			       DR_REF (dr_info_b->dr),
3727 			       DR_STEP (dr_info_a->dr));
3728 	      if (unsigned_p)
3729 		dump_printf (MSG_NOTE, "[0");
3730 	      else
3731 		{
3732 		  dump_printf (MSG_NOTE, "(");
3733 		  dump_dec (MSG_NOTE, poly_int64 (-lower_bound));
3734 		}
3735 	      dump_printf (MSG_NOTE, ", ");
3736 	      dump_dec (MSG_NOTE, lower_bound);
3737 	      dump_printf (MSG_NOTE, ")\n");
3738 	    }
3739 	  vect_check_lower_bound (loop_vinfo, DR_STEP (dr_info_a->dr),
3740 				  unsigned_p, lower_bound);
3741 	  continue;
3742 	}
3743 
3744       stmt_vec_info dr_group_first_a = DR_GROUP_FIRST_ELEMENT (stmt_info_a);
3745       if (dr_group_first_a)
3746 	{
3747 	  stmt_info_a = dr_group_first_a;
3748 	  dr_info_a = STMT_VINFO_DR_INFO (stmt_info_a);
3749 	}
3750 
3751       stmt_vec_info dr_group_first_b = DR_GROUP_FIRST_ELEMENT (stmt_info_b);
3752       if (dr_group_first_b)
3753 	{
3754 	  stmt_info_b = dr_group_first_b;
3755 	  dr_info_b = STMT_VINFO_DR_INFO (stmt_info_b);
3756 	}
3757 
3758       if (ignore_step_p)
3759 	{
3760 	  segment_length_a = size_zero_node;
3761 	  segment_length_b = size_zero_node;
3762 	}
3763       else
3764 	{
3765 	  if (!operand_equal_p (DR_STEP (dr_info_a->dr),
3766 				DR_STEP (dr_info_b->dr), 0))
3767 	    length_factor = scalar_loop_iters;
3768 	  else
3769 	    length_factor = size_int (vect_factor);
3770 	  segment_length_a = vect_vfa_segment_size (dr_info_a, length_factor);
3771 	  segment_length_b = vect_vfa_segment_size (dr_info_b, length_factor);
3772 	}
3773       access_size_a = vect_vfa_access_size (loop_vinfo, dr_info_a);
3774       access_size_b = vect_vfa_access_size (loop_vinfo, dr_info_b);
3775       align_a = vect_vfa_align (dr_info_a);
3776       align_b = vect_vfa_align (dr_info_b);
3777 
3778       /* See whether the alias is known at compilation time.  */
3779       if (operand_equal_p (DR_BASE_ADDRESS (dr_info_a->dr),
3780 			   DR_BASE_ADDRESS (dr_info_b->dr), 0)
3781 	  && operand_equal_p (DR_OFFSET (dr_info_a->dr),
3782 			      DR_OFFSET (dr_info_b->dr), 0)
3783 	  && TREE_CODE (DR_STEP (dr_info_a->dr)) == INTEGER_CST
3784 	  && TREE_CODE (DR_STEP (dr_info_b->dr)) == INTEGER_CST
3785 	  && poly_int_tree_p (segment_length_a)
3786 	  && poly_int_tree_p (segment_length_b))
3787 	{
3788 	  int res = vect_compile_time_alias (dr_info_a, dr_info_b,
3789 					     segment_length_a,
3790 					     segment_length_b,
3791 					     access_size_a,
3792 					     access_size_b);
3793 	  if (res >= 0 && dump_enabled_p ())
3794 	    {
3795 	      dump_printf_loc (MSG_NOTE, vect_location,
3796 			       "can tell at compile time that %T and %T",
3797 			       DR_REF (dr_info_a->dr), DR_REF (dr_info_b->dr));
3798 	      if (res == 0)
3799 		dump_printf (MSG_NOTE, " do not alias\n");
3800 	      else
3801 		dump_printf (MSG_NOTE, " alias\n");
3802 	    }
3803 
3804 	  if (res == 0)
3805 	    continue;
3806 
3807 	  if (res == 1)
3808 	    return opt_result::failure_at (stmt_info_b->stmt,
3809 					   "not vectorized:"
3810 					   " compilation time alias: %G%G",
3811 					   stmt_info_a->stmt,
3812 					   stmt_info_b->stmt);
3813 	}
3814 
3815       dr_with_seg_len dr_a (dr_info_a->dr, segment_length_a,
3816 			    access_size_a, align_a);
3817       dr_with_seg_len dr_b (dr_info_b->dr, segment_length_b,
3818 			    access_size_b, align_b);
3819       /* Canonicalize the order to be the one that's needed for accurate
3820 	 RAW, WAR and WAW flags, in cases where the data references are
3821 	 well-ordered.  The order doesn't really matter otherwise,
3822 	 but we might as well be consistent.  */
3823       if (get_later_stmt (stmt_info_a, stmt_info_b) == stmt_info_a)
3824 	std::swap (dr_a, dr_b);
3825 
3826       dr_with_seg_len_pair_t dr_with_seg_len_pair
3827 	(dr_a, dr_b, (preserves_scalar_order_p
3828 		      ? dr_with_seg_len_pair_t::WELL_ORDERED
3829 		      : dr_with_seg_len_pair_t::REORDERED));
3830 
3831       comp_alias_ddrs.safe_push (dr_with_seg_len_pair);
3832     }
3833 
3834   prune_runtime_alias_test_list (&comp_alias_ddrs, vect_factor);
3835 
3836   unsigned int count = (comp_alias_ddrs.length ()
3837 			+ check_unequal_addrs.length ());
3838 
3839   if (count
3840       && (loop_cost_model (LOOP_VINFO_LOOP (loop_vinfo))
3841 	  == VECT_COST_MODEL_VERY_CHEAP))
3842     return opt_result::failure_at
3843       (vect_location, "would need a runtime alias check\n");
3844 
3845   if (dump_enabled_p ())
3846     dump_printf_loc (MSG_NOTE, vect_location,
3847 		     "improved number of alias checks from %d to %d\n",
3848 		     may_alias_ddrs.length (), count);
3849   unsigned limit = param_vect_max_version_for_alias_checks;
3850   if (loop_cost_model (LOOP_VINFO_LOOP (loop_vinfo)) == VECT_COST_MODEL_CHEAP)
3851     limit = param_vect_max_version_for_alias_checks * 6 / 10;
3852   if (count > limit)
3853     return opt_result::failure_at
3854       (vect_location,
3855        "number of versioning for alias run-time tests exceeds %d "
3856        "(--param vect-max-version-for-alias-checks)\n", limit);
3857 
3858   return opt_result::success ();
3859 }
3860 
3861 /* Check whether we can use an internal function for a gather load
3862    or scatter store.  READ_P is true for loads and false for stores.
3863    MASKED_P is true if the load or store is conditional.  MEMORY_TYPE is
3864    the type of the memory elements being loaded or stored.  OFFSET_TYPE
3865    is the type of the offset that is being applied to the invariant
3866    base address.  SCALE is the amount by which the offset should
3867    be multiplied *after* it has been converted to address width.
3868 
3869    Return true if the function is supported, storing the function id in
3870    *IFN_OUT and the vector type for the offset in *OFFSET_VECTYPE_OUT.  */
3871 
3872 bool
3873 vect_gather_scatter_fn_p (vec_info *vinfo, bool read_p, bool masked_p,
3874 			  tree vectype, tree memory_type, tree offset_type,
3875 			  int scale, internal_fn *ifn_out,
3876 			  tree *offset_vectype_out)
3877 {
3878   unsigned int memory_bits = tree_to_uhwi (TYPE_SIZE (memory_type));
3879   unsigned int element_bits = vector_element_bits (vectype);
3880   if (element_bits != memory_bits)
3881     /* For now the vector elements must be the same width as the
3882        memory elements.  */
3883     return false;
3884 
3885   /* Work out which function we need.  */
3886   internal_fn ifn, alt_ifn;
3887   if (read_p)
3888     {
3889       ifn = masked_p ? IFN_MASK_GATHER_LOAD : IFN_GATHER_LOAD;
3890       alt_ifn = IFN_MASK_GATHER_LOAD;
3891     }
3892   else
3893     {
3894       ifn = masked_p ? IFN_MASK_SCATTER_STORE : IFN_SCATTER_STORE;
3895       alt_ifn = IFN_MASK_SCATTER_STORE;
3896     }
3897 
3898   for (;;)
3899     {
3900       tree offset_vectype = get_vectype_for_scalar_type (vinfo, offset_type);
3901       if (!offset_vectype)
3902 	return false;
3903 
3904       /* Test whether the target supports this combination.  */
3905       if (internal_gather_scatter_fn_supported_p (ifn, vectype, memory_type,
3906 						  offset_vectype, scale))
3907 	{
3908 	  *ifn_out = ifn;
3909 	  *offset_vectype_out = offset_vectype;
3910 	  return true;
3911 	}
3912       else if (!masked_p
3913 	       && internal_gather_scatter_fn_supported_p (alt_ifn, vectype,
3914 							  memory_type,
3915 							  offset_vectype,
3916 							  scale))
3917 	{
3918 	  *ifn_out = alt_ifn;
3919 	  *offset_vectype_out = offset_vectype;
3920 	  return true;
3921 	}
3922 
3923       if (TYPE_PRECISION (offset_type) >= POINTER_SIZE
3924 	  && TYPE_PRECISION (offset_type) >= element_bits)
3925 	return false;
3926 
3927       offset_type = build_nonstandard_integer_type
3928 	(TYPE_PRECISION (offset_type) * 2, TYPE_UNSIGNED (offset_type));
3929     }
3930 }
3931 
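/* Editorial sketch (hedged): if the caller asks for a gather of 32-bit
   elements with a 16-bit OFFSET_TYPE and the target only provides
   IFN_GATHER_LOAD for 32-bit offset vectors, the first
   internal_gather_scatter_fn_supported_p test above fails, the offset type
   is widened to 32 bits and the retry succeeds; widening stops once the
   precision reaches both POINTER_SIZE and the element width.  */
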
3932 /* STMT_INFO is a call to an internal gather load or scatter store function.
3933    Describe the operation in INFO.  */
3934 
3935 static void
3936 vect_describe_gather_scatter_call (stmt_vec_info stmt_info,
3937 				   gather_scatter_info *info)
3938 {
3939   gcall *call = as_a <gcall *> (stmt_info->stmt);
3940   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3941   data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3942 
3943   info->ifn = gimple_call_internal_fn (call);
3944   info->decl = NULL_TREE;
3945   info->base = gimple_call_arg (call, 0);
3946   info->offset = gimple_call_arg (call, 1);
3947   info->offset_dt = vect_unknown_def_type;
3948   info->offset_vectype = NULL_TREE;
3949   info->scale = TREE_INT_CST_LOW (gimple_call_arg (call, 2));
3950   info->element_type = TREE_TYPE (vectype);
3951   info->memory_type = TREE_TYPE (DR_REF (dr));
3952 }
3953 
3954 /* Return true if a non-affine read or write in STMT_INFO is suitable for a
3955    gather load or scatter store.  Describe the operation in *INFO if so.  */
3956 
3957 bool
3958 vect_check_gather_scatter (stmt_vec_info stmt_info, loop_vec_info loop_vinfo,
3959 			   gather_scatter_info *info)
3960 {
3961   HOST_WIDE_INT scale = 1;
3962   poly_int64 pbitpos, pbitsize;
3963   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3964   struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3965   tree offtype = NULL_TREE;
3966   tree decl = NULL_TREE, base, off;
3967   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3968   tree memory_type = TREE_TYPE (DR_REF (dr));
3969   machine_mode pmode;
3970   int punsignedp, reversep, pvolatilep = 0;
3971   internal_fn ifn;
3972   tree offset_vectype;
3973   bool masked_p = false;
3974 
3975   /* See whether this is already a call to a gather/scatter internal function.
3976      If not, see whether it's a masked load or store.  */
3977   gcall *call = dyn_cast <gcall *> (stmt_info->stmt);
3978   if (call && gimple_call_internal_p (call))
3979     {
3980       ifn = gimple_call_internal_fn (call);
3981       if (internal_gather_scatter_fn_p (ifn))
3982 	{
3983 	  vect_describe_gather_scatter_call (stmt_info, info);
3984 	  return true;
3985 	}
3986       masked_p = (ifn == IFN_MASK_LOAD || ifn == IFN_MASK_STORE);
3987     }
3988 
3989   /* True if we should aim to use internal functions rather than
3990      built-in functions.  */
3991   bool use_ifn_p = (DR_IS_READ (dr)
3992 		    ? supports_vec_gather_load_p (TYPE_MODE (vectype))
3993 		    : supports_vec_scatter_store_p (TYPE_MODE (vectype)));
3994 
3995   base = DR_REF (dr);
3996   /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF;
3997      see if we can use the def stmt of the address.  */
3998   if (masked_p
3999       && TREE_CODE (base) == MEM_REF
4000       && TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME
4001       && integer_zerop (TREE_OPERAND (base, 1))
4002       && !expr_invariant_in_loop_p (loop, TREE_OPERAND (base, 0)))
4003     {
4004       gimple *def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (base, 0));
4005       if (is_gimple_assign (def_stmt)
4006 	  && gimple_assign_rhs_code (def_stmt) == ADDR_EXPR)
4007 	base = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
4008     }
4009 
4010   /* The gather and scatter builtins need address of the form
4011      loop_invariant + vector * {1, 2, 4, 8}
4012      or
4013      loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
4014      Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
4015      of loop invariants/SSA_NAMEs defined in the loop, with casts,
4016      multiplications and additions in it.  To get a vector, we need
4017      a single SSA_NAME that will be defined in the loop and will
4018      contain everything that is not loop invariant and that can be
4019      vectorized.  The following code attempts to find such a preexisting
4020      SSA_NAME OFF and put the loop invariants into a tree BASE
4021      that can be gimplified before the loop.  */
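  /* Editorial illustration (variable names made up): for a gather such as

       x += a[idx[i]];

     the loop-invariant part &a typically ends up in BASE, the SSA_NAME
     holding idx[i] (after the loop below has peeled off invariant additions,
     scalings and conversions) becomes OFF, and SCALE ends up as
     sizeof (a[0]).  */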
4022   base = get_inner_reference (base, &pbitsize, &pbitpos, &off, &pmode,
4023 			      &punsignedp, &reversep, &pvolatilep);
4024   if (reversep)
4025     return false;
4026 
4027   poly_int64 pbytepos = exact_div (pbitpos, BITS_PER_UNIT);
4028 
4029   if (TREE_CODE (base) == MEM_REF)
4030     {
4031       if (!integer_zerop (TREE_OPERAND (base, 1)))
4032 	{
4033 	  if (off == NULL_TREE)
4034 	    off = wide_int_to_tree (sizetype, mem_ref_offset (base));
4035 	  else
4036 	    off = size_binop (PLUS_EXPR, off,
4037 			      fold_convert (sizetype, TREE_OPERAND (base, 1)));
4038 	}
4039       base = TREE_OPERAND (base, 0);
4040     }
4041   else
4042     base = build_fold_addr_expr (base);
4043 
4044   if (off == NULL_TREE)
4045     off = size_zero_node;
4046 
4047   /* If base is not loop invariant, then either OFF is 0 and we start with
4048      just the constant offset in the loop-invariant BASE, continuing with
4049      base as OFF, or we give up.
4050      We could handle that case by gimplifying the addition of base + off
4051      into some SSA_NAME and using that as OFF, but for now punt.  */
4052   if (!expr_invariant_in_loop_p (loop, base))
4053     {
4054       if (!integer_zerop (off))
4055 	return false;
4056       off = base;
4057       base = size_int (pbytepos);
4058     }
4059   /* Otherwise put base + constant offset into the loop invariant BASE
4060      and continue with OFF.  */
4061   else
4062     {
4063       base = fold_convert (sizetype, base);
4064       base = size_binop (PLUS_EXPR, base, size_int (pbytepos));
4065     }
4066 
4067   /* OFF at this point may be either a SSA_NAME or some tree expression
4068      from get_inner_reference.  Try to peel off loop invariants from it
4069      into BASE as long as possible.  */
4070   STRIP_NOPS (off);
4071   while (offtype == NULL_TREE)
4072     {
4073       enum tree_code code;
4074       tree op0, op1, add = NULL_TREE;
4075 
4076       if (TREE_CODE (off) == SSA_NAME)
4077 	{
4078 	  gimple *def_stmt = SSA_NAME_DEF_STMT (off);
4079 
4080 	  if (expr_invariant_in_loop_p (loop, off))
4081 	    return false;
4082 
4083 	  if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
4084 	    break;
4085 
4086 	  op0 = gimple_assign_rhs1 (def_stmt);
4087 	  code = gimple_assign_rhs_code (def_stmt);
4088 	  op1 = gimple_assign_rhs2 (def_stmt);
4089 	}
4090       else
4091 	{
4092 	  if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
4093 	    return false;
4094 	  code = TREE_CODE (off);
4095 	  extract_ops_from_tree (off, &code, &op0, &op1);
4096 	}
4097       switch (code)
4098 	{
4099 	case POINTER_PLUS_EXPR:
4100 	case PLUS_EXPR:
4101 	  if (expr_invariant_in_loop_p (loop, op0))
4102 	    {
4103 	      add = op0;
4104 	      off = op1;
4105 	    do_add:
4106 	      add = fold_convert (sizetype, add);
4107 	      if (scale != 1)
4108 		add = size_binop (MULT_EXPR, add, size_int (scale));
4109 	      base = size_binop (PLUS_EXPR, base, add);
4110 	      continue;
4111 	    }
4112 	  if (expr_invariant_in_loop_p (loop, op1))
4113 	    {
4114 	      add = op1;
4115 	      off = op0;
4116 	      goto do_add;
4117 	    }
4118 	  break;
4119 	case MINUS_EXPR:
4120 	  if (expr_invariant_in_loop_p (loop, op1))
4121 	    {
4122 	      add = fold_convert (sizetype, op1);
4123 	      add = size_binop (MINUS_EXPR, size_zero_node, add);
4124 	      off = op0;
4125 	      goto do_add;
4126 	    }
4127 	  break;
4128 	case MULT_EXPR:
4129 	  if (scale == 1 && tree_fits_shwi_p (op1))
4130 	    {
4131 	      int new_scale = tree_to_shwi (op1);
4132 	      /* Only treat this as a scaling operation if the target
4133 		 supports it for at least some offset type.  */
4134 	      if (use_ifn_p
4135 		  && !vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr),
4136 						masked_p, vectype, memory_type,
4137 						signed_char_type_node,
4138 						new_scale, &ifn,
4139 						&offset_vectype)
4140 		  && !vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr),
4141 						masked_p, vectype, memory_type,
4142 						unsigned_char_type_node,
4143 						new_scale, &ifn,
4144 						&offset_vectype))
4145 		break;
4146 	      scale = new_scale;
4147 	      off = op0;
4148 	      continue;
4149 	    }
4150 	  break;
4151 	case SSA_NAME:
4152 	  off = op0;
4153 	  continue;
4154 	CASE_CONVERT:
4155 	  if (!POINTER_TYPE_P (TREE_TYPE (op0))
4156 	      && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
4157 	    break;
4158 
4159 	  /* Don't include the conversion if the target is happy with
4160 	     the current offset type.  */
4161 	  if (use_ifn_p
4162 	      && !POINTER_TYPE_P (TREE_TYPE (off))
4163 	      && vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr),
4164 					   masked_p, vectype, memory_type,
4165 					   TREE_TYPE (off), scale, &ifn,
4166 					   &offset_vectype))
4167 	    break;
4168 
4169 	  if (TYPE_PRECISION (TREE_TYPE (op0))
4170 	      == TYPE_PRECISION (TREE_TYPE (off)))
4171 	    {
4172 	      off = op0;
4173 	      continue;
4174 	    }
4175 
4176 	  /* Include the conversion if it is widening and we're using
4177 	     the IFN path, or the target can handle the converted-from
4178 	     offset type, or the current size is not already the same as
4179 	     the data vector element size.  */
4180 	  if ((TYPE_PRECISION (TREE_TYPE (op0))
4181 	       < TYPE_PRECISION (TREE_TYPE (off)))
4182 	      && (use_ifn_p
4183 		  || (DR_IS_READ (dr)
4184 		      ? (targetm.vectorize.builtin_gather
4185 			 && targetm.vectorize.builtin_gather (vectype,
4186 							      TREE_TYPE (op0),
4187 							      scale))
4188 		      : (targetm.vectorize.builtin_scatter
4189 			 && targetm.vectorize.builtin_scatter (vectype,
4190 							       TREE_TYPE (op0),
4191 							       scale)))
4192 		  || !operand_equal_p (TYPE_SIZE (TREE_TYPE (off)),
4193 				       TYPE_SIZE (TREE_TYPE (vectype)), 0)))
4194 	    {
4195 	      off = op0;
4196 	      offtype = TREE_TYPE (off);
4197 	      STRIP_NOPS (off);
4198 	      continue;
4199 	    }
4200 	  break;
4201 	default:
4202 	  break;
4203 	}
4204       break;
4205     }
4206 
4207   /* If at the end OFF still isn't an SSA_NAME or isn't
4208      defined in the loop, punt.  */
4209   if (TREE_CODE (off) != SSA_NAME
4210       || expr_invariant_in_loop_p (loop, off))
4211     return false;
4212 
4213   if (offtype == NULL_TREE)
4214     offtype = TREE_TYPE (off);
4215 
4216   if (use_ifn_p)
4217     {
4218       if (!vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr), masked_p,
4219 				     vectype, memory_type, offtype, scale,
4220 				     &ifn, &offset_vectype))
4221 	ifn = IFN_LAST;
4222       decl = NULL_TREE;
4223     }
4224   else
4225     {
4226       if (DR_IS_READ (dr))
4227 	{
4228 	  if (targetm.vectorize.builtin_gather)
4229 	    decl = targetm.vectorize.builtin_gather (vectype, offtype, scale);
4230 	}
4231       else
4232 	{
4233 	  if (targetm.vectorize.builtin_scatter)
4234 	    decl = targetm.vectorize.builtin_scatter (vectype, offtype, scale);
4235 	}
4236       ifn = IFN_LAST;
4237       /* The offset vector type will be read from DECL when needed.  */
4238       offset_vectype = NULL_TREE;
4239     }
4240 
4241   info->ifn = ifn;
4242   info->decl = decl;
4243   info->base = base;
4244   info->offset = off;
4245   info->offset_dt = vect_unknown_def_type;
4246   info->offset_vectype = offset_vectype;
4247   info->scale = scale;
4248   info->element_type = TREE_TYPE (vectype);
4249   info->memory_type = memory_type;
4250   return true;
4251 }
4252 
4253 /* Find the data references in STMT, analyze them with respect to LOOP and
4254    append them to DATAREFS.  Return failure if the datarefs in this stmt
4255    cannot be handled.  */
4256 
4257 opt_result
4258 vect_find_stmt_data_reference (loop_p loop, gimple *stmt,
4259 			       vec<data_reference_p> *datarefs,
4260 			       vec<int> *dataref_groups, int group_id)
4261 {
4262   /* We can ignore clobbers for dataref analysis - they are removed during
4263      loop vectorization and BB vectorization checks dependences with a
4264      stmt walk.  */
4265   if (gimple_clobber_p (stmt))
4266     return opt_result::success ();
4267 
4268   if (gimple_has_volatile_ops (stmt))
4269     return opt_result::failure_at (stmt, "not vectorized: volatile type: %G",
4270 				   stmt);
4271 
4272   if (stmt_can_throw_internal (cfun, stmt))
4273     return opt_result::failure_at (stmt,
4274 				   "not vectorized:"
4275 				   " statement can throw an exception: %G",
4276 				   stmt);
4277 
4278   auto_vec<data_reference_p, 2> refs;
4279   opt_result res = find_data_references_in_stmt (loop, stmt, &refs);
4280   if (!res)
4281     return res;
4282 
4283   if (refs.is_empty ())
4284     return opt_result::success ();
4285 
4286   if (refs.length () > 1)
4287     {
4288       while (!refs.is_empty ())
4289 	free_data_ref (refs.pop ());
4290       return opt_result::failure_at (stmt,
4291 				     "not vectorized: more than one "
4292 				     "data ref in stmt: %G", stmt);
4293     }
4294 
4295   data_reference_p dr = refs.pop ();
4296   if (gcall *call = dyn_cast <gcall *> (stmt))
4297     if (!gimple_call_internal_p (call)
4298 	|| (gimple_call_internal_fn (call) != IFN_MASK_LOAD
4299 	    && gimple_call_internal_fn (call) != IFN_MASK_STORE))
4300       {
4301 	free_data_ref (dr);
4302 	return opt_result::failure_at (stmt,
4303 				       "not vectorized: dr in a call %G", stmt);
4304       }
4305 
4306   if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
4307       && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
4308     {
4309       free_data_ref (dr);
4310       return opt_result::failure_at (stmt,
4311 				     "not vectorized:"
4312 				     " statement is bitfield access %G", stmt);
4313     }
4314 
4315   if (DR_BASE_ADDRESS (dr)
4316       && TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST)
4317     {
4318       free_data_ref (dr);
4319       return opt_result::failure_at (stmt,
4320 				     "not vectorized:"
4321 				     " base addr of dr is a constant\n");
4322     }
4323 
4324   /* Check whether this may be a SIMD lane access and adjust the
4325      DR to make it easier for us to handle it.  */
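  /* Illustrative sketch (simplified; t_array and simduid are
     hypothetical names): for an OpenMP simd loop such as

       #pragma omp simd private (t)
       for (int i = 0; i < n; i++)
	 { t = a[i] + 1; b[i] = t; }

     the OMP lowering gives the privatized T a per-lane slot accessed
     roughly as t_array[GOMP_SIMD_LANE (simduid)].  Such a reference has
     no constant DR_STEP at first; the code below rewrites it into an
     access whose step is the element size and marks it as a SIMD lane
     access.  */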
4326   if (loop
4327       && loop->simduid
4328       && (!DR_BASE_ADDRESS (dr)
4329 	  || !DR_OFFSET (dr)
4330 	  || !DR_INIT (dr)
4331 	  || !DR_STEP (dr)))
4332     {
4333       struct data_reference *newdr
4334 	= create_data_ref (NULL, loop_containing_stmt (stmt), DR_REF (dr), stmt,
4335 			   DR_IS_READ (dr), DR_IS_CONDITIONAL_IN_STMT (dr));
4336       if (DR_BASE_ADDRESS (newdr)
4337 	  && DR_OFFSET (newdr)
4338 	  && DR_INIT (newdr)
4339 	  && DR_STEP (newdr)
4340 	  && TREE_CODE (DR_INIT (newdr)) == INTEGER_CST
4341 	  && integer_zerop (DR_STEP (newdr)))
4342 	{
4343 	  tree base_address = DR_BASE_ADDRESS (newdr);
4344 	  tree off = DR_OFFSET (newdr);
4345 	  tree step = ssize_int (1);
4346 	  if (integer_zerop (off)
4347 	      && TREE_CODE (base_address) == POINTER_PLUS_EXPR)
4348 	    {
4349 	      off = TREE_OPERAND (base_address, 1);
4350 	      base_address = TREE_OPERAND (base_address, 0);
4351 	    }
4352 	  STRIP_NOPS (off);
4353 	  if (TREE_CODE (off) == MULT_EXPR
4354 	      && tree_fits_uhwi_p (TREE_OPERAND (off, 1)))
4355 	    {
4356 	      step = TREE_OPERAND (off, 1);
4357 	      off = TREE_OPERAND (off, 0);
4358 	      STRIP_NOPS (off);
4359 	    }
4360 	  if (CONVERT_EXPR_P (off)
4361 	      && (TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off, 0)))
4362 		  < TYPE_PRECISION (TREE_TYPE (off))))
4363 	    off = TREE_OPERAND (off, 0);
4364 	  if (TREE_CODE (off) == SSA_NAME)
4365 	    {
4366 	      gimple *def = SSA_NAME_DEF_STMT (off);
4367 	      /* Look through widening conversion.  */
4368 	      if (is_gimple_assign (def)
4369 		  && CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (def)))
4370 		{
4371 		  tree rhs1 = gimple_assign_rhs1 (def);
4372 		  if (TREE_CODE (rhs1) == SSA_NAME
4373 		      && INTEGRAL_TYPE_P (TREE_TYPE (rhs1))
4374 		      && (TYPE_PRECISION (TREE_TYPE (off))
4375 			  > TYPE_PRECISION (TREE_TYPE (rhs1))))
4376 		    def = SSA_NAME_DEF_STMT (rhs1);
4377 		}
4378 	      if (is_gimple_call (def)
4379 		  && gimple_call_internal_p (def)
4380 		  && (gimple_call_internal_fn (def) == IFN_GOMP_SIMD_LANE))
4381 		{
4382 		  tree arg = gimple_call_arg (def, 0);
4383 		  tree reft = TREE_TYPE (DR_REF (newdr));
4384 		  gcc_assert (TREE_CODE (arg) == SSA_NAME);
4385 		  arg = SSA_NAME_VAR (arg);
4386 		  if (arg == loop->simduid
4387 		      /* For now.  */
4388 		      && tree_int_cst_equal (TYPE_SIZE_UNIT (reft), step))
4389 		    {
4390 		      DR_BASE_ADDRESS (newdr) = base_address;
4391 		      DR_OFFSET (newdr) = ssize_int (0);
4392 		      DR_STEP (newdr) = step;
4393 		      DR_OFFSET_ALIGNMENT (newdr) = BIGGEST_ALIGNMENT;
4394 		      DR_STEP_ALIGNMENT (newdr) = highest_pow2_factor (step);
4395 		      /* Mark as simd-lane access.  */
4396 		      tree arg2 = gimple_call_arg (def, 1);
4397 		      newdr->aux = (void *) (-1 - tree_to_uhwi (arg2));
4398 		      free_data_ref (dr);
4399 		      datarefs->safe_push (newdr);
4400 		      if (dataref_groups)
4401 			dataref_groups->safe_push (group_id);
4402 		      return opt_result::success ();
4403 		    }
4404 		}
4405 	    }
4406 	}
4407       free_data_ref (newdr);
4408     }
4409 
4410   datarefs->safe_push (dr);
4411   if (dataref_groups)
4412     dataref_groups->safe_push (group_id);
4413   return opt_result::success ();
4414 }
4415 
4416 /* Function vect_analyze_data_refs.
4417 
4418   Find all the data references in the loop or basic block.
4419 
4420    The general structure of the analysis of data refs in the vectorizer is as
4421    follows:
4422    1- vect_analyze_data_refs(loop/bb): call
4423       compute_data_dependences_for_loop/bb to find and analyze all data-refs
4424       in the loop/bb and their dependences.
4425    2- vect_analyze_dependences(): apply dependence testing using ddrs.
4426    3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
4427    4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
4428 
4429 */
4430 
4431 opt_result
vect_analyze_data_refs(vec_info * vinfo,poly_uint64 * min_vf,bool * fatal)4432 vect_analyze_data_refs (vec_info *vinfo, poly_uint64 *min_vf, bool *fatal)
4433 {
4434   class loop *loop = NULL;
4435   unsigned int i;
4436   struct data_reference *dr;
4437   tree scalar_type;
4438 
4439   DUMP_VECT_SCOPE ("vect_analyze_data_refs");
4440 
4441   if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
4442     loop = LOOP_VINFO_LOOP (loop_vinfo);
4443 
4444   /* Go through the data-refs, check that the analysis succeeded.  Update
4445      pointer from stmt_vec_info struct to DR and vectype.  */
4446 
4447   vec<data_reference_p> datarefs = vinfo->shared->datarefs;
4448   FOR_EACH_VEC_ELT (datarefs, i, dr)
4449     {
4450       enum { SG_NONE, GATHER, SCATTER } gatherscatter = SG_NONE;
4451       poly_uint64 vf;
4452 
4453       gcc_assert (DR_REF (dr));
4454       stmt_vec_info stmt_info = vinfo->lookup_stmt (DR_STMT (dr));
4455       gcc_assert (!stmt_info->dr_aux.dr);
4456       stmt_info->dr_aux.dr = dr;
4457       stmt_info->dr_aux.stmt = stmt_info;
4458 
4459       /* Check that analysis of the data-ref succeeded.  */
4460       if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
4461 	  || !DR_STEP (dr))
4462         {
4463 	  bool maybe_gather
4464 	    = DR_IS_READ (dr)
4465 	      && !TREE_THIS_VOLATILE (DR_REF (dr));
4466 	  bool maybe_scatter
4467 	    = DR_IS_WRITE (dr)
4468 	      && !TREE_THIS_VOLATILE (DR_REF (dr))
4469 	      && (targetm.vectorize.builtin_scatter != NULL
4470 		  || supports_vec_scatter_store_p ());
4471 
4472 	  /* If target supports vector gather loads or scatter stores,
4473 	     see if they can be used.  */
4474 	  if (is_a <loop_vec_info> (vinfo)
4475 	      && !nested_in_vect_loop_p (loop, stmt_info))
4476 	    {
4477 	      if (maybe_gather || maybe_scatter)
4478 		{
4479 		  if (maybe_gather)
4480 		    gatherscatter = GATHER;
4481 		  else
4482 		    gatherscatter = SCATTER;
4483 		}
4484 	    }
4485 
4486 	  if (gatherscatter == SG_NONE)
4487 	    {
4488 	      if (dump_enabled_p ())
4489 		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4490 				 "not vectorized: data ref analysis "
4491 				 "failed %G", stmt_info->stmt);
4492 	      if (is_a <bb_vec_info> (vinfo))
4493 		{
4494 		  /* In BB vectorization the ref can still participate
4495 		     in dependence analysis, we just can't vectorize it.  */
4496 		  STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4497 		  continue;
4498 		}
4499 	      return opt_result::failure_at (stmt_info->stmt,
4500 					     "not vectorized:"
4501 					     " data ref analysis failed: %G",
4502 					     stmt_info->stmt);
4503 	    }
4504         }
4505 
4506       /* See if this was detected as SIMD lane access.  */
4507       if (dr->aux == (void *)-1
4508 	  || dr->aux == (void *)-2
4509 	  || dr->aux == (void *)-3
4510 	  || dr->aux == (void *)-4)
4511 	{
4512 	  if (nested_in_vect_loop_p (loop, stmt_info))
4513 	    return opt_result::failure_at (stmt_info->stmt,
4514 					   "not vectorized:"
4515 					   " data ref analysis failed: %G",
4516 					   stmt_info->stmt);
4517 	  STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info)
4518 	    = -(uintptr_t) dr->aux;
4519 	}
4520 
4521       tree base = get_base_address (DR_REF (dr));
4522       if (base && VAR_P (base) && DECL_NONALIASED (base))
4523 	{
4524           if (dump_enabled_p ())
4525 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4526 			     "not vectorized: base object not addressable "
4527 			     "for stmt: %G", stmt_info->stmt);
4528           if (is_a <bb_vec_info> (vinfo))
4529 	    {
4530 	      /* In BB vectorization the ref can still participate
4531 	         in dependence analysis, we just can't vectorize it.  */
4532 	      STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4533 	      continue;
4534 	    }
4535 	  return opt_result::failure_at (stmt_info->stmt,
4536 					 "not vectorized: base object not"
4537 					 " addressable for stmt: %G",
4538 					 stmt_info->stmt);
4539 	}
4540 
4541       if (is_a <loop_vec_info> (vinfo)
4542 	  && DR_STEP (dr)
4543 	  && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
4544 	{
4545 	  if (nested_in_vect_loop_p (loop, stmt_info))
4546 	    return opt_result::failure_at (stmt_info->stmt,
4547 					   "not vectorized: "
4548 					   "not suitable for strided load %G",
4549 					   stmt_info->stmt);
4550 	  STMT_VINFO_STRIDED_P (stmt_info) = true;
4551 	}
4552 
4553       /* Update DR field in stmt_vec_info struct.  */
4554 
4555       /* If the dataref is in an inner-loop of the loop that is considered
4556 	 for vectorization, we also want to analyze the access relative to
4557 	 the outer-loop (DR contains information only relative to the
4558 	 inner-most enclosing loop).  We do that by building a reference to the
4559 	 first location accessed by the inner-loop, and analyze it relative to
4560 	 the outer-loop.  */
4561       if (loop && nested_in_vect_loop_p (loop, stmt_info))
4562 	{
4563 	  /* Build a reference to the first location accessed by the
4564 	     inner loop: *(BASE + INIT + OFFSET).  By construction,
4565 	     this address must be invariant in the inner loop, so we
4566 	     can consider it as being used in the outer loop.  */
4567 	  tree base = unshare_expr (DR_BASE_ADDRESS (dr));
4568 	  tree offset = unshare_expr (DR_OFFSET (dr));
4569 	  tree init = unshare_expr (DR_INIT (dr));
4570 	  tree init_offset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset),
4571 					  init, offset);
4572 	  tree init_addr = fold_build_pointer_plus (base, init_offset);
4573 	  tree init_ref = build_fold_indirect_ref (init_addr);
4574 
4575 	  if (dump_enabled_p ())
4576 	    dump_printf_loc (MSG_NOTE, vect_location,
4577 			     "analyze in outer loop: %T\n", init_ref);
4578 
4579 	  opt_result res
4580 	    = dr_analyze_innermost (&STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info),
4581 				    init_ref, loop, stmt_info->stmt);
4582 	  if (!res)
4583 	    /* dr_analyze_innermost already explained the failure.  */
4584 	    return res;
4585 
4586           if (dump_enabled_p ())
4587 	    dump_printf_loc (MSG_NOTE, vect_location,
4588 			     "\touter base_address: %T\n"
4589 			     "\touter offset from base address: %T\n"
4590 			     "\touter constant offset from base address: %T\n"
4591 			     "\touter step: %T\n"
4592 			     "\touter base alignment: %d\n\n"
4593 			     "\touter base misalignment: %d\n"
4594 			     "\touter offset alignment: %d\n"
4595 			     "\touter step alignment: %d\n",
4596 			     STMT_VINFO_DR_BASE_ADDRESS (stmt_info),
4597 			     STMT_VINFO_DR_OFFSET (stmt_info),
4598 			     STMT_VINFO_DR_INIT (stmt_info),
4599 			     STMT_VINFO_DR_STEP (stmt_info),
4600 			     STMT_VINFO_DR_BASE_ALIGNMENT (stmt_info),
4601 			     STMT_VINFO_DR_BASE_MISALIGNMENT (stmt_info),
4602 			     STMT_VINFO_DR_OFFSET_ALIGNMENT (stmt_info),
4603 			     STMT_VINFO_DR_STEP_ALIGNMENT (stmt_info));
4604 	}
4605 
4606       /* Set vectype for STMT.  */
4607       scalar_type = TREE_TYPE (DR_REF (dr));
4608       tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type);
4609       if (!vectype)
4610         {
4611           if (dump_enabled_p ())
4612             {
4613               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4614                                "not vectorized: no vectype for stmt: %G",
4615 			       stmt_info->stmt);
4616               dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: ");
4617               dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS,
4618                                  scalar_type);
4619               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
4620             }
4621 
4622           if (is_a <bb_vec_info> (vinfo))
4623 	    {
4624 	      /* No vector type is fine, the ref can still participate
4625 	         in dependence analysis, we just can't vectorize it.  */
4626 	      STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4627 	      continue;
4628 	    }
4629 	  if (fatal)
4630 	    *fatal = false;
4631 	  return opt_result::failure_at (stmt_info->stmt,
4632 					 "not vectorized:"
4633 					 " no vectype for stmt: %G"
4634 					 " scalar_type: %T\n",
4635 					 stmt_info->stmt, scalar_type);
4636         }
4637       else
4638 	{
4639 	  if (dump_enabled_p ())
4640 	    dump_printf_loc (MSG_NOTE, vect_location,
4641 			     "got vectype for stmt: %G%T\n",
4642 			     stmt_info->stmt, vectype);
4643 	}
4644 
4645       /* Adjust the minimal vectorization factor according to the
4646 	 vector type.  */
4647       vf = TYPE_VECTOR_SUBPARTS (vectype);
4648       *min_vf = upper_bound (*min_vf, vf);
4649 
4650       /* Leave the BB vectorizer to pick the vector type later, based on
4651 	 the final dataref group size and SLP node size.  */
4652       if (is_a <loop_vec_info> (vinfo))
4653 	STMT_VINFO_VECTYPE (stmt_info) = vectype;
4654 
4655       if (gatherscatter != SG_NONE)
4656 	{
4657 	  gather_scatter_info gs_info;
4658 	  if (!vect_check_gather_scatter (stmt_info,
4659 					  as_a <loop_vec_info> (vinfo),
4660 					  &gs_info)
4661 	      || !get_vectype_for_scalar_type (vinfo,
4662 					       TREE_TYPE (gs_info.offset)))
4663 	    {
4664 	      if (fatal)
4665 		*fatal = false;
4666 	      return opt_result::failure_at
4667 			(stmt_info->stmt,
4668 			 (gatherscatter == GATHER)
4669 			 ? "not vectorized: not suitable for gather load %G"
4670 			 : "not vectorized: not suitable for scatter store %G",
4671 			 stmt_info->stmt);
4672 	    }
4673 	  STMT_VINFO_GATHER_SCATTER_P (stmt_info) = gatherscatter;
4674 	}
4675     }
4676 
4677   /* We used to stop processing and prune the list here.  Verify we no
4678      longer need to.  */
4679   gcc_assert (i == datarefs.length ());
4680 
4681   return opt_result::success ();
4682 }
4683 
4684 
4685 /* Function vect_get_new_vect_var.
4686 
4687    Returns a name for a new variable.  The current naming scheme appends the
4688    prefix "vect_" or "vect_p" (depending on the value of VAR_KIND) to
4689    the name of vectorizer generated variables, and appends that to NAME if
4690    provided.  */
4691 
4692 tree
vect_get_new_vect_var(tree type,enum vect_var_kind var_kind,const char * name)4693 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
4694 {
4695   const char *prefix;
4696   tree new_vect_var;
4697 
4698   switch (var_kind)
4699   {
4700   case vect_simple_var:
4701     prefix = "vect";
4702     break;
4703   case vect_scalar_var:
4704     prefix = "stmp";
4705     break;
4706   case vect_mask_var:
4707     prefix = "mask";
4708     break;
4709   case vect_pointer_var:
4710     prefix = "vectp";
4711     break;
4712   default:
4713     gcc_unreachable ();
4714   }
4715 
4716   if (name)
4717     {
4718       char* tmp = concat (prefix, "_", name, NULL);
4719       new_vect_var = create_tmp_reg (type, tmp);
4720       free (tmp);
4721     }
4722   else
4723     new_vect_var = create_tmp_reg (type, prefix);
4724 
4725   return new_vect_var;
4726 }
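
/* For example (illustrative only): vect_get_new_vect_var (ptr_type,
   vect_pointer_var, "a") creates a temporary named along the lines of
   "vectp_a", while passing a NULL name yields just the "vectp"
   prefix.  */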
4727 
4728 /* Like vect_get_new_vect_var but return an SSA name.  */
4729 
4730 tree
vect_get_new_ssa_name(tree type,enum vect_var_kind var_kind,const char * name)4731 vect_get_new_ssa_name (tree type, enum vect_var_kind var_kind, const char *name)
4732 {
4733   const char *prefix;
4734   tree new_vect_var;
4735 
4736   switch (var_kind)
4737   {
4738   case vect_simple_var:
4739     prefix = "vect";
4740     break;
4741   case vect_scalar_var:
4742     prefix = "stmp";
4743     break;
4744   case vect_pointer_var:
4745     prefix = "vectp";
4746     break;
4747   default:
4748     gcc_unreachable ();
4749   }
4750 
4751   if (name)
4752     {
4753       char* tmp = concat (prefix, "_", name, NULL);
4754       new_vect_var = make_temp_ssa_name (type, NULL, tmp);
4755       free (tmp);
4756     }
4757   else
4758     new_vect_var = make_temp_ssa_name (type, NULL, prefix);
4759 
4760   return new_vect_var;
4761 }
4762 
4763 /* Duplicate points-to info on NAME from DR_INFO.  */
4764 
4765 static void
vect_duplicate_ssa_name_ptr_info(tree name,dr_vec_info * dr_info)4766 vect_duplicate_ssa_name_ptr_info (tree name, dr_vec_info *dr_info)
4767 {
4768   duplicate_ssa_name_ptr_info (name, DR_PTR_INFO (dr_info->dr));
4769   /* DR_PTR_INFO is for a base SSA name, not including constant or
4770      variable offsets in the ref so its alignment info does not apply.  */
4771   mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name));
4772 }
4773 
4774 /* Function vect_create_addr_base_for_vector_ref.
4775 
4776    Create an expression that computes the address of the first memory location
4777    that will be accessed for a data reference.
4778 
4779    Input:
4780    STMT_INFO: The statement containing the data reference.
4781    NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
4782    OFFSET: Optional. If supplied, it is added to the initial address.
4783    LOOP:    Specify relative to which loop-nest the address should be computed.
4784             For example, when the dataref is in an inner-loop nested in an
4785 	    outer-loop that is now being vectorized, LOOP can be either the
4786 	    outer-loop, or the inner-loop.  The first memory location accessed
4787 	    by the following dataref ('in' points to short):
4788 
4789 		for (i=0; i<N; i++)
4790 		   for (j=0; j<M; j++)
4791 		     s += in[i+j]
4792 
4793 	    is as follows:
4794 	    if LOOP=i_loop:	&in		(relative to i_loop)
4795 	    if LOOP=j_loop: 	&in+i*2B	(relative to j_loop)
4796 
4797    Output:
4798    1. Return an SSA_NAME whose value is the address of the memory location of
4799       the first vector of the data reference.
4800    2. If new_stmt_list is not NULL_TREE after return then the caller must insert
4801       these statement(s) which define the returned SSA_NAME.
4802 
4803    FORNOW: We are only handling array accesses with step 1.  */
4804 
4805 tree
vect_create_addr_base_for_vector_ref(vec_info * vinfo,stmt_vec_info stmt_info,gimple_seq * new_stmt_list,tree offset)4806 vect_create_addr_base_for_vector_ref (vec_info *vinfo, stmt_vec_info stmt_info,
4807 				      gimple_seq *new_stmt_list,
4808 				      tree offset)
4809 {
4810   dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
4811   struct data_reference *dr = dr_info->dr;
4812   const char *base_name;
4813   tree addr_base;
4814   tree dest;
4815   gimple_seq seq = NULL;
4816   tree vect_ptr_type;
4817   loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
4818   innermost_loop_behavior *drb = vect_dr_behavior (vinfo, dr_info);
4819 
4820   tree data_ref_base = unshare_expr (drb->base_address);
4821   tree base_offset = unshare_expr (get_dr_vinfo_offset (vinfo, dr_info, true));
4822   tree init = unshare_expr (drb->init);
4823 
4824   if (loop_vinfo)
4825     base_name = get_name (data_ref_base);
4826   else
4827     {
4828       base_offset = ssize_int (0);
4829       init = ssize_int (0);
4830       base_name = get_name (DR_REF (dr));
4831     }
4832 
4833   /* Create base_offset */
4834   base_offset = size_binop (PLUS_EXPR,
4835 			    fold_convert (sizetype, base_offset),
4836 			    fold_convert (sizetype, init));
4837 
4838   if (offset)
4839     {
4840       offset = fold_convert (sizetype, offset);
4841       base_offset = fold_build2 (PLUS_EXPR, sizetype,
4842 				 base_offset, offset);
4843     }
4844 
4845   /* base + base_offset */
4846   if (loop_vinfo)
4847     addr_base = fold_build_pointer_plus (data_ref_base, base_offset);
4848   else
4849     addr_base = build1 (ADDR_EXPR,
4850 			build_pointer_type (TREE_TYPE (DR_REF (dr))),
4851 			/* Strip zero offset components since we don't need
4852 			   them and they can confuse late diagnostics if
4853 			   we CSE them wrongly.  See PR106904 for example.  */
4854 			unshare_expr (strip_zero_offset_components
4855 								(DR_REF (dr))));
4856 
4857   vect_ptr_type = build_pointer_type (TREE_TYPE (DR_REF (dr)));
4858   dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name);
4859   addr_base = force_gimple_operand (addr_base, &seq, true, dest);
4860   gimple_seq_add_seq (new_stmt_list, seq);
4861 
4862   if (DR_PTR_INFO (dr)
4863       && TREE_CODE (addr_base) == SSA_NAME
4864       /* We should only duplicate pointer info to newly created SSA names.  */
4865       && SSA_NAME_VAR (addr_base) == dest)
4866     {
4867       gcc_assert (!SSA_NAME_PTR_INFO (addr_base));
4868       vect_duplicate_ssa_name_ptr_info (addr_base, dr_info);
4869     }
4870 
4871   if (dump_enabled_p ())
4872     dump_printf_loc (MSG_NOTE, vect_location, "created %T\n", addr_base);
4873 
4874   return addr_base;
4875 }
4876 
4877 
4878 /* Function vect_create_data_ref_ptr.
4879 
4880    Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
4881    location accessed in the loop by STMT_INFO, along with the def-use update
4882    chain to appropriately advance the pointer through the loop iterations.
4883    Also set aliasing information for the pointer.  This pointer is used by
4884    the callers to this function to create a memory reference expression for
4885    vector load/store access.
4886 
4887    Input:
4888    1. STMT_INFO: a stmt that references memory. Expected to be of the form
4889          GIMPLE_ASSIGN <name, data-ref> or
4890 	 GIMPLE_ASSIGN <data-ref, name>.
4891    2. AGGR_TYPE: the type of the reference, which should be either a vector
4892         or an array.
4893    3. AT_LOOP: the loop where the vector memref is to be created.
4894    4. OFFSET (optional): a byte offset to be added to the initial address
4895 	accessed by the data-ref in STMT_INFO.
4896    5. BSI: location where the new stmts are to be placed if there is no loop
4897    6. ONLY_INIT: indicates whether ap is to be updated in the loop, or to remain
4898         pointing to the initial address.
4899    7. IV_STEP (optional, defaults to NULL): the amount that should be added
4900 	to the IV during each iteration of the loop.  NULL says to move
4901 	by one copy of AGGR_TYPE up or down, depending on the step of the
4902 	data reference.
4903 
4904    Output:
4905    1. Declare a new ptr to vector_type, and have it point to the base of the
4906       data reference (initial address accessed by the data reference).
4907       For example, for vector of type V8HI, the following code is generated:
4908 
4909       v8hi *ap;
4910       ap = (v8hi *)initial_address;
4911 
4912       if OFFSET is not supplied:
4913          initial_address = &a[init];
4914       if OFFSET is supplied:
4915 	 initial_address = &a[init] + OFFSET;
4918 
4919       Return the initial_address in INITIAL_ADDRESS.
4920 
4921    2. If ONLY_INIT is true, just return the initial pointer.  Otherwise, also
4922       update the pointer in each iteration of the loop.
4923 
4924       Return the increment stmt that updates the pointer in PTR_INCR.
4925 
4926    3. Return the pointer.  */
4927 
4928 tree
vect_create_data_ref_ptr(vec_info * vinfo,stmt_vec_info stmt_info,tree aggr_type,class loop * at_loop,tree offset,tree * initial_address,gimple_stmt_iterator * gsi,gimple ** ptr_incr,bool only_init,tree iv_step)4929 vect_create_data_ref_ptr (vec_info *vinfo, stmt_vec_info stmt_info,
4930 			  tree aggr_type, class loop *at_loop, tree offset,
4931 			  tree *initial_address, gimple_stmt_iterator *gsi,
4932 			  gimple **ptr_incr, bool only_init,
4933 			  tree iv_step)
4934 {
4935   const char *base_name;
4936   loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
4937   class loop *loop = NULL;
4938   bool nested_in_vect_loop = false;
4939   class loop *containing_loop = NULL;
4940   tree aggr_ptr_type;
4941   tree aggr_ptr;
4942   tree new_temp;
4943   gimple_seq new_stmt_list = NULL;
4944   edge pe = NULL;
4945   basic_block new_bb;
4946   tree aggr_ptr_init;
4947   dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
4948   struct data_reference *dr = dr_info->dr;
4949   tree aptr;
4950   gimple_stmt_iterator incr_gsi;
4951   bool insert_after;
4952   tree indx_before_incr, indx_after_incr;
4953   gimple *incr;
4954   bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo);
4955 
4956   gcc_assert (iv_step != NULL_TREE
4957 	      || TREE_CODE (aggr_type) == ARRAY_TYPE
4958 	      || TREE_CODE (aggr_type) == VECTOR_TYPE);
4959 
4960   if (loop_vinfo)
4961     {
4962       loop = LOOP_VINFO_LOOP (loop_vinfo);
4963       nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt_info);
4964       containing_loop = (gimple_bb (stmt_info->stmt))->loop_father;
4965       pe = loop_preheader_edge (loop);
4966     }
4967   else
4968     {
4969       gcc_assert (bb_vinfo);
4970       only_init = true;
4971       *ptr_incr = NULL;
4972     }
4973 
4974   /* Create an expression for the first address accessed by this load
4975      in LOOP.  */
4976   base_name = get_name (DR_BASE_ADDRESS (dr));
4977 
4978   if (dump_enabled_p ())
4979     {
4980       tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
4981       dump_printf_loc (MSG_NOTE, vect_location,
4982                        "create %s-pointer variable to type: %T",
4983 		       get_tree_code_name (TREE_CODE (aggr_type)),
4984 		       aggr_type);
4985       if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
4986         dump_printf (MSG_NOTE, "  vectorizing an array ref: ");
4987       else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)
4988         dump_printf (MSG_NOTE, "  vectorizing a vector ref: ");
4989       else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
4990         dump_printf (MSG_NOTE, "  vectorizing a record based array ref: ");
4991       else
4992         dump_printf (MSG_NOTE, "  vectorizing a pointer ref: ");
4993       dump_printf (MSG_NOTE, "%T\n", DR_BASE_OBJECT (dr));
4994     }
4995 
4996   /* (1) Create the new aggregate-pointer variable.
4997      Vector and array types inherit the alias set of their component
4998      type by default, so we need to use a ref-all pointer if the data
4999      reference does not conflict with the aggregate data reference
5000      we are creating (which can happen when it is not addressable).  */
5001   bool need_ref_all = false;
5002   if (!alias_sets_conflict_p (get_alias_set (aggr_type),
5003 			      get_alias_set (DR_REF (dr))))
5004     need_ref_all = true;
5005   /* Likewise for any of the data references in the stmt group.  */
5006   else if (DR_GROUP_SIZE (stmt_info) > 1)
5007     {
5008       stmt_vec_info sinfo = DR_GROUP_FIRST_ELEMENT (stmt_info);
5009       do
5010 	{
5011 	  struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo);
5012 	  if (!alias_sets_conflict_p (get_alias_set (aggr_type),
5013 				      get_alias_set (DR_REF (sdr))))
5014 	    {
5015 	      need_ref_all = true;
5016 	      break;
5017 	    }
5018 	  sinfo = DR_GROUP_NEXT_ELEMENT (sinfo);
5019 	}
5020       while (sinfo);
5021     }
5022   aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode,
5023 					       need_ref_all);
5024   aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name);
5025 
5026 
5027   /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
5028      vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
5029      def-use update cycles for the pointer: one relative to the outer-loop
5030      (LOOP), which is what steps (3) and (4) below do.  The other is relative
5031      to the inner-loop (which is the inner-most loop containing the dataref),
5032      and this is done by step (5) below.
5033 
5034      When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
5035      inner-most loop, and so steps (3),(4) work the same, and step (5) is
5036      redundant.  Steps (3),(4) create the following:
5037 
5038 	vp0 = &base_addr;
5039 	LOOP:	vp1 = phi(vp0,vp2)
5040 		...
5041 		...
5042 		vp2 = vp1 + step
5043 		goto LOOP
5044 
5045      If there is an inner-loop nested in loop, then step (5) will also be
5046      applied, and an additional update in the inner-loop will be created:
5047 
5048 	vp0 = &base_addr;
5049 	LOOP:   vp1 = phi(vp0,vp2)
5050 		...
5051         inner:     vp3 = phi(vp1,vp4)
5052 	           vp4 = vp3 + inner_step
5053 	           if () goto inner
5054 		...
5055 		vp2 = vp1 + step
5056 		if () goto LOOP   */
5057 
5058   /* (2) Calculate the initial address of the aggregate-pointer, and set
5059      the aggregate-pointer to point to it before the loop.  */
5060 
5061   /* Create: (&(base[init_val]+offset) in the loop preheader.  */
5062 
5063   new_temp = vect_create_addr_base_for_vector_ref (vinfo,
5064 						   stmt_info, &new_stmt_list,
5065 						   offset);
5066   if (new_stmt_list)
5067     {
5068       if (pe)
5069         {
5070           new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
5071           gcc_assert (!new_bb);
5072         }
5073       else
5074         gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);
5075     }
5076 
5077   *initial_address = new_temp;
5078   aggr_ptr_init = new_temp;
5079 
5080   /* (3) Handle the updating of the aggregate-pointer inside the loop.
5081      This is needed when ONLY_INIT is false, and also when AT_LOOP is the
5082      inner-loop nested in LOOP (during outer-loop vectorization).  */
5083 
5084   /* No update in loop is required.  */
5085   if (only_init && (!loop_vinfo || at_loop == loop))
5086     aptr = aggr_ptr_init;
5087   else
5088     {
5089       /* Accesses to invariant addresses should be handled specially
5090 	 by the caller.  */
5091       tree step = vect_dr_behavior (vinfo, dr_info)->step;
5092       gcc_assert (!integer_zerop (step));
5093 
5094       if (iv_step == NULL_TREE)
5095 	{
5096 	  /* The step of the aggregate pointer is the type size,
5097 	     negated for downward accesses.  */
5098 	  iv_step = TYPE_SIZE_UNIT (aggr_type);
5099 	  if (tree_int_cst_sgn (step) == -1)
5100 	    iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
5101 	}
5102 
5103       standard_iv_increment_position (loop, &incr_gsi, &insert_after);
5104 
5105       create_iv (aggr_ptr_init,
5106 		 fold_convert (aggr_ptr_type, iv_step),
5107 		 aggr_ptr, loop, &incr_gsi, insert_after,
5108 		 &indx_before_incr, &indx_after_incr);
5109       incr = gsi_stmt (incr_gsi);
5110 
5111       /* Copy the points-to information if it exists. */
5112       if (DR_PTR_INFO (dr))
5113 	{
5114 	  vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr_info);
5115 	  vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr_info);
5116 	}
5117       if (ptr_incr)
5118 	*ptr_incr = incr;
5119 
5120       aptr = indx_before_incr;
5121     }
5122 
5123   if (!nested_in_vect_loop || only_init)
5124     return aptr;
5125 
5126 
5127   /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
5128      nested in LOOP, if it exists.  */
5129 
5130   gcc_assert (nested_in_vect_loop);
5131   if (!only_init)
5132     {
5133       standard_iv_increment_position (containing_loop, &incr_gsi,
5134 				      &insert_after);
5135       create_iv (aptr, fold_convert (aggr_ptr_type, DR_STEP (dr)), aggr_ptr,
5136 		 containing_loop, &incr_gsi, insert_after, &indx_before_incr,
5137 		 &indx_after_incr);
5138       incr = gsi_stmt (incr_gsi);
5139 
5140       /* Copy the points-to information if it exists. */
5141       if (DR_PTR_INFO (dr))
5142 	{
5143 	  vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr_info);
5144 	  vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr_info);
5145 	}
5146       if (ptr_incr)
5147 	*ptr_incr = incr;
5148 
5149       return indx_before_incr;
5150     }
5151   else
5152     gcc_unreachable ();
5153 }
5154 
5155 
5156 /* Function bump_vector_ptr
5157 
5158    Increment a pointer (to a vector type) by vector-size. If requested,
5159    i.e. if PTR-INCR is given, then also connect the new increment stmt
5160    to the existing def-use update-chain of the pointer, by modifying
5161    the PTR_INCR as illustrated below:
5162 
5163    The pointer def-use update-chain before this function:
5164                         DATAREF_PTR = phi (p_0, p_2)
5165                         ....
5166         PTR_INCR:       p_2 = DATAREF_PTR + step
5167 
5168    The pointer def-use update-chain after this function:
5169                         DATAREF_PTR = phi (p_0, p_2)
5170                         ....
5171                         NEW_DATAREF_PTR = DATAREF_PTR + BUMP
5172                         ....
5173         PTR_INCR:       p_2 = NEW_DATAREF_PTR + step
5174 
5175    Input:
5176    DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
5177                  in the loop.
5178    PTR_INCR - optional. The stmt that updates the pointer in each iteration of
5179 	      the loop.  The increment amount across iterations is expected
5180 	      to be vector_size.
5181    BSI - location where the new update stmt is to be placed.
5182    STMT_INFO - the original scalar memory-access stmt that is being vectorized.
5183    BUMP - optional. The offset by which to bump the pointer. If not given,
5184 	  the offset is assumed to be vector_size.
5185 
5186    Output: Return NEW_DATAREF_PTR as illustrated above.
5187 
5188 */
5189 
5190 tree
bump_vector_ptr(vec_info * vinfo,tree dataref_ptr,gimple * ptr_incr,gimple_stmt_iterator * gsi,stmt_vec_info stmt_info,tree bump)5191 bump_vector_ptr (vec_info *vinfo,
5192 		 tree dataref_ptr, gimple *ptr_incr, gimple_stmt_iterator *gsi,
5193 		 stmt_vec_info stmt_info, tree bump)
5194 {
5195   struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
5196   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5197   tree update = TYPE_SIZE_UNIT (vectype);
5198   gimple *incr_stmt;
5199   ssa_op_iter iter;
5200   use_operand_p use_p;
5201   tree new_dataref_ptr;
5202 
5203   if (bump)
5204     update = bump;
5205 
5206   if (TREE_CODE (dataref_ptr) == SSA_NAME)
5207     new_dataref_ptr = copy_ssa_name (dataref_ptr);
5208   else
5209     new_dataref_ptr = make_ssa_name (TREE_TYPE (dataref_ptr));
5210   incr_stmt = gimple_build_assign (new_dataref_ptr, POINTER_PLUS_EXPR,
5211 				   dataref_ptr, update);
5212   vect_finish_stmt_generation (vinfo, stmt_info, incr_stmt, gsi);
5213   /* Fold the increment, avoiding excessive use-def chains that would
5214      otherwise lead to compile-time issues for passes until the next
5215      forwprop pass, which would do this as well.  */
5216   gimple_stmt_iterator fold_gsi = gsi_for_stmt (incr_stmt);
5217   if (fold_stmt (&fold_gsi, follow_all_ssa_edges))
5218     {
5219       incr_stmt = gsi_stmt (fold_gsi);
5220       update_stmt (incr_stmt);
5221     }
5222 
5223   /* Copy the points-to information if it exists. */
5224   if (DR_PTR_INFO (dr))
5225     {
5226       duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
5227       mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
5228     }
5229 
5230   if (!ptr_incr)
5231     return new_dataref_ptr;
5232 
5233   /* Update the vector-pointer's cross-iteration increment.  */
5234   FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
5235     {
5236       tree use = USE_FROM_PTR (use_p);
5237 
5238       if (use == dataref_ptr)
5239         SET_USE (use_p, new_dataref_ptr);
5240       else
5241         gcc_assert (operand_equal_p (use, update, 0));
5242     }
5243 
5244   return new_dataref_ptr;
5245 }
5246 
5247 
5248 /* Copy memory reference info such as base/clique from the SRC reference
5249    to the DEST MEM_REF.  */
5250 
5251 void
vect_copy_ref_info(tree dest,tree src)5252 vect_copy_ref_info (tree dest, tree src)
5253 {
5254   if (TREE_CODE (dest) != MEM_REF)
5255     return;
5256 
5257   tree src_base = src;
5258   while (handled_component_p (src_base))
5259     src_base = TREE_OPERAND (src_base, 0);
5260   if (TREE_CODE (src_base) != MEM_REF
5261       && TREE_CODE (src_base) != TARGET_MEM_REF)
5262     return;
5263 
5264   MR_DEPENDENCE_CLIQUE (dest) = MR_DEPENDENCE_CLIQUE (src_base);
5265   MR_DEPENDENCE_BASE (dest) = MR_DEPENDENCE_BASE (src_base);
5266 }
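
/* Typical use, as seen later in this file: after building a MEM_REF
   DATA_REF for a vector access based on scalar data reference DR,
   call vect_copy_ref_info (data_ref, DR_REF (dr)) so that the new
   reference keeps the dependence clique/base information of the
   original reference.  */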
5267 
5268 
5269 /* Function vect_create_destination_var.
5270 
5271    Create a new temporary of type VECTYPE.  */
5272 
5273 tree
vect_create_destination_var(tree scalar_dest,tree vectype)5274 vect_create_destination_var (tree scalar_dest, tree vectype)
5275 {
5276   tree vec_dest;
5277   const char *name;
5278   char *new_name;
5279   tree type;
5280   enum vect_var_kind kind;
5281 
5282   kind = vectype
5283     ? VECTOR_BOOLEAN_TYPE_P (vectype)
5284     ? vect_mask_var
5285     : vect_simple_var
5286     : vect_scalar_var;
5287   type = vectype ? vectype : TREE_TYPE (scalar_dest);
5288 
5289   gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
5290 
5291   name = get_name (scalar_dest);
5292   if (name)
5293     new_name = xasprintf ("%s_%u", name, SSA_NAME_VERSION (scalar_dest));
5294   else
5295     new_name = xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest));
5296   vec_dest = vect_get_new_vect_var (type, kind, new_name);
5297   free (new_name);
5298 
5299   return vec_dest;
5300 }
5301 
5302 /* Function vect_grouped_store_supported.
5303 
5304    Returns TRUE if interleave high and interleave low permutations
5305    are supported, and FALSE otherwise.  */
5306 
5307 bool
vect_grouped_store_supported(tree vectype,unsigned HOST_WIDE_INT count)5308 vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
5309 {
5310   machine_mode mode = TYPE_MODE (vectype);
5311 
5312   /* vect_permute_store_chain requires the group size to be equal to 3 or
5313      be a power of two.  */
5314   if (count != 3 && exact_log2 (count) == -1)
5315     {
5316       if (dump_enabled_p ())
5317 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5318 			 "the size of the group of accesses"
5319 			 " is not a power of 2 nor equal to 3\n");
5320       return false;
5321     }
5322 
5323   /* Check that the permutation is supported.  */
5324   if (VECTOR_MODE_P (mode))
5325     {
5326       unsigned int i;
5327       if (count == 3)
5328 	{
5329 	  unsigned int j0 = 0, j1 = 0, j2 = 0;
5330 	  unsigned int i, j;
5331 
5332 	  unsigned int nelt;
5333 	  if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5334 	    {
5335 	      if (dump_enabled_p ())
5336 		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5337 				 "cannot handle groups of 3 stores for"
5338 				 " variable-length vectors\n");
5339 	      return false;
5340 	    }
5341 
5342 	  vec_perm_builder sel (nelt, nelt, 1);
5343 	  sel.quick_grow (nelt);
5344 	  vec_perm_indices indices;
5345 	  for (j = 0; j < 3; j++)
5346 	    {
5347 	      int nelt0 = ((3 - j) * nelt) % 3;
5348 	      int nelt1 = ((3 - j) * nelt + 1) % 3;
5349 	      int nelt2 = ((3 - j) * nelt + 2) % 3;
5350 	      for (i = 0; i < nelt; i++)
5351 		{
5352 		  if (3 * i + nelt0 < nelt)
5353 		    sel[3 * i + nelt0] = j0++;
5354 		  if (3 * i + nelt1 < nelt)
5355 		    sel[3 * i + nelt1] = nelt + j1++;
5356 		  if (3 * i + nelt2 < nelt)
5357 		    sel[3 * i + nelt2] = 0;
5358 		}
5359 	      indices.new_vector (sel, 2, nelt);
5360 	      if (!can_vec_perm_const_p (mode, indices))
5361 		{
5362 		  if (dump_enabled_p ())
5363 		    dump_printf (MSG_MISSED_OPTIMIZATION,
5364 				 "permutation op not supported by target.\n");
5365 		  return false;
5366 		}
5367 
5368 	      for (i = 0; i < nelt; i++)
5369 		{
5370 		  if (3 * i + nelt0 < nelt)
5371 		    sel[3 * i + nelt0] = 3 * i + nelt0;
5372 		  if (3 * i + nelt1 < nelt)
5373 		    sel[3 * i + nelt1] = 3 * i + nelt1;
5374 		  if (3 * i + nelt2 < nelt)
5375 		    sel[3 * i + nelt2] = nelt + j2++;
5376 		}
5377 	      indices.new_vector (sel, 2, nelt);
5378 	      if (!can_vec_perm_const_p (mode, indices))
5379 		{
5380 		  if (dump_enabled_p ())
5381 		    dump_printf (MSG_MISSED_OPTIMIZATION,
5382 				 "permutation op not supported by target.\n");
5383 		  return false;
5384 		}
5385 	    }
5386 	  return true;
5387 	}
5388       else
5389 	{
5390 	  /* If length is not equal to 3 then only a power of 2 is supported.  */
5391 	  gcc_assert (pow2p_hwi (count));
5392 	  poly_uint64 nelt = GET_MODE_NUNITS (mode);
5393 
5394 	  /* The encoding has 2 interleaved stepped patterns.  */
5395 	  vec_perm_builder sel (nelt, 2, 3);
5396 	  sel.quick_grow (6);
5397 	  for (i = 0; i < 3; i++)
5398 	    {
5399 	      sel[i * 2] = i;
5400 	      sel[i * 2 + 1] = i + nelt;
5401 	    }
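	  /* Illustrative example: for a 4-element vector this encoding
	     expands to the interleave-high selector {0, 4, 1, 5}; adding
	     nelt/2 == 2 to each element below yields the interleave-low
	     selector {2, 6, 3, 7}.  */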
5402 	  vec_perm_indices indices (sel, 2, nelt);
5403 	  if (can_vec_perm_const_p (mode, indices))
5404 	    {
5405 	      for (i = 0; i < 6; i++)
5406 		sel[i] += exact_div (nelt, 2);
5407 	      indices.new_vector (sel, 2, nelt);
5408 	      if (can_vec_perm_const_p (mode, indices))
5409 		return true;
5410 	    }
5411 	}
5412     }
5413 
5414   if (dump_enabled_p ())
5415     dump_printf (MSG_MISSED_OPTIMIZATION,
5416 		 "permutation op not supported by target.\n");
5417   return false;
5418 }
5419 
5420 
5421 /* Return TRUE if vec_{mask_}store_lanes is available for COUNT vectors of
5422    type VECTYPE.  MASKED_P says whether the masked form is needed.  */
5423 
5424 bool
vect_store_lanes_supported(tree vectype,unsigned HOST_WIDE_INT count,bool masked_p)5425 vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
5426 			    bool masked_p)
5427 {
5428   if (masked_p)
5429     return vect_lanes_optab_supported_p ("vec_mask_store_lanes",
5430 					 vec_mask_store_lanes_optab,
5431 					 vectype, count);
5432   else
5433     return vect_lanes_optab_supported_p ("vec_store_lanes",
5434 					 vec_store_lanes_optab,
5435 					 vectype, count);
5436 }
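
/* As an illustration (a hedged sketch; array names are made up): on a
   target with store-lanes support, e.g. AArch64 using ST2, a loop that
   stores an interleaved group of two elements per iteration, such as

     struct { float re, im; } out[1024];
     float a[1024], b[1024];
     for (int i = 0; i < n; i++)
       {
	 out[i].re = a[i];
	 out[i].im = b[i];
       }

   can use a single lane store per vector iteration instead of the
   explicit permutations generated by vect_permute_store_chain.  */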
5437 
5438 
5439 /* Function vect_permute_store_chain.
5440 
5441    Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
5442    a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
5443    the data correctly for the stores.  Return the final references for stores
5444    in RESULT_CHAIN.
5445 
5446    E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5447    The input is 4 vectors each containing 8 elements.  We assign a number to
5448    each element, the input sequence is:
5449 
5450    1st vec:   0  1  2  3  4  5  6  7
5451    2nd vec:   8  9 10 11 12 13 14 15
5452    3rd vec:  16 17 18 19 20 21 22 23
5453    4th vec:  24 25 26 27 28 29 30 31
5454 
5455    The output sequence should be:
5456 
5457    1st vec:  0  8 16 24  1  9 17 25
5458    2nd vec:  2 10 18 26  3 11 19 27
5459    3rd vec:  4 12 20 28  5 13 21 29
5460    4th vec:  6 14 22 30  7 15 23 31
5461 
5462    i.e., we interleave the contents of the four vectors in their order.
5463 
5464    We use interleave_high/low instructions to create such output.  The input of
5465    each interleave_high/low operation is two vectors:
5466    1st vec    2nd vec
5467    0 1 2 3    4 5 6 7
5468    the even elements of the result vector are obtained left-to-right from the
5469    high/low elements of the first vector.  The odd elements of the result are
5470    obtained left-to-right from the high/low elements of the second vector.
5471    The output of interleave_high will be:   0 4 1 5
5472    and of interleave_low:                   2 6 3 7
5473 
5474 
5475    The permutation is done in log LENGTH stages.  In each stage interleave_high
5476    and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
5477    where the first argument is taken from the first half of DR_CHAIN and the
5478    second argument from its second half.
5479    In our example,
5480 
5481    I1: interleave_high (1st vec, 3rd vec)
5482    I2: interleave_low (1st vec, 3rd vec)
5483    I3: interleave_high (2nd vec, 4th vec)
5484    I4: interleave_low (2nd vec, 4th vec)
5485 
5486    The output for the first stage is:
5487 
5488    I1:  0 16  1 17  2 18  3 19
5489    I2:  4 20  5 21  6 22  7 23
5490    I3:  8 24  9 25 10 26 11 27
5491    I4: 12 28 13 29 14 30 15 31
5492 
5493    The output of the second stage, i.e. the final result is:
5494 
5495    I1:  0  8 16 24  1  9 17 25
5496    I2:  2 10 18 26  3 11 19 27
5497    I3:  4 12 20 28  5 13 21 29
5498    I4:  6 14 22 30  7 15 23 31.  */
5499 
5500 void
vect_permute_store_chain(vec_info * vinfo,vec<tree> & dr_chain,unsigned int length,stmt_vec_info stmt_info,gimple_stmt_iterator * gsi,vec<tree> * result_chain)5501 vect_permute_store_chain (vec_info *vinfo, vec<tree> &dr_chain,
5502 			  unsigned int length,
5503 			  stmt_vec_info stmt_info,
5504 			  gimple_stmt_iterator *gsi,
5505 			  vec<tree> *result_chain)
5506 {
5507   tree vect1, vect2, high, low;
5508   gimple *perm_stmt;
5509   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5510   tree perm_mask_low, perm_mask_high;
5511   tree data_ref;
5512   tree perm3_mask_low, perm3_mask_high;
5513   unsigned int i, j, n, log_length = exact_log2 (length);
5514 
5515   result_chain->quick_grow (length);
5516   memcpy (result_chain->address (), dr_chain.address (),
5517 	  length * sizeof (tree));
5518 
5519   if (length == 3)
5520     {
5521       /* vect_grouped_store_supported ensures that this is constant.  */
5522       unsigned int nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5523       unsigned int j0 = 0, j1 = 0, j2 = 0;
5524 
5525       vec_perm_builder sel (nelt, nelt, 1);
5526       sel.quick_grow (nelt);
5527       vec_perm_indices indices;
5528       for (j = 0; j < 3; j++)
5529         {
5530 	  int nelt0 = ((3 - j) * nelt) % 3;
5531 	  int nelt1 = ((3 - j) * nelt + 1) % 3;
5532 	  int nelt2 = ((3 - j) * nelt + 2) % 3;
5533 
5534 	  for (i = 0; i < nelt; i++)
5535 	    {
5536 	      if (3 * i + nelt0 < nelt)
5537 		sel[3 * i + nelt0] = j0++;
5538 	      if (3 * i + nelt1 < nelt)
5539 		sel[3 * i + nelt1] = nelt + j1++;
5540 	      if (3 * i + nelt2 < nelt)
5541 		sel[3 * i + nelt2] = 0;
5542 	    }
5543 	  indices.new_vector (sel, 2, nelt);
5544 	  perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5545 
5546 	  for (i = 0; i < nelt; i++)
5547 	    {
5548 	      if (3 * i + nelt0 < nelt)
5549 		sel[3 * i + nelt0] = 3 * i + nelt0;
5550 	      if (3 * i + nelt1 < nelt)
5551 		sel[3 * i + nelt1] = 3 * i + nelt1;
5552 	      if (3 * i + nelt2 < nelt)
5553 		sel[3 * i + nelt2] = nelt + j2++;
5554 	    }
5555 	  indices.new_vector (sel, 2, nelt);
5556 	  perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5557 
5558 	  vect1 = dr_chain[0];
5559 	  vect2 = dr_chain[1];
5560 
5561 	  /* Create interleaving stmt:
5562 	     low = VEC_PERM_EXPR <vect1, vect2,
5563 				  {j, nelt, *, j + 1, nelt + j + 1, *,
5564 				   j + 2, nelt + j + 2, *, ...}>  */
5565 	  data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5566 	  perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
5567 					   vect2, perm3_mask_low);
5568 	  vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
5569 
5570 	  vect1 = data_ref;
5571 	  vect2 = dr_chain[2];
5572 	  /* Create interleaving stmt:
5573 	     low = VEC_PERM_EXPR <vect1, vect2,
5574 				  {0, 1, nelt + j, 3, 4, nelt + j + 1,
5575 				   6, 7, nelt + j + 2, ...}>  */
5576 	  data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5577 	  perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
5578 					   vect2, perm3_mask_high);
5579 	  vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
5580 	  (*result_chain)[j] = data_ref;
5581 	}
5582     }
5583   else
5584     {
5585       /* If length is not equal to 3 then only a power of 2 is supported.  */
5586       gcc_assert (pow2p_hwi (length));
5587 
5588       /* The encoding has 2 interleaved stepped patterns.  */
5589       poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
5590       vec_perm_builder sel (nelt, 2, 3);
5591       sel.quick_grow (6);
5592       for (i = 0; i < 3; i++)
5593 	{
5594 	  sel[i * 2] = i;
5595 	  sel[i * 2 + 1] = i + nelt;
5596 	}
5597 	vec_perm_indices indices (sel, 2, nelt);
5598 	perm_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5599 
5600 	for (i = 0; i < 6; i++)
5601 	  sel[i] += exact_div (nelt, 2);
5602 	indices.new_vector (sel, 2, nelt);
5603 	perm_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5604 
5605 	for (i = 0, n = log_length; i < n; i++)
5606 	  {
5607 	    for (j = 0; j < length/2; j++)
5608 	      {
5609 		vect1 = dr_chain[j];
5610 		vect2 = dr_chain[j+length/2];
5611 
5612 		/* Create interleaving stmt:
5613 		   high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
5614 							...}>  */
5615 		high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
5616 		perm_stmt = gimple_build_assign (high, VEC_PERM_EXPR, vect1,
5617 						 vect2, perm_mask_high);
5618 		vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
5619 		(*result_chain)[2*j] = high;
5620 
5621 		/* Create interleaving stmt:
5622 		   low = VEC_PERM_EXPR <vect1, vect2,
5623 					{nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
5624 					 ...}>  */
5625 		low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
5626 		perm_stmt = gimple_build_assign (low, VEC_PERM_EXPR, vect1,
5627 						 vect2, perm_mask_low);
5628 		vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
5629 		(*result_chain)[2*j+1] = low;
5630 	      }
5631 	    memcpy (dr_chain.address (), result_chain->address (),
5632 		    length * sizeof (tree));
5633 	  }
5634     }
5635 }
5636 
5637 /* Function vect_setup_realignment
5638 
5639    This function is called when vectorizing an unaligned load using
5640    the dr_explicit_realign[_optimized] scheme.
5641    This function generates the following code at the loop prolog:
5642 
5643       p = initial_addr;
5644    x  msq_init = *(floor(p));   # prolog load
5645       realignment_token = call target_builtin;
5646     loop:
5647    x  msq = phi (msq_init, ---)
5648 
5649    The stmts marked with x are generated only for the case of
5650    dr_explicit_realign_optimized.
5651 
5652    The code above sets up a new (vector) pointer, pointing to the first
5653    location accessed by STMT_INFO, and a "floor-aligned" load using that
5654    pointer.  It also generates code to compute the "realignment-token"
5655    (if the relevant target hook was defined), and creates a phi-node at the
5656    loop-header bb whose arguments are the result of the prolog-load (created
5657    by this function) and the result of a load that takes place in the loop
5658    (to be created by the caller to this function).
5659 
5660    For the case of dr_explicit_realign_optimized:
5661    The caller to this function uses the phi-result (msq) to create the
5662    realignment code inside the loop, and sets up the missing phi argument,
5663    as follows:
5664     loop:
5665       msq = phi (msq_init, lsq)
5666       lsq = *(floor(p'));        # load in loop
5667       result = realign_load (msq, lsq, realignment_token);
5668 
5669    For the case of dr_explicit_realign:
5670     loop:
5671       msq = *(floor(p)); 	# load in loop
5672       p' = p + (VS-1);
5673       lsq = *(floor(p'));	# load in loop
5674       result = realign_load (msq, lsq, realignment_token);
5675 
5676    Input:
5677    STMT_INFO - (scalar) load stmt to be vectorized. This load accesses
5678 	       a memory location that may be unaligned.
5679    BSI - place where new code is to be inserted.
5680    ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
5681 			      is used.
5682 
5683    Output:
5684    REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
5685                        target hook, if defined.
5686    Return value - the result of the loop-header phi node.  */
5687 
5688 tree
vect_setup_realignment(vec_info * vinfo,stmt_vec_info stmt_info,gimple_stmt_iterator * gsi,tree * realignment_token,enum dr_alignment_support alignment_support_scheme,tree init_addr,class loop ** at_loop)5689 vect_setup_realignment (vec_info *vinfo, stmt_vec_info stmt_info,
5690 			gimple_stmt_iterator *gsi, tree *realignment_token,
5691 			enum dr_alignment_support alignment_support_scheme,
5692 			tree init_addr,
5693 			class loop **at_loop)
5694 {
5695   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5696   loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
5697   dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
5698   struct data_reference *dr = dr_info->dr;
5699   class loop *loop = NULL;
5700   edge pe = NULL;
5701   tree scalar_dest = gimple_assign_lhs (stmt_info->stmt);
5702   tree vec_dest;
5703   gimple *inc;
5704   tree ptr;
5705   tree data_ref;
5706   basic_block new_bb;
5707   tree msq_init = NULL_TREE;
5708   tree new_temp;
5709   gphi *phi_stmt;
5710   tree msq = NULL_TREE;
5711   gimple_seq stmts = NULL;
5712   bool compute_in_loop = false;
5713   bool nested_in_vect_loop = false;
5714   class loop *containing_loop = (gimple_bb (stmt_info->stmt))->loop_father;
5715   class loop *loop_for_initial_load = NULL;
5716 
5717   if (loop_vinfo)
5718     {
5719       loop = LOOP_VINFO_LOOP (loop_vinfo);
5720       nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt_info);
5721     }
5722 
5723   gcc_assert (alignment_support_scheme == dr_explicit_realign
5724 	      || alignment_support_scheme == dr_explicit_realign_optimized);
5725 
5726   /* We need to generate three things:
5727      1. the misalignment computation
5728      2. the extra vector load (for the optimized realignment scheme).
5729      3. the phi node for the two vectors from which the realignment is
5730       done (for the optimized realignment scheme).  */
5731 
5732   /* 1. Determine where to generate the misalignment computation.
5733 
5734      If INIT_ADDR is NULL_TREE, this indicates that the misalignment
5735      calculation will be generated by this function, outside the loop (in the
5736      preheader).  Otherwise, INIT_ADDR had already been computed for us by the
5737      caller, inside the loop.
5738 
5739      Background: If the misalignment remains fixed throughout the iterations of
5740      the loop, then both realignment schemes are applicable, and also the
5741      misalignment computation can be done outside LOOP.  This is because we are
5742      vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
5743      are a multiple of VS (the Vector Size), and therefore the misalignment in
5744      different vectorized LOOP iterations is always the same.
5745      The problem arises only if the memory access is in an inner-loop nested
5746      inside LOOP, which is now being vectorized using outer-loop vectorization.
5747      This is the only case when the misalignment of the memory access may not
5748      remain fixed throughout the iterations of the inner-loop (as explained in
5749      detail in vect_supportable_dr_alignment).  In this case, not only is the
5750      optimized realignment scheme not applicable, but also the misalignment
5751      computation (and generation of the realignment token that is passed to
5752      REALIGN_LOAD) have to be done inside the loop.
5753 
5754      In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
5755      or not, which in turn determines if the misalignment is computed inside
5756      the inner-loop, or outside LOOP.  */
5757 
5758   if (init_addr != NULL_TREE || !loop_vinfo)
5759     {
5760       compute_in_loop = true;
5761       gcc_assert (alignment_support_scheme == dr_explicit_realign);
5762     }
5763 
5764 
5765   /* 2. Determine where to generate the extra vector load.
5766 
5767      For the optimized realignment scheme, instead of generating two vector
5768      loads in each iteration, we generate a single extra vector load in the
5769      preheader of the loop, and in each iteration reuse the result of the
5770      vector load from the previous iteration.  In case the memory access is in
5771      an inner-loop nested inside LOOP, which is now being vectorized using
5772      outer-loop vectorization, we need to determine whether this initial vector
5773      load should be generated at the preheader of the inner-loop, or can be
5774      generated at the preheader of LOOP.  If the memory access has no evolution
5775      in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
5776      to be generated inside LOOP (in the preheader of the inner-loop).  */
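  /* Rough sketch, in informal pseudocode, of the structure that the
     optimized realignment scheme produces.  This function creates MSQ_INIT,
     the realignment token RT and the PHI; the LSQ load and the REALIGN_LOAD
     itself are emitted later by the load vectorization code:

       msq_init = *floor (addr);               // extra load, loop preheader
       rt = mask_for_load (addr);              // realignment token
       loop:
	 msq = phi <msq_init, lsq>;
	 lsq = *(floor (addr) + VS);           // aligned load inside the loop
	 vec = REALIGN_LOAD <msq, lsq, rt>;    // the realigned vector value
	 addr += VS;  */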
5777 
5778   if (nested_in_vect_loop)
5779     {
5780       tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
5781       bool invariant_in_outerloop =
5782             (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
5783       loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
5784     }
5785   else
5786     loop_for_initial_load = loop;
5787   if (at_loop)
5788     *at_loop = loop_for_initial_load;
5789 
5790   if (loop_for_initial_load)
5791     pe = loop_preheader_edge (loop_for_initial_load);
5792 
5793   /* 3. For the case of the optimized realignment, create the first vector
5794       load at the loop preheader.  */
5795 
5796   if (alignment_support_scheme == dr_explicit_realign_optimized)
5797     {
5798       /* Create msq_init = *(floor(p1)) in the loop preheader  */
5799       gassign *new_stmt;
5800 
5801       gcc_assert (!compute_in_loop);
5802       vec_dest = vect_create_destination_var (scalar_dest, vectype);
5803       ptr = vect_create_data_ref_ptr (vinfo, stmt_info, vectype,
5804 				      loop_for_initial_load, NULL_TREE,
5805 				      &init_addr, NULL, &inc, true);
5806       if (TREE_CODE (ptr) == SSA_NAME)
5807 	new_temp = copy_ssa_name (ptr);
5808       else
5809 	new_temp = make_ssa_name (TREE_TYPE (ptr));
5810       poly_uint64 align = DR_TARGET_ALIGNMENT (dr_info);
5811       tree type = TREE_TYPE (ptr);
5812       new_stmt = gimple_build_assign
5813 		   (new_temp, BIT_AND_EXPR, ptr,
5814 		    fold_build2 (MINUS_EXPR, type,
5815 				 build_int_cst (type, 0),
5816 				 build_int_cst (type, align)));
5817       new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5818       gcc_assert (!new_bb);
5819       data_ref
5820 	= build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
5821 		  build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
5822       vect_copy_ref_info (data_ref, DR_REF (dr));
5823       new_stmt = gimple_build_assign (vec_dest, data_ref);
5824       new_temp = make_ssa_name (vec_dest, new_stmt);
5825       gimple_assign_set_lhs (new_stmt, new_temp);
5826       if (pe)
5827         {
5828           new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5829           gcc_assert (!new_bb);
5830         }
5831       else
5832          gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5833 
5834       msq_init = gimple_assign_lhs (new_stmt);
5835     }
5836 
5837   /* 4. Create realignment token using a target builtin, if available.
5838       It is done either inside the containing loop, or before LOOP (as
5839       determined above).  */
5840 
5841   if (targetm.vectorize.builtin_mask_for_load)
5842     {
5843       gcall *new_stmt;
5844       tree builtin_decl;
5845 
5846       /* Compute INIT_ADDR - the initial address accessed by this memref.  */
5847       if (!init_addr)
5848 	{
5849 	  /* Generate the INIT_ADDR computation outside LOOP.  */
5850 	  init_addr = vect_create_addr_base_for_vector_ref (vinfo,
5851 							    stmt_info, &stmts,
5852 							    NULL_TREE);
5853           if (loop)
5854             {
5855    	      pe = loop_preheader_edge (loop);
5856 	      new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
5857 	      gcc_assert (!new_bb);
5858             }
5859           else
5860              gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
5861 	}
5862 
5863       builtin_decl = targetm.vectorize.builtin_mask_for_load ();
5864       new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
5865       vec_dest =
5866 	vect_create_destination_var (scalar_dest,
5867 				     gimple_call_return_type (new_stmt));
5868       new_temp = make_ssa_name (vec_dest, new_stmt);
5869       gimple_call_set_lhs (new_stmt, new_temp);
5870 
5871       if (compute_in_loop)
5872 	gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5873       else
5874 	{
5875 	  /* Generate the misalignment computation outside LOOP.  */
5876 	  pe = loop_preheader_edge (loop);
5877 	  new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5878 	  gcc_assert (!new_bb);
5879 	}
5880 
5881       *realignment_token = gimple_call_lhs (new_stmt);
5882 
5883       /* The result of the CALL_EXPR to this builtin is determined from
5884          the value of the parameter and no global variables are touched,
5885          which makes the builtin a "const" function.  Requiring the
5886          builtin to have the "const" attribute makes it unnecessary
5887          to call mark_call_clobbered.  */
5888       gcc_assert (TREE_READONLY (builtin_decl));
5889     }
5890 
5891   if (alignment_support_scheme == dr_explicit_realign)
5892     return msq;
5893 
5894   gcc_assert (!compute_in_loop);
5895   gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
5896 
5897 
5898   /* 5. Create msq = phi <msq_init, lsq> in loop  */
5899 
5900   pe = loop_preheader_edge (containing_loop);
5901   vec_dest = vect_create_destination_var (scalar_dest, vectype);
5902   msq = make_ssa_name (vec_dest);
5903   phi_stmt = create_phi_node (msq, containing_loop->header);
5904   add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);
5905 
5906   return msq;
5907 }
5908 
5909 
5910 /* Function vect_grouped_load_supported.
5911 
5912    COUNT is the size of the load group (the number of statements plus the
5913    number of gaps).  SINGLE_ELEMENT_P is true if there is actually
5914    only one statement, with a gap of COUNT - 1.
5915 
5916    Returns true if a suitable permute exists.  */
5917 
5918 bool
5919 vect_grouped_load_supported (tree vectype, bool single_element_p,
5920 			     unsigned HOST_WIDE_INT count)
5921 {
5922   machine_mode mode = TYPE_MODE (vectype);
5923 
5924   /* If this is single-element interleaving with an element distance
5925      that leaves unused vector loads around, punt - we would at least
5926      create very sub-optimal code in that case (and blow up memory,
5927      see PR65518).  */
5928   if (single_element_p && maybe_gt (count, TYPE_VECTOR_SUBPARTS (vectype)))
5929     {
5930       if (dump_enabled_p ())
5931 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5932 			 "single-element interleaving not supported "
5933 			 "for non-adjacent vector loads\n");
5934       return false;
5935     }
5936 
5937   /* vect_permute_load_chain requires the group size to be equal to 3 or
5938      be a power of two.  */
5939   if (count != 3 && exact_log2 (count) == -1)
5940     {
5941       if (dump_enabled_p ())
5942 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5943 			 "the size of the group of accesses"
5944 			 " is not a power of 2 or not equal to 3\n");
5945       return false;
5946     }
5947 
5948   /* Check that the permutation is supported.  */
5949   if (VECTOR_MODE_P (mode))
5950     {
5951       unsigned int i, j;
5952       if (count == 3)
5953 	{
5954 	  unsigned int nelt;
5955 	  if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5956 	    {
5957 	      if (dump_enabled_p ())
5958 		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5959 				 "cannot handle groups of 3 loads for"
5960 				 " variable-length vectors\n");
5961 	      return false;
5962 	    }
5963 
5964 	  vec_perm_builder sel (nelt, nelt, 1);
5965 	  sel.quick_grow (nelt);
5966 	  vec_perm_indices indices;
5967 	  unsigned int k;
5968 	  for (k = 0; k < 3; k++)
5969 	    {
5970 	      for (i = 0; i < nelt; i++)
5971 		if (3 * i + k < 2 * nelt)
5972 		  sel[i] = 3 * i + k;
5973 		else
5974 		  sel[i] = 0;
5975 	      indices.new_vector (sel, 2, nelt);
5976 	      if (!can_vec_perm_const_p (mode, indices))
5977 		{
5978 		  if (dump_enabled_p ())
5979 		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5980 				     "shuffle of 3 loads is not supported by"
5981 				     " target\n");
5982 		  return false;
5983 		}
5984 	      for (i = 0, j = 0; i < nelt; i++)
5985 		if (3 * i + k < 2 * nelt)
5986 		  sel[i] = i;
5987 		else
5988 		  sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5989 	      indices.new_vector (sel, 2, nelt);
5990 	      if (!can_vec_perm_const_p (mode, indices))
5991 		{
5992 		  if (dump_enabled_p ())
5993 		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5994 				     "shuffle of 3 loads is not supported by"
5995 				     " target\n");
5996 		  return false;
5997 		}
5998 	    }
5999 	  return true;
6000 	}
6001       else
6002 	{
6003 	  /* If length is not equal to 3 then only a power of 2 is supported.  */
6004 	  gcc_assert (pow2p_hwi (count));
6005 	  poly_uint64 nelt = GET_MODE_NUNITS (mode);
6006 
6007 	  /* The encoding has a single stepped pattern.  */
6008 	  vec_perm_builder sel (nelt, 1, 3);
6009 	  sel.quick_grow (3);
6010 	  for (i = 0; i < 3; i++)
6011 	    sel[i] = i * 2;
6012 	  vec_perm_indices indices (sel, 2, nelt);
6013 	  if (can_vec_perm_const_p (mode, indices))
6014 	    {
6015 	      for (i = 0; i < 3; i++)
6016 		sel[i] = i * 2 + 1;
6017 	      indices.new_vector (sel, 2, nelt);
6018 	      if (can_vec_perm_const_p (mode, indices))
6019 		return true;
6020 	    }
6021         }
6022     }
6023 
6024   if (dump_enabled_p ())
6025     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6026 		     "extract even/odd not supported by target\n");
6027   return false;
6028 }
6029 
6030 /* Return TRUE if vec_{masked_}load_lanes is available for COUNT vectors of
6031    type VECTYPE.  MASKED_P says whether the masked form is needed.  */
6032 
6033 bool
6034 vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
6035 			   bool masked_p)
6036 {
6037   if (masked_p)
6038     return vect_lanes_optab_supported_p ("vec_mask_load_lanes",
6039 					 vec_mask_load_lanes_optab,
6040 					 vectype, count);
6041   else
6042     return vect_lanes_optab_supported_p ("vec_load_lanes",
6043 					 vec_load_lanes_optab,
6044 					 vectype, count);
6045 }
6046 
6047 /* Function vect_permute_load_chain.
6048 
6049    Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
6050    a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
6051    the input data correctly.  Return the final references for loads in
6052    RESULT_CHAIN.
6053 
6054    E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
6055    The input is 4 vectors, each containing 8 elements.  We assign a number to
6056    each element; the input sequence is:
6057 
6058    1st vec:   0  1  2  3  4  5  6  7
6059    2nd vec:   8  9 10 11 12 13 14 15
6060    3rd vec:  16 17 18 19 20 21 22 23
6061    4th vec:  24 25 26 27 28 29 30 31
6062 
6063    The output sequence should be:
6064 
6065    1st vec:  0 4  8 12 16 20 24 28
6066    2nd vec:  1 5  9 13 17 21 25 29
6067    3rd vec:  2 6 10 14 18 22 26 30
6068    4th vec:  3 7 11 15 19 23 27 31
6069 
6070    i.e., the first output vector should contain the first elements of each
6071    interleaving group, etc.
6072 
6073    We use extract_even/odd instructions to create such output.  The input of
6074    each extract_even/odd operation is two vectors
6075    1st vec    2nd vec
6076    0 1 2 3    4 5 6 7
6077 
6078    and the output is the vector of extracted even/odd elements.  The output of
6079    extract_even will be:   0 2 4 6
6080    and of extract_odd:     1 3 5 7
6081 
6082 
6083    The permutation is done in log LENGTH stages.  In each stage extract_even
6084    and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
6085    their order.  In our example,
6086 
6087    E1: extract_even (1st vec, 2nd vec)
6088    E2: extract_odd (1st vec, 2nd vec)
6089    E3: extract_even (3rd vec, 4th vec)
6090    E4: extract_odd (3rd vec, 4th vec)
6091 
6092    The output for the first stage will be:
6093 
6094    E1:  0  2  4  6  8 10 12 14
6095    E2:  1  3  5  7  9 11 13 15
6096    E3: 16 18 20 22 24 26 28 30
6097    E4: 17 19 21 23 25 27 29 31
6098 
6099    In order to proceed and create the correct sequence for the next stage (or
6100    for the correct output, if the second stage is the last one, as in our
6101    example), we first put the output of extract_even operation and then the
6102    output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
6103    The input for the second stage is:
6104 
6105    1st vec (E1):  0  2  4  6  8 10 12 14
6106    2nd vec (E3): 16 18 20 22 24 26 28 30
6107    3rd vec (E2):  1  3  5  7  9 11 13 15
6108    4th vec (E4): 17 19 21 23 25 27 29 31
6109 
6110    The output of the second stage:
6111 
6112    E1: 0 4  8 12 16 20 24 28
6113    E2: 2 6 10 14 18 22 26 30
6114    E3: 1 5  9 13 17 21 25 29
6115    E4: 3 7 11 15 19 23 27 31
6116 
6117    And RESULT_CHAIN after reordering:
6118 
6119    1st vec (E1):  0 4  8 12 16 20 24 28
6120    2nd vec (E3):  1 5  9 13 17 21 25 29
6121    3rd vec (E2):  2 6 10 14 18 22 26 30
6122    4th vec (E4):  3 7 11 15 19 23 27 31.  */
6123 
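/* Illustrative sketch only (not part of the vectorizer and not compiled): a
   self-contained simulation of the extract_even/extract_odd network described
   above, using std::vector<int> in place of vector registers.  Running it
   prints the RESULT_CHAIN of the documented example (LENGTH == 4, 8 elements
   per vector).  */
#if 0
#include <cstdio>
#include <vector>

/* Take the even (START == 0) or odd (START == 1) elements of the
   concatenation of A and B, like the extract_even/extract_odd permutes.  */
static std::vector<int>
extract (const std::vector<int> &a, const std::vector<int> &b, int start)
{
  std::vector<int> cat (a);
  cat.insert (cat.end (), b.begin (), b.end ());
  std::vector<int> r;
  for (size_t i = start; i < cat.size (); i += 2)
    r.push_back (cat[i]);
  return r;
}

int
main ()
{
  const unsigned length = 4, nelt = 8;
  std::vector<std::vector<int>> chain (length), result (length);
  for (unsigned v = 0; v < length; v++)
    for (unsigned e = 0; e < nelt; e++)
      chain[v].push_back (v * nelt + e);

  /* log2 (LENGTH) stages; the even extracts go to the first half of
     RESULT_CHAIN, the odd extracts to the second half, and the result
     feeds the next stage.  */
  for (unsigned stage = 0; stage < 2; stage++)
    {
      for (unsigned j = 0; j < length; j += 2)
	{
	  result[j / 2] = extract (chain[j], chain[j + 1], 0);
	  result[j / 2 + length / 2] = extract (chain[j], chain[j + 1], 1);
	}
      chain = result;
    }

  for (unsigned v = 0; v < length; v++)
    {
      for (unsigned e = 0; e < nelt; e++)
	printf (" %2d", result[v][e]);
      printf ("\n");	/* 0 4 8..28 / 1 5 9..29 / 2 6..30 / 3 7..31  */
    }
  return 0;
}
#endif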
6124 static void
6125 vect_permute_load_chain (vec_info *vinfo, vec<tree> dr_chain,
6126 			 unsigned int length,
6127 			 stmt_vec_info stmt_info,
6128 			 gimple_stmt_iterator *gsi,
6129 			 vec<tree> *result_chain)
6130 {
6131   tree data_ref, first_vect, second_vect;
6132   tree perm_mask_even, perm_mask_odd;
6133   tree perm3_mask_low, perm3_mask_high;
6134   gimple *perm_stmt;
6135   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6136   unsigned int i, j, log_length = exact_log2 (length);
6137 
6138   result_chain->quick_grow (length);
6139   memcpy (result_chain->address (), dr_chain.address (),
6140 	  length * sizeof (tree));
6141 
6142   if (length == 3)
6143     {
6144       /* vect_grouped_load_supported ensures that this is constant.  */
6145       unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
6146       unsigned int k;
6147 
6148       vec_perm_builder sel (nelt, nelt, 1);
6149       sel.quick_grow (nelt);
6150       vec_perm_indices indices;
6151       for (k = 0; k < 3; k++)
6152 	{
6153 	  for (i = 0; i < nelt; i++)
6154 	    if (3 * i + k < 2 * nelt)
6155 	      sel[i] = 3 * i + k;
6156 	    else
6157 	      sel[i] = 0;
6158 	  indices.new_vector (sel, 2, nelt);
6159 	  perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
6160 
6161 	  for (i = 0, j = 0; i < nelt; i++)
6162 	    if (3 * i + k < 2 * nelt)
6163 	      sel[i] = i;
6164 	    else
6165 	      sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
6166 	  indices.new_vector (sel, 2, nelt);
6167 	  perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
6168 
6169 	  first_vect = dr_chain[0];
6170 	  second_vect = dr_chain[1];
6171 
6172 	  /* Create interleaving stmt (low part of):
6173 	     low = VEC_PERM_EXPR <first_vect, second_vect, {k, 3 + k, 6 + k,
6174 							     ...}>  */
6175 	  data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
6176 	  perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
6177 					   second_vect, perm3_mask_low);
6178 	  vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6179 
6180 	  /* Create interleaving stmt (high part of):
6181 	     high = VEC_PERM_EXPR <first_vect, second_vect, {k, 3 + k, 6 + k,
6182 							      ...}>  */
6183 	  first_vect = data_ref;
6184 	  second_vect = dr_chain[2];
6185 	  data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
6186 	  perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
6187 					   second_vect, perm3_mask_high);
6188 	  vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6189 	  (*result_chain)[k] = data_ref;
6190 	}
6191     }
6192   else
6193     {
6194       /* If length is not equal to 3 then only power of 2 is supported.  */
6195       gcc_assert (pow2p_hwi (length));
6196 
6197       /* The encoding has a single stepped pattern.  */
6198       poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
6199       vec_perm_builder sel (nelt, 1, 3);
6200       sel.quick_grow (3);
6201       for (i = 0; i < 3; ++i)
6202 	sel[i] = i * 2;
6203       vec_perm_indices indices (sel, 2, nelt);
6204       perm_mask_even = vect_gen_perm_mask_checked (vectype, indices);
6205 
6206       for (i = 0; i < 3; ++i)
6207 	sel[i] = i * 2 + 1;
6208       indices.new_vector (sel, 2, nelt);
6209       perm_mask_odd = vect_gen_perm_mask_checked (vectype, indices);
6210 
6211       for (i = 0; i < log_length; i++)
6212 	{
6213 	  for (j = 0; j < length; j += 2)
6214 	    {
6215 	      first_vect = dr_chain[j];
6216 	      second_vect = dr_chain[j+1];
6217 
6218 	      /* data_ref = permute_even (first_data_ref, second_data_ref);  */
6219 	      data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
6220 	      perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6221 					       first_vect, second_vect,
6222 					       perm_mask_even);
6223 	      vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6224 	      (*result_chain)[j/2] = data_ref;
6225 
6226 	      /* data_ref = permute_odd (first_data_ref, second_data_ref);  */
6227 	      data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
6228 	      perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6229 					       first_vect, second_vect,
6230 					       perm_mask_odd);
6231 	      vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6232 	      (*result_chain)[j/2+length/2] = data_ref;
6233 	    }
6234 	  memcpy (dr_chain.address (), result_chain->address (),
6235 		  length * sizeof (tree));
6236 	}
6237     }
6238 }
6239 
6240 /* Function vect_shift_permute_load_chain.
6241 
6242    Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate a
6243    sequence of stmts to reorder the input data accordingly.
6244    Return the final references for loads in RESULT_CHAIN.
6245    Return true if successful, false otherwise.
6246 
6247    E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
6248    The input is 3 vectors each containing 8 elements.  We assign a
6249    number to each element, the input sequence is:
6250 
6251    1st vec:   0  1  2  3  4  5  6  7
6252    2nd vec:   8  9 10 11 12 13 14 15
6253    3rd vec:  16 17 18 19 20 21 22 23
6254 
6255    The output sequence should be:
6256 
6257    1st vec:  0 3 6  9 12 15 18 21
6258    2nd vec:  1 4 7 10 13 16 19 22
6259    3rd vec:  2 5 8 11 14 17 20 23
6260 
6261    We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.
6262 
6263    First we shuffle all 3 vectors to get correct elements order:
6264 
6265    1st vec:  ( 0  3  6) ( 1  4  7) ( 2  5)
6266    2nd vec:  ( 8 11 14) ( 9 12 15) (10 13)
6267    3rd vec:  (16 19 22) (17 20 23) (18 21)
6268 
6269    Next we combine and shift the vectors 3 times:
6270 
6271    1st step:
6272      shift right by 6 the concatenation of:
6273      "1st vec" and  "2nd vec"
6274        ( 0  3  6) ( 1  4  7) |( 2  5) _ ( 8 11 14) ( 9 12 15)| (10 13)
6275      "2nd vec" and  "3rd vec"
6276        ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
6277      "3rd vec" and  "1st vec"
6278        (16 19 22) (17 20 23) |(18 21) _ ( 0  3  6) ( 1  4  7)| ( 2  5)
6279 			     | New vectors                   |
6280 
6281      So that now new vectors are:
6282 
6283      1st vec:  ( 2  5) ( 8 11 14) ( 9 12 15)
6284      2nd vec:  (10 13) (16 19 22) (17 20 23)
6285      3rd vec:  (18 21) ( 0  3  6) ( 1  4  7)
6286 
6287    2nd step:
6288      shift right by 5 the concatenation of:
6289      "1st vec" and  "3rd vec"
6290        ( 2  5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0  3  6)| ( 1  4  7)
6291      "2nd vec" and  "1st vec"
6292        (10 13) (16 19 22) |(17 20 23) _ ( 2  5) ( 8 11 14)| ( 9 12 15)
6293      "3rd vec" and  "2nd vec"
6294        (18 21) ( 0  3  6) |( 1  4  7) _ (10 13) (16 19 22)| (17 20 23)
6295 			  | New vectors                   |
6296 
6297      So that now new vectors are:
6298 
6299      1st vec:  ( 9 12 15) (18 21) ( 0  3  6)
6300      2nd vec:  (17 20 23) ( 2  5) ( 8 11 14)
6301      3rd vec:  ( 1  4  7) (10 13) (16 19 22) READY
6302 
6303    3rd step:
6304      shift right by 5 the concatenation of:
6305      "1st vec" and  "1st vec"
6306        ( 9 12 15) (18 21) |( 0  3  6) _ ( 9 12 15) (18 21)| ( 0  3  6)
6307      shift right by 3 the concatenation of:
6308      "2nd vec" and  "2nd vec"
6309                (17 20 23) |( 2  5) ( 8 11 14) _ (17 20 23)| ( 2  5) ( 8 11 14)
6310 			  | New vectors                   |
6311 
6312      So that now all vectors are READY:
6313      1st vec:  ( 0  3  6) ( 9 12 15) (18 21)
6314      2nd vec:  ( 2  5) ( 8 11 14) (17 20 23)
6315      3rd vec:  ( 1  4  7) (10 13) (16 19 22)
6316 
6317    This algorithm is faster than the one in vect_permute_load_chain if:
6318      1.  "shift of a concatenation" is faster than general permutation.
6319 	 This is usually so.
6320      2.  The TARGET machine can't execute vector instructions in parallel.
6321 	 This is because each step of the algorithm depends on the previous one.
6322 	 The algorithm in vect_permute_load_chain is much more parallel.
6323 
6324    The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.
6325 */
6326 
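/* Illustrative sketch only (not part of the vectorizer and not compiled): a
   self-contained simulation of the LENGTH == 3 shift-based reordering above
   for 8-element vectors.  The masks are the ones the function below builds
   for nelt == 8 (perm3, shift1..shift4); apply_perm selects elements from
   the concatenation of its two inputs, like VEC_PERM_EXPR.  */
#if 0
#include <cstdio>
#include <vector>

static std::vector<int>
apply_perm (const std::vector<int> &a, const std::vector<int> &b,
	    const std::vector<int> &mask)
{
  std::vector<int> cat (a);
  cat.insert (cat.end (), b.begin (), b.end ());
  std::vector<int> r;
  for (int m : mask)
    r.push_back (cat[m]);
  return r;
}

int
main ()
{
  const unsigned nelt = 8;
  std::vector<int> dr[3], vect[3], vect_shift[3], result[3];
  for (unsigned v = 0; v < 3; v++)
    for (unsigned e = 0; e < nelt; e++)
      dr[v].push_back (v * nelt + e);

  /* The masks built below, written out for nelt == 8.  */
  std::vector<int> perm3 = { 0, 3, 6, 1, 4, 7, 2, 5 };
  std::vector<int> shift1 = { 6, 7, 8, 9, 10, 11, 12, 13 };
  std::vector<int> shift2 = { 5, 6, 7, 8, 9, 10, 11, 12 };
  std::vector<int> shift3 = { 3, 4, 5, 6, 7, 8, 9, 10 };
  std::vector<int> shift4 = { 5, 6, 7, 8, 9, 10, 11, 12 };

  for (unsigned k = 0; k < 3; k++)	/* in-vector shuffle  */
    vect[k] = apply_perm (dr[k], dr[k], perm3);
  for (unsigned k = 0; k < 3; k++)	/* 1st shift step  */
    vect_shift[k] = apply_perm (vect[k % 3], vect[(k + 1) % 3], shift1);
  for (unsigned k = 0; k < 3; k++)	/* 2nd shift step  */
    vect[k] = apply_perm (vect_shift[(4 - k) % 3], vect_shift[(3 - k) % 3],
			  shift2);

  /* 3rd shift step and final placement, as in the function below.  */
  result[3 - (nelt % 3)] = vect[2];
  result[nelt % 3] = apply_perm (vect[0], vect[0], shift3);
  result[0] = apply_perm (vect[1], vect[1], shift4);

  for (unsigned v = 0; v < 3; v++)
    {
      for (unsigned e = 0; e < nelt; e++)
	printf (" %2d", result[v][e]);
      printf ("\n");	/* 0 3 6..21 / 1 4 7..22 / 2 5 8..23  */
    }
  return 0;
}
#endif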
6327 static bool
6328 vect_shift_permute_load_chain (vec_info *vinfo, vec<tree> dr_chain,
6329 			       unsigned int length,
6330 			       stmt_vec_info stmt_info,
6331 			       gimple_stmt_iterator *gsi,
6332 			       vec<tree> *result_chain)
6333 {
6334   tree vect[3], vect_shift[3], data_ref, first_vect, second_vect;
6335   tree perm2_mask1, perm2_mask2, perm3_mask;
6336   tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask;
6337   gimple *perm_stmt;
6338 
6339   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6340   unsigned int i;
6341   loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
6342 
6343   unsigned HOST_WIDE_INT nelt, vf;
6344   if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nelt)
6345       || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&vf))
6346     /* Not supported for variable-length vectors.  */
6347     return false;
6348 
6349   vec_perm_builder sel (nelt, nelt, 1);
6350   sel.quick_grow (nelt);
6351 
6352   result_chain->quick_grow (length);
6353   memcpy (result_chain->address (), dr_chain.address (),
6354 	  length * sizeof (tree));
6355 
6356   if (pow2p_hwi (length) && vf > 4)
6357     {
6358       unsigned int j, log_length = exact_log2 (length);
6359       for (i = 0; i < nelt / 2; ++i)
6360 	sel[i] = i * 2;
6361       for (i = 0; i < nelt / 2; ++i)
6362 	sel[nelt / 2 + i] = i * 2 + 1;
6363       vec_perm_indices indices (sel, 2, nelt);
6364       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6365 	{
6366 	  if (dump_enabled_p ())
6367 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6368 			     "shuffle of 2 fields structure is not "
6369 			     "supported by target\n");
6370 	  return false;
6371 	}
6372       perm2_mask1 = vect_gen_perm_mask_checked (vectype, indices);
6373 
6374       for (i = 0; i < nelt / 2; ++i)
6375 	sel[i] = i * 2 + 1;
6376       for (i = 0; i < nelt / 2; ++i)
6377 	sel[nelt / 2 + i] = i * 2;
6378       indices.new_vector (sel, 2, nelt);
6379       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6380 	{
6381 	  if (dump_enabled_p ())
6382 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6383 			     "shuffle of 2 fields structure is not "
6384 			     "supported by target\n");
6385 	  return false;
6386 	}
6387       perm2_mask2 = vect_gen_perm_mask_checked (vectype, indices);
6388 
6389       /* Generating permutation constant to shift all elements.
6390 	 For vector length 8 it is {4 5 6 7 8 9 10 11}.  */
6391       for (i = 0; i < nelt; i++)
6392 	sel[i] = nelt / 2 + i;
6393       indices.new_vector (sel, 2, nelt);
6394       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6395 	{
6396 	  if (dump_enabled_p ())
6397 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6398 			     "shift permutation is not supported by target\n");
6399 	  return false;
6400 	}
6401       shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
6402 
6403       /* Generating permutation constant to select vector from 2.
6404 	 For vector length 8 it is {0 1 2 3 12 13 14 15}.  */
6405       for (i = 0; i < nelt / 2; i++)
6406 	sel[i] = i;
6407       for (i = nelt / 2; i < nelt; i++)
6408 	sel[i] = nelt + i;
6409       indices.new_vector (sel, 2, nelt);
6410       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6411 	{
6412 	  if (dump_enabled_p ())
6413 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6414 			     "select is not supported by target\n");
6415 	  return false;
6416 	}
6417       select_mask = vect_gen_perm_mask_checked (vectype, indices);
6418 
6419       for (i = 0; i < log_length; i++)
6420 	{
6421 	  for (j = 0; j < length; j += 2)
6422 	    {
6423 	      first_vect = dr_chain[j];
6424 	      second_vect = dr_chain[j + 1];
6425 
6426 	      data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
6427 	      perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6428 					       first_vect, first_vect,
6429 					       perm2_mask1);
6430 	      vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6431 	      vect[0] = data_ref;
6432 
6433 	      data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
6434 	      perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6435 					       second_vect, second_vect,
6436 					       perm2_mask2);
6437 	      vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6438 	      vect[1] = data_ref;
6439 
6440 	      data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift");
6441 	      perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6442 					       vect[0], vect[1], shift1_mask);
6443 	      vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6444 	      (*result_chain)[j/2 + length/2] = data_ref;
6445 
6446 	      data_ref = make_temp_ssa_name (vectype, NULL, "vect_select");
6447 	      perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6448 					       vect[0], vect[1], select_mask);
6449 	      vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6450 	      (*result_chain)[j/2] = data_ref;
6451 	    }
6452 	  memcpy (dr_chain.address (), result_chain->address (),
6453 		  length * sizeof (tree));
6454 	}
6455       return true;
6456     }
6457   if (length == 3 && vf > 2)
6458     {
6459       unsigned int k = 0, l = 0;
6460 
6461       /* Generating permutation constant to get all elements in right order.
6462 	 For vector length 8 it is {0 3 6 1 4 7 2 5}.  */
6463       for (i = 0; i < nelt; i++)
6464 	{
6465 	  if (3 * k + (l % 3) >= nelt)
6466 	    {
6467 	      k = 0;
6468 	      l += (3 - (nelt % 3));
6469 	    }
6470 	  sel[i] = 3 * k + (l % 3);
6471 	  k++;
6472 	}
6473       vec_perm_indices indices (sel, 2, nelt);
6474       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6475 	{
6476 	  if (dump_enabled_p ())
6477 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6478 			     "shuffle of 3 fields structure is not "
6479 			     "supported by target\n");
6480 	  return false;
6481 	}
6482       perm3_mask = vect_gen_perm_mask_checked (vectype, indices);
6483 
6484       /* Generating permutation constant to shift all elements.
6485 	 For vector length 8 it is {6 7 8 9 10 11 12 13}.  */
6486       for (i = 0; i < nelt; i++)
6487 	sel[i] = 2 * (nelt / 3) + (nelt % 3) + i;
6488       indices.new_vector (sel, 2, nelt);
6489       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6490 	{
6491 	  if (dump_enabled_p ())
6492 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6493 			     "shift permutation is not supported by target\n");
6494 	  return false;
6495 	}
6496       shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
6497 
6498       /* Generating permutation constant to shift all elements.
6499 	 For vector length 8 it is {5 6 7 8 9 10 11 12}.  */
6500       for (i = 0; i < nelt; i++)
6501 	sel[i] = 2 * (nelt / 3) + 1 + i;
6502       indices.new_vector (sel, 2, nelt);
6503       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6504 	{
6505 	  if (dump_enabled_p ())
6506 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6507 			     "shift permutation is not supported by target\n");
6508 	  return false;
6509 	}
6510       shift2_mask = vect_gen_perm_mask_checked (vectype, indices);
6511 
6512       /* Generating permutation constant to shift all elements.
6513 	 For vector length 8 it is {3 4 5 6 7 8 9 10}.  */
6514       for (i = 0; i < nelt; i++)
6515 	sel[i] = (nelt / 3) + (nelt % 3) / 2 + i;
6516       indices.new_vector (sel, 2, nelt);
6517       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6518 	{
6519 	  if (dump_enabled_p ())
6520 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6521 			     "shift permutation is not supported by target\n");
6522 	  return false;
6523 	}
6524       shift3_mask = vect_gen_perm_mask_checked (vectype, indices);
6525 
6526       /* Generating permutation constant to shift all elements.
6527 	 For vector length 8 it is {5 6 7 8 9 10 11 12}.  */
6528       for (i = 0; i < nelt; i++)
6529 	sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i;
6530       indices.new_vector (sel, 2, nelt);
6531       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6532 	{
6533 	  if (dump_enabled_p ())
6534 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6535 			     "shift permutation is not supported by target\n");
6536 	  return false;
6537 	}
6538       shift4_mask = vect_gen_perm_mask_checked (vectype, indices);
6539 
6540       for (k = 0; k < 3; k++)
6541 	{
6542 	  data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3");
6543 	  perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6544 					   dr_chain[k], dr_chain[k],
6545 					   perm3_mask);
6546 	  vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6547 	  vect[k] = data_ref;
6548 	}
6549 
6550       for (k = 0; k < 3; k++)
6551 	{
6552 	  data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1");
6553 	  perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6554 					   vect[k % 3], vect[(k + 1) % 3],
6555 					   shift1_mask);
6556 	  vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6557 	  vect_shift[k] = data_ref;
6558 	}
6559 
6560       for (k = 0; k < 3; k++)
6561 	{
6562 	  data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2");
6563 	  perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6564 					   vect_shift[(4 - k) % 3],
6565 					   vect_shift[(3 - k) % 3],
6566 					   shift2_mask);
6567 	  vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6568 	  vect[k] = data_ref;
6569 	}
6570 
6571       (*result_chain)[3 - (nelt % 3)] = vect[2];
6572 
6573       data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3");
6574       perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[0],
6575 				       vect[0], shift3_mask);
6576       vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6577       (*result_chain)[nelt % 3] = data_ref;
6578 
6579       data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4");
6580       perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[1],
6581 				       vect[1], shift4_mask);
6582       vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6583       (*result_chain)[0] = data_ref;
6584       return true;
6585     }
6586   return false;
6587 }
6588 
6589 /* Function vect_transform_grouped_load.
6590 
6591    Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
6592    to perform their permutation and ascribe the resulting vectorized statements
6593    to the scalar statements.
6594 */
6595 
6596 void
6597 vect_transform_grouped_load (vec_info *vinfo, stmt_vec_info stmt_info,
6598 			     vec<tree> dr_chain,
6599 			     int size, gimple_stmt_iterator *gsi)
6600 {
6601   machine_mode mode;
6602   vec<tree> result_chain = vNULL;
6603 
6604   /* DR_CHAIN contains the input data-refs that are part of the interleaving.
6605      RESULT_CHAIN is the output of vect_permute_load_chain; it contains the
6606      permuted vectors that are ready for vector computation.  */
6607   result_chain.create (size);
6608 
6609   /* If the reassociation width for the vector type is 2 or greater, the target
6610      machine can execute 2 or more vector instructions in parallel.  Otherwise
6611      try to get the chain for the load group using vect_shift_permute_load_chain.  */
6612   mode = TYPE_MODE (STMT_VINFO_VECTYPE (stmt_info));
6613   if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1
6614       || pow2p_hwi (size)
6615       || !vect_shift_permute_load_chain (vinfo, dr_chain, size, stmt_info,
6616 					 gsi, &result_chain))
6617     vect_permute_load_chain (vinfo, dr_chain,
6618 			     size, stmt_info, gsi, &result_chain);
6619   vect_record_grouped_load_vectors (vinfo, stmt_info, result_chain);
6620   result_chain.release ();
6621 }
6622 
6623 /* RESULT_CHAIN contains the output of a group of grouped loads that were
6624    generated as part of the vectorization of STMT_INFO.  Assign the statement
6625    for each vector to the associated scalar statement.  */
6626 
6627 void
6628 vect_record_grouped_load_vectors (vec_info *, stmt_vec_info stmt_info,
6629 				  vec<tree> result_chain)
6630 {
6631   stmt_vec_info first_stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
6632   unsigned int i, gap_count;
6633   tree tmp_data_ref;
6634 
6635   /* Put a permuted data-ref in the VECTORIZED_STMT field.
6636      Since we scan the chain starting from its first node, their order
6637      corresponds to the order of data-refs in RESULT_CHAIN.  */
6638   stmt_vec_info next_stmt_info = first_stmt_info;
6639   gap_count = 1;
6640   FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
6641     {
6642       if (!next_stmt_info)
6643 	break;
6644 
6645       /* Skip the gaps.  Loads created for the gaps will be removed by the
6646        dead code elimination pass later.  No need to check for the first
6647        stmt in the group, since it always exists.
6648        DR_GROUP_GAP is the number of steps in elements from the previous
6649        access (if there is no gap, DR_GROUP_GAP is 1).  We skip loads that
6650        correspond to the gaps.  */
6651       if (next_stmt_info != first_stmt_info
6652 	  && gap_count < DR_GROUP_GAP (next_stmt_info))
6653 	{
6654 	  gap_count++;
6655 	  continue;
6656 	}
6657 
6658       /* ???  The following needs cleanup after the removal of
6659          DR_GROUP_SAME_DR_STMT.  */
6660       if (next_stmt_info)
6661         {
6662 	  gimple *new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
6663 	  /* We assume that if VEC_STMT is not NULL, this is a case of multiple
6664 	     copies, and we put the new vector statement last.  */
6665 	  STMT_VINFO_VEC_STMTS (next_stmt_info).safe_push (new_stmt);
6666 
6667 	  next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
6668 	  gap_count = 1;
6669         }
6670     }
6671 }
6672 
6673 /* Function vect_can_force_dr_alignment_p.
6674 
6675    Returns whether the alignment of a DECL can be forced to be aligned
6676    on an ALIGNMENT-bit boundary.  */
6677 
6678 bool
6679 vect_can_force_dr_alignment_p (const_tree decl, poly_uint64 alignment)
6680 {
6681   if (!VAR_P (decl))
6682     return false;
6683 
6684   if (decl_in_symtab_p (decl)
6685       && !symtab_node::get (decl)->can_increase_alignment_p ())
6686     return false;
6687 
6688   if (TREE_STATIC (decl))
6689     return (known_le (alignment,
6690 		      (unsigned HOST_WIDE_INT) MAX_OFILE_ALIGNMENT));
6691   else
6692     return (known_le (alignment, (unsigned HOST_WIDE_INT) MAX_STACK_ALIGNMENT));
6693 }
6694 
6695 /* Return whether the data reference DR_INFO is supported with respect to its
6696    alignment.
6697    If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even
6698    if it is aligned, i.e., check if it is possible to vectorize it with different
6699    alignment.  */
6700 
6701 enum dr_alignment_support
6702 vect_supportable_dr_alignment (vec_info *vinfo, dr_vec_info *dr_info,
6703 			       tree vectype, int misalignment)
6704 {
6705   data_reference *dr = dr_info->dr;
6706   stmt_vec_info stmt_info = dr_info->stmt;
6707   machine_mode mode = TYPE_MODE (vectype);
6708   loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
6709   class loop *vect_loop = NULL;
6710   bool nested_in_vect_loop = false;
6711 
6712   if (misalignment == 0)
6713     return dr_aligned;
6714 
6715   /* For now assume all conditional loads/stores support unaligned
6716      access without any special code.  */
6717   if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
6718     if (gimple_call_internal_p (stmt)
6719 	&& (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
6720 	    || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
6721       return dr_unaligned_supported;
6722 
6723   if (loop_vinfo)
6724     {
6725       vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
6726       nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt_info);
6727     }
6728 
6729   /* Possibly unaligned access.  */
6730 
6731   /* We can choose between using the implicit realignment scheme (generating
6732      a misaligned_move stmt) and the explicit realignment scheme (generating
6733      aligned loads with a REALIGN_LOAD).  There are two variants to the
6734      explicit realignment scheme: optimized, and unoptimized.
6735      We can optimize the realignment only if the step between consecutive
6736      vector loads is equal to the vector size.  Since the vector memory
6737      accesses advance in steps of VS (Vector Size) in the vectorized loop, it
6738      is guaranteed that the misalignment amount remains the same throughout the
6739      execution of the vectorized loop.  Therefore, we can create the
6740      "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
6741      at the loop preheader.
6742 
6743      However, in the case of outer-loop vectorization, when vectorizing a
6744      memory access in the inner-loop nested within the LOOP that is now being
6745      vectorized, while it is guaranteed that the misalignment of the
6746      vectorized memory access will remain the same in different outer-loop
6747      iterations, it is *not* guaranteed that is will remain the same throughout
6748      the execution of the inner-loop.  This is because the inner-loop advances
6749      with the original scalar step (and not in steps of VS).  If the inner-loop
6750      step happens to be a multiple of VS, then the misalignment remains fixed
6751      and we can use the optimized realignment scheme.  For example:
6752 
6753       for (i=0; i<N; i++)
6754         for (j=0; j<M; j++)
6755           s += a[i+j];
6756 
6757      When vectorizing the i-loop in the above example, the step between
6758      consecutive vector loads is 1, and so the misalignment does not remain
6759      fixed across the execution of the inner-loop, and the realignment cannot
6760      be optimized (as illustrated in the following pseudo vectorized loop):
6761 
6762       for (i=0; i<N; i+=4)
6763         for (j=0; j<M; j++){
6764           vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
6765                          // when j is {0,1,2,3,4,5,6,7,...} respectively.
6766                          // (assuming that we start from an aligned address).
6767           }
6768 
6769      We therefore have to use the unoptimized realignment scheme:
6770 
6771       for (i=0; i<N; i+=4)
6772           for (j=k; j<M; j+=4)
6773           vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
6774                            // that the misalignment of the initial address is
6775                            // 0).
6776 
6777      The loop can then be vectorized as follows:
6778 
6779       for (k=0; k<4; k++){
6780         rt = get_realignment_token (&vp[k]);
6781         for (i=0; i<N; i+=4){
6782           v1 = vp[i+k];
6783           for (j=k; j<M; j+=4){
6784             v2 = vp[i+j+VS-1];
6785             va = REALIGN_LOAD <v1,v2,rt>;
6786             vs += va;
6787             v1 = v2;
6788           }
6789         }
6790     } */
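  /* Elementwise, REALIGN_LOAD <v1, v2, rt> above roughly selects VS
     consecutive elements, starting at the element misalignment K encoded by
     the realignment token RT, from the concatenation of the two aligned
     loads, i.e. approximately:

       for (e = 0; e < VS; e++)
	 va[e] = e + K < VS ? v1[e + K] : v2[e + K - VS];  */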
6791 
6792   if (DR_IS_READ (dr))
6793     {
6794       if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
6795 	  && (!targetm.vectorize.builtin_mask_for_load
6796 	      || targetm.vectorize.builtin_mask_for_load ()))
6797 	{
6798 	  /* If we are doing SLP then the accesses need not have the
6799 	     same alignment; instead it depends on the SLP group size.  */
6800 	  if (loop_vinfo
6801 	      && STMT_SLP_TYPE (stmt_info)
6802 	      && !multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
6803 			      * (DR_GROUP_SIZE
6804 				 (DR_GROUP_FIRST_ELEMENT (stmt_info))),
6805 			      TYPE_VECTOR_SUBPARTS (vectype)))
6806 	    ;
6807 	  else if (!loop_vinfo
6808 		   || (nested_in_vect_loop
6809 		       && maybe_ne (TREE_INT_CST_LOW (DR_STEP (dr)),
6810 				    GET_MODE_SIZE (TYPE_MODE (vectype)))))
6811 	    return dr_explicit_realign;
6812 	  else
6813 	    return dr_explicit_realign_optimized;
6814 	}
6815     }
6816 
6817   bool is_packed = false;
6818   tree type = TREE_TYPE (DR_REF (dr));
6819   if (misalignment == DR_MISALIGNMENT_UNKNOWN)
6820     is_packed = not_size_aligned (DR_REF (dr));
6821   if (targetm.vectorize.support_vector_misalignment (mode, type, misalignment,
6822 						     is_packed))
6823     return dr_unaligned_supported;
6824 
6825   /* Unsupported.  */
6826   return dr_unaligned_unsupported;
6827 }
6828