1 /* Data References Analysis and Manipulation Utilities for Vectorization.
2    Copyright (C) 2003-2020 Free Software Foundation, Inc.
3    Contributed by Dorit Naishlos <dorit@il.ibm.com>
4    and Ira Rosen <irar@il.ibm.com>
5 
6 This file is part of GCC.
7 
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
12 
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
16 for more details.
17 
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3.  If not see
20 <http://www.gnu.org/licenses/>.  */
21 
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "predict.h"
31 #include "memmodel.h"
32 #include "tm_p.h"
33 #include "ssa.h"
34 #include "optabs-tree.h"
35 #include "cgraph.h"
36 #include "dumpfile.h"
37 #include "alias.h"
38 #include "fold-const.h"
39 #include "stor-layout.h"
40 #include "tree-eh.h"
41 #include "gimplify.h"
42 #include "gimple-iterator.h"
43 #include "gimplify-me.h"
44 #include "tree-ssa-loop-ivopts.h"
45 #include "tree-ssa-loop-manip.h"
46 #include "tree-ssa-loop.h"
47 #include "cfgloop.h"
48 #include "tree-scalar-evolution.h"
49 #include "tree-vectorizer.h"
50 #include "expr.h"
51 #include "builtins.h"
52 #include "tree-cfg.h"
53 #include "tree-hash-traits.h"
54 #include "vec-perm-indices.h"
55 #include "internal-fn.h"
56 
57 /* Return true if load- or store-lanes optab OPTAB is implemented for
58    COUNT vectors of type VECTYPE.  NAME is the name of OPTAB.  */
59 
60 static bool
61 vect_lanes_optab_supported_p (const char *name, convert_optab optab,
62 			      tree vectype, unsigned HOST_WIDE_INT count)
63 {
64   machine_mode mode, array_mode;
65   bool limit_p;
66 
67   mode = TYPE_MODE (vectype);
68   if (!targetm.array_mode (mode, count).exists (&array_mode))
69     {
70       poly_uint64 bits = count * GET_MODE_BITSIZE (mode);
71       limit_p = !targetm.array_mode_supported_p (mode, count);
72       if (!int_mode_for_size (bits, limit_p).exists (&array_mode))
73 	{
74 	  if (dump_enabled_p ())
75 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
76 			     "no array mode for %s[%wu]\n",
77 			     GET_MODE_NAME (mode), count);
78 	  return false;
79 	}
80     }
81 
82   if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
83     {
84       if (dump_enabled_p ())
85 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
86                          "cannot use %s<%s><%s>\n", name,
87                          GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
88       return false;
89     }
90 
91   if (dump_enabled_p ())
92     dump_printf_loc (MSG_NOTE, vect_location,
93                      "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode),
94                      GET_MODE_NAME (mode));
95 
96   return true;
97 }
98 
99 
100 /* Return the smallest scalar part of STMT_INFO.
101    This is used to determine the vectype of the stmt.  We generally set the
102    vectype according to the type of the result (lhs).  For stmts whose
103    result-type is different from the type of the arguments (e.g., demotion,
104    promotion), vectype will be reset appropriately (later).  Note that we have
105    to visit the smallest datatype in this function, because that determines the
106    VF.  If the smallest datatype in the loop is present only as the rhs of a
107    promotion operation - we'd miss it.
108    Such a case, where a variable of this datatype does not appear in the lhs
109    anywhere in the loop, can only occur if it's an invariant: e.g.:
110    'int_x = (int) short_inv', which we'd expect to have been optimized away by
111    invariant motion.  However, we cannot rely on invariant motion to always
112    take invariants out of the loop, and so in the case of promotion we also
113    have to check the rhs.
114    LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
115    types.  */
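/* For example (a hypothetical scalar loop, not taken from any testcase):

	short *s;  int *d;
	for (i = 0; i < n; i++)
	  d[i] = (int) s[i] + 1;

   Here the widening conversion has an 'int' lhs but a 'short' rhs; the
   'short' is the smallest datatype in the loop and determines the VF,
   which is why the rhs has to be inspected for promotions as well.  */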
116 
117 tree
118 vect_get_smallest_scalar_type (stmt_vec_info stmt_info,
119 			       HOST_WIDE_INT *lhs_size_unit,
120 			       HOST_WIDE_INT *rhs_size_unit)
121 {
122   tree scalar_type = gimple_expr_type (stmt_info->stmt);
123   HOST_WIDE_INT lhs, rhs;
124 
125   /* During the analysis phase, this function is called on arbitrary
126      statements that might not have scalar results.  */
127   if (!tree_fits_uhwi_p (TYPE_SIZE_UNIT (scalar_type)))
128     return scalar_type;
129 
130   lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
131 
132   gassign *assign = dyn_cast <gassign *> (stmt_info->stmt);
133   if (assign
134       && (gimple_assign_cast_p (assign)
135 	  || gimple_assign_rhs_code (assign) == DOT_PROD_EXPR
136 	  || gimple_assign_rhs_code (assign) == WIDEN_SUM_EXPR
137 	  || gimple_assign_rhs_code (assign) == WIDEN_MULT_EXPR
138 	  || gimple_assign_rhs_code (assign) == WIDEN_LSHIFT_EXPR
139 	  || gimple_assign_rhs_code (assign) == FLOAT_EXPR))
140     {
141       tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (assign));
142 
143       rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
144       if (rhs < lhs)
145         scalar_type = rhs_type;
146     }
147   else if (gcall *call = dyn_cast <gcall *> (stmt_info->stmt))
148     {
149       unsigned int i = 0;
150       if (gimple_call_internal_p (call))
151 	{
152 	  internal_fn ifn = gimple_call_internal_fn (call);
153 	  if (internal_load_fn_p (ifn) || internal_store_fn_p (ifn))
154 	    /* gimple_expr_type already picked the type of the loaded
155 	       or stored data.  */
156 	    i = ~0U;
157 	  else if (internal_fn_mask_index (ifn) == 0)
158 	    i = 1;
159 	}
160       if (i < gimple_call_num_args (call))
161 	{
162 	  tree rhs_type = TREE_TYPE (gimple_call_arg (call, i));
163 	  if (tree_fits_uhwi_p (TYPE_SIZE_UNIT (rhs_type)))
164 	    {
165 	      rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
166 	      if (rhs < lhs)
167 		scalar_type = rhs_type;
168 	    }
169 	}
170     }
171 
172   *lhs_size_unit = lhs;
173   *rhs_size_unit = rhs;
174   return scalar_type;
175 }
176 
177 
178 /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
179    tested at run-time.  Return TRUE if DDR was successfully inserted.
180    Return false if versioning is not supported.  */
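/* For example (hypothetical):

	void f (int *a, int *b, int n)
	{
	  for (int i = 0; i < n; i++)
	    a[i] = b[i] + 1;
	}

   Nothing proves that A and B do not overlap, so the DDR for the pair
   (a[i], b[i]) is queued here and the loop is versioned with a run-time
   check that the two accessed regions are disjoint.  */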
181 
182 static opt_result
183 vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
184 {
185   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
186 
187   if ((unsigned) param_vect_max_version_for_alias_checks == 0)
188     return opt_result::failure_at (vect_location,
189 				   "will not create alias checks, as"
190 				   " --param vect-max-version-for-alias-checks"
191 				   " == 0\n");
192 
193   opt_result res
194     = runtime_alias_check_p (ddr, loop,
195 			     optimize_loop_nest_for_speed_p (loop));
196   if (!res)
197     return res;
198 
199   LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
200   return opt_result::success ();
201 }
202 
203 /* Record that loop LOOP_VINFO needs to check that VALUE is nonzero.  */
204 
205 static void
206 vect_check_nonzero_value (loop_vec_info loop_vinfo, tree value)
207 {
208   vec<tree> checks = LOOP_VINFO_CHECK_NONZERO (loop_vinfo);
209   for (unsigned int i = 0; i < checks.length(); ++i)
210     if (checks[i] == value)
211       return;
212 
213   if (dump_enabled_p ())
214     dump_printf_loc (MSG_NOTE, vect_location,
215 		     "need run-time check that %T is nonzero\n",
216 		     value);
217   LOOP_VINFO_CHECK_NONZERO (loop_vinfo).safe_push (value);
218 }
219 
220 /* Return true if we know that the order of vectorized DR_INFO_A and
221    vectorized DR_INFO_B will be the same as the order of DR_INFO_A and
222    DR_INFO_B.  At least one of the accesses is a write.  */
223 
224 static bool
225 vect_preserves_scalar_order_p (dr_vec_info *dr_info_a, dr_vec_info *dr_info_b)
226 {
227   stmt_vec_info stmtinfo_a = dr_info_a->stmt;
228   stmt_vec_info stmtinfo_b = dr_info_b->stmt;
229 
230   /* Single statements are always kept in their original order.  */
231   if (!STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
232       && !STMT_VINFO_GROUPED_ACCESS (stmtinfo_b))
233     return true;
234 
235   /* STMT_A and STMT_B belong to overlapping groups.  All loads in a
236      SLP group are emitted at the position of the last scalar load and
237      all loads in an interleaving group are emitted at the position
238      of the first scalar load.
239      Stores in a group are emitted at the position of the last scalar store.
240      Compute that position and check whether the resulting order matches
241      the current one.
242      We have not yet decided between SLP and interleaving so we have
243      to conservatively assume both.  */
244   stmt_vec_info il_a;
245   stmt_vec_info last_a = il_a = DR_GROUP_FIRST_ELEMENT (stmtinfo_a);
246   if (last_a)
247     {
248       for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (last_a); s;
249 	   s = DR_GROUP_NEXT_ELEMENT (s))
250 	last_a = get_later_stmt (last_a, s);
251       if (!DR_IS_WRITE (STMT_VINFO_DATA_REF (stmtinfo_a)))
252 	{
253 	  for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_a); s;
254 	       s = DR_GROUP_NEXT_ELEMENT (s))
255 	    if (get_later_stmt (il_a, s) == il_a)
256 	      il_a = s;
257 	}
258       else
259 	il_a = last_a;
260     }
261   else
262     last_a = il_a = stmtinfo_a;
263   stmt_vec_info il_b;
264   stmt_vec_info last_b = il_b = DR_GROUP_FIRST_ELEMENT (stmtinfo_b);
265   if (last_b)
266     {
267       for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (last_b); s;
268 	   s = DR_GROUP_NEXT_ELEMENT (s))
269 	last_b = get_later_stmt (last_b, s);
270       if (!DR_IS_WRITE (STMT_VINFO_DATA_REF (stmtinfo_b)))
271 	{
272 	  for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_b); s;
273 	       s = DR_GROUP_NEXT_ELEMENT (s))
274 	    if (get_later_stmt (il_b, s) == il_b)
275 	      il_b = s;
276 	}
277       else
278 	il_b = last_b;
279     }
280   else
281     last_b = il_b = stmtinfo_b;
282   bool a_after_b = (get_later_stmt (stmtinfo_a, stmtinfo_b) == stmtinfo_a);
283   return (/* SLP */
284 	  (get_later_stmt (last_a, last_b) == last_a) == a_after_b
285 	  /* Interleaving */
286 	  && (get_later_stmt (il_a, il_b) == il_a) == a_after_b
287 	  /* Mixed */
288 	  && (get_later_stmt (il_a, last_b) == il_a) == a_after_b
289 	  && (get_later_stmt (last_a, il_b) == last_a) == a_after_b);
290 }
291 
292 /* A subroutine of vect_analyze_data_ref_dependence.  Handle
293    DDR_COULD_BE_INDEPENDENT_P ddr DDR that has a known set of dependence
294    distances.  These distances are conservatively correct but they don't
295    reflect a guaranteed dependence.
296 
297    Return true if this function does all the work necessary to avoid
298    an alias or false if the caller should use the dependence distances
299    to limit the vectorization factor in the usual way.  LOOP_DEPTH is
300    the depth of the loop described by LOOP_VINFO and the other arguments
301    are as for vect_analyze_data_ref_dependence.  */
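/* For example (hypothetical):

	void f (int *p, int *q, int n)
	{
	  for (int i = 0; i < n; i++)
	    p[i] = q[i + 4] + 1;
	}

   If P and Q point into the same array the dependence distance is 4,
   but if they point to different objects the accesses are independent.
   Rather than limiting the VF to 4, a run-time alias check between P
   and Q is preferred when it is available.  */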
302 
303 static bool
304 vect_analyze_possibly_independent_ddr (data_dependence_relation *ddr,
305 				       loop_vec_info loop_vinfo,
306 				       int loop_depth, unsigned int *max_vf)
307 {
308   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
309   lambda_vector dist_v;
310   unsigned int i;
311   FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
312     {
313       int dist = dist_v[loop_depth];
314       if (dist != 0 && !(dist > 0 && DDR_REVERSED_P (ddr)))
315 	{
316 	  /* If the user asserted safelen >= DIST consecutive iterations
317 	     can be executed concurrently, assume independence.
318 
319 	     ??? An alternative would be to add the alias check even
320 	     in this case, and vectorize the fallback loop with the
321 	     maximum VF set to safelen.  However, if the user has
322 	     explicitly given a length, it's less likely that that
323 	     would be a win.  */
324 	  if (loop->safelen >= 2 && abs_hwi (dist) <= loop->safelen)
325 	    {
326 	      if ((unsigned int) loop->safelen < *max_vf)
327 		*max_vf = loop->safelen;
328 	      LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
329 	      continue;
330 	    }
331 
332 	  /* For dependence distances of 2 or more, we have the option
333 	     of limiting VF or checking for an alias at runtime.
334 	     Prefer to check at runtime if we can, to avoid limiting
335 	     the VF unnecessarily when the bases are in fact independent.
336 
337 	     Note that the alias checks will be removed if the VF ends up
338 	     being small enough.  */
339 	  dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (DDR_A (ddr));
340 	  dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (DDR_B (ddr));
341 	  return (!STMT_VINFO_GATHER_SCATTER_P (dr_info_a->stmt)
342 		  && !STMT_VINFO_GATHER_SCATTER_P (dr_info_b->stmt)
343 		  && vect_mark_for_runtime_alias_test (ddr, loop_vinfo));
344 	}
345     }
346   return true;
347 }
348 
349 
350 /* Function vect_analyze_data_ref_dependence.
351 
352    FIXME: I needed to change the sense of the returned flag.
353 
354    Return FALSE if there (might) exist a dependence between a memory-reference
355    DRA and a memory-reference DRB.  When versioning for alias may check a
356    dependence at run-time, return TRUE.  Adjust *MAX_VF according to
357    the data dependence.  */
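/* For example (hypothetical):

	for (i = 0; i < n; i++)
	  a[i + 3] = a[i] + 1;

   The store to a[i + 3] is read back three iterations later, giving a
   dependence distance of 3.  Executing more than three iterations at
   once would break it, so *MAX_VF is reduced to 3 below.  */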
358 
359 static opt_result
360 vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
361 				  loop_vec_info loop_vinfo,
362 				  unsigned int *max_vf)
363 {
364   unsigned int i;
365   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
366   struct data_reference *dra = DDR_A (ddr);
367   struct data_reference *drb = DDR_B (ddr);
368   dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (dra);
369   dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (drb);
370   stmt_vec_info stmtinfo_a = dr_info_a->stmt;
371   stmt_vec_info stmtinfo_b = dr_info_b->stmt;
372   lambda_vector dist_v;
373   unsigned int loop_depth;
374 
375   /* In loop analysis all data references should be vectorizable.  */
376   if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
377       || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
378     gcc_unreachable ();
379 
380   /* Independent data accesses.  */
381   if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
382     return opt_result::success ();
383 
384   if (dra == drb
385       || (DR_IS_READ (dra) && DR_IS_READ (drb)))
386     return opt_result::success ();
387 
388   /* We do not have to consider dependences between accesses that belong
389      to the same group, unless the stride could be smaller than the
390      group size.  */
391   if (DR_GROUP_FIRST_ELEMENT (stmtinfo_a)
392       && (DR_GROUP_FIRST_ELEMENT (stmtinfo_a)
393 	  == DR_GROUP_FIRST_ELEMENT (stmtinfo_b))
394       && !STMT_VINFO_STRIDED_P (stmtinfo_a))
395     return opt_result::success ();
396 
397   /* Even if we have an anti-dependence then, as the vectorized loop covers at
398      least two scalar iterations, there is always also a true dependence.
399      As the vectorizer does not re-order loads and stores we can ignore
400      the anti-dependence if TBAA can disambiguate both DRs similar to the
401      case with known negative distance anti-dependences (positive
402      distance anti-dependences would violate TBAA constraints).  */
403   if (((DR_IS_READ (dra) && DR_IS_WRITE (drb))
404        || (DR_IS_WRITE (dra) && DR_IS_READ (drb)))
405       && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)),
406 				 get_alias_set (DR_REF (drb))))
407     return opt_result::success ();
408 
409   /* Unknown data dependence.  */
410   if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
411     {
412       /* If user asserted safelen consecutive iterations can be
413 	 executed concurrently, assume independence.  */
414       if (loop->safelen >= 2)
415 	{
416 	  if ((unsigned int) loop->safelen < *max_vf)
417 	    *max_vf = loop->safelen;
418 	  LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
419 	  return opt_result::success ();
420 	}
421 
422       if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
423 	  || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
424 	return opt_result::failure_at
425 	  (stmtinfo_a->stmt,
426 	   "versioning for alias not supported for: "
427 	   "can't determine dependence between %T and %T\n",
428 	   DR_REF (dra), DR_REF (drb));
429 
430       if (dump_enabled_p ())
431 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, stmtinfo_a->stmt,
432 			 "versioning for alias required: "
433 			 "can't determine dependence between %T and %T\n",
434 			 DR_REF (dra), DR_REF (drb));
435 
436       /* Add to list of ddrs that need to be tested at run-time.  */
437       return vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
438     }
439 
440   /* Known data dependence.  */
441   if (DDR_NUM_DIST_VECTS (ddr) == 0)
442     {
443       /* If user asserted safelen consecutive iterations can be
444 	 executed concurrently, assume independence.  */
445       if (loop->safelen >= 2)
446 	{
447 	  if ((unsigned int) loop->safelen < *max_vf)
448 	    *max_vf = loop->safelen;
449 	  LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
450 	  return opt_result::success ();
451 	}
452 
453       if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
454 	  || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
455 	return opt_result::failure_at
456 	  (stmtinfo_a->stmt,
457 	   "versioning for alias not supported for: "
458 	   "bad dist vector for %T and %T\n",
459 	   DR_REF (dra), DR_REF (drb));
460 
461       if (dump_enabled_p ())
462 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, stmtinfo_a->stmt,
463 			 "versioning for alias required: "
464 			 "bad dist vector for %T and %T\n",
465 			 DR_REF (dra), DR_REF (drb));
466       /* Add to list of ddrs that need to be tested at run-time.  */
467       return vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
468     }
469 
470   loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
471 
472   if (DDR_COULD_BE_INDEPENDENT_P (ddr)
473       && vect_analyze_possibly_independent_ddr (ddr, loop_vinfo,
474 						loop_depth, max_vf))
475     return opt_result::success ();
476 
477   FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
478     {
479       int dist = dist_v[loop_depth];
480 
481       if (dump_enabled_p ())
482 	dump_printf_loc (MSG_NOTE, vect_location,
483                          "dependence distance  = %d.\n", dist);
484 
485       if (dist == 0)
486 	{
487 	  if (dump_enabled_p ())
488 	    dump_printf_loc (MSG_NOTE, vect_location,
489 			     "dependence distance == 0 between %T and %T\n",
490 			     DR_REF (dra), DR_REF (drb));
491 
492 	  /* When we perform grouped accesses and perform implicit CSE
493 	     by detecting equal accesses and doing disambiguation with
494 	     runtime alias tests like for
495 	        .. = a[i];
496 		.. = a[i+1];
497 		a[i] = ..;
498 		a[i+1] = ..;
499 		*p = ..;
500 		.. = a[i];
501 		.. = a[i+1];
502 	     where we will end up loading { a[i], a[i+1] } once, make
503 	     sure that inserting group loads before the first load and
504 	     stores after the last store will do the right thing.
505 	     Similar for groups like
506 	        a[i] = ...;
507 		... = a[i];
508 		a[i+1] = ...;
509 	     where loads from the group interleave with the store.  */
510 	  if (!vect_preserves_scalar_order_p (dr_info_a, dr_info_b))
511 	    return opt_result::failure_at (stmtinfo_a->stmt,
512 					   "READ_WRITE dependence"
513 					   " in interleaving.\n");
514 
515 	  if (loop->safelen < 2)
516 	    {
517 	      tree indicator = dr_zero_step_indicator (dra);
518 	      if (!indicator || integer_zerop (indicator))
519 		return opt_result::failure_at (stmtinfo_a->stmt,
520 					       "access also has a zero step\n");
521 	      else if (TREE_CODE (indicator) != INTEGER_CST)
522 		vect_check_nonzero_value (loop_vinfo, indicator);
523 	    }
524 	  continue;
525 	}
526 
527       if (dist > 0 && DDR_REVERSED_P (ddr))
528 	{
529 	  /* If DDR_REVERSED_P the order of the data-refs in DDR was
530 	     reversed (to make distance vector positive), and the actual
531 	     distance is negative.  */
532 	  if (dump_enabled_p ())
533 	    dump_printf_loc (MSG_NOTE, vect_location,
534 	                     "dependence distance negative.\n");
535 	  /* When doing outer loop vectorization, we need to check if there is
536 	     a backward dependence at the inner loop level if the dependence
537 	     at the outer loop is reversed.  See PR81740.  */
538 	  if (nested_in_vect_loop_p (loop, stmtinfo_a)
539 	      || nested_in_vect_loop_p (loop, stmtinfo_b))
540 	    {
541 	      unsigned inner_depth = index_in_loop_nest (loop->inner->num,
542 							 DDR_LOOP_NEST (ddr));
543 	      if (dist_v[inner_depth] < 0)
544 		return opt_result::failure_at (stmtinfo_a->stmt,
545 					       "not vectorized, dependence "
546 					       "between data-refs %T and %T\n",
547 					       DR_REF (dra), DR_REF (drb));
548 	    }
549 	  /* Record a negative dependence distance to later limit the
550 	     amount of stmt copying / unrolling we can perform.
551 	     Only need to handle read-after-write dependence.  */
552 	  if (DR_IS_READ (drb)
553 	      && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0
554 		  || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist))
555 	    STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist;
556 	  continue;
557 	}
558 
559       unsigned int abs_dist = abs (dist);
560       if (abs_dist >= 2 && abs_dist < *max_vf)
561 	{
562 	  /* The dependence distance requires reduction of the maximal
563 	     vectorization factor.  */
564 	  *max_vf = abs_dist;
565 	  if (dump_enabled_p ())
566 	    dump_printf_loc (MSG_NOTE, vect_location,
567 	                     "adjusting maximal vectorization factor to %i\n",
568 	                     *max_vf);
569 	}
570 
571       if (abs_dist >= *max_vf)
572 	{
573 	  /* Dependence distance does not create dependence, as far as
574 	     vectorization is concerned, in this case.  */
575 	  if (dump_enabled_p ())
576 	    dump_printf_loc (MSG_NOTE, vect_location,
577 	                     "dependence distance >= VF.\n");
578 	  continue;
579 	}
580 
581       return opt_result::failure_at (stmtinfo_a->stmt,
582 				     "not vectorized, possible dependence "
583 				     "between data-refs %T and %T\n",
584 				     DR_REF (dra), DR_REF (drb));
585     }
586 
587   return opt_result::success ();
588 }
589 
590 /* Function vect_analyze_data_ref_dependences.
591 
592    Examine all the data references in the loop, and make sure there do not
593    exist any data dependences between them.  Set *MAX_VF according to
594    the maximum vectorization factor the data dependences allow.  */
595 
596 opt_result
597 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo,
598 				   unsigned int *max_vf)
599 {
600   unsigned int i;
601   struct data_dependence_relation *ddr;
602 
603   DUMP_VECT_SCOPE ("vect_analyze_data_ref_dependences");
604 
605   if (!LOOP_VINFO_DDRS (loop_vinfo).exists ())
606     {
607       LOOP_VINFO_DDRS (loop_vinfo)
608 	.create (LOOP_VINFO_DATAREFS (loop_vinfo).length ()
609 		 * LOOP_VINFO_DATAREFS (loop_vinfo).length ());
610       /* We need read-read dependences to compute
611 	 STMT_VINFO_SAME_ALIGN_REFS.  */
612       bool res = compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
613 					  &LOOP_VINFO_DDRS (loop_vinfo),
614 					  LOOP_VINFO_LOOP_NEST (loop_vinfo),
615 					  true);
616       gcc_assert (res);
617     }
618 
619   LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true;
620 
621   /* For epilogues we either have no aliases or alias versioning
622      was applied to original loop.  Therefore we may just get max_vf
623      using VF of original loop.  */
624   if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
625     *max_vf = LOOP_VINFO_ORIG_MAX_VECT_FACTOR (loop_vinfo);
626   else
627     FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
628       {
629 	opt_result res
630 	  = vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf);
631 	if (!res)
632 	  return res;
633       }
634 
635   return opt_result::success ();
636 }
637 
638 
639 /* Function vect_slp_analyze_data_ref_dependence.
640 
641    Return TRUE if there (might) exist a dependence between a memory-reference
642    DRA and a memory-reference DRB for VINFO that could prevent vectorization.
643    Return FALSE if the accesses are provably independent or the dependence
644    can be ignored.  */
645 
646 static bool
647 vect_slp_analyze_data_ref_dependence (vec_info *vinfo,
648 				      struct data_dependence_relation *ddr)
649 {
650   struct data_reference *dra = DDR_A (ddr);
651   struct data_reference *drb = DDR_B (ddr);
652   dr_vec_info *dr_info_a = vinfo->lookup_dr (dra);
653   dr_vec_info *dr_info_b = vinfo->lookup_dr (drb);
654 
655   /* We need to check dependences of statements marked as unvectorizable
656      as well, they still can prohibit vectorization.  */
657 
658   /* Independent data accesses.  */
659   if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
660     return false;
661 
662   if (dra == drb)
663     return false;
664 
665   /* Read-read is OK.  */
666   if (DR_IS_READ (dra) && DR_IS_READ (drb))
667     return false;
668 
669   /* If dra and drb are part of the same interleaving chain consider
670      them independent.  */
671   if (STMT_VINFO_GROUPED_ACCESS (dr_info_a->stmt)
672       && (DR_GROUP_FIRST_ELEMENT (dr_info_a->stmt)
673 	  == DR_GROUP_FIRST_ELEMENT (dr_info_b->stmt)))
674     return false;
675 
676   /* Unknown data dependence.  */
677   if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
678     {
679       if  (dump_enabled_p ())
680 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
681 			 "can't determine dependence between %T and %T\n",
682 			 DR_REF (dra), DR_REF (drb));
683     }
684   else if (dump_enabled_p ())
685     dump_printf_loc (MSG_NOTE, vect_location,
686 		     "determined dependence between %T and %T\n",
687 		     DR_REF (dra), DR_REF (drb));
688 
689   return true;
690 }
691 
692 
693 /* Analyze dependences involved in the transform of SLP NODE.  STORES
694    contain the vector of scalar stores of this instance if we are
695    disambiguating the loads.  */
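/* For example (hypothetical): for an SLP group of two stores

	a[0] = x;
	t = b[0];
	a[1] = y;

   the vector store of { x, y } is emitted at the position of the last
   scalar store, which effectively sinks 'a[0] = x' past the load of
   b[0].  That is only valid if b[0] cannot overlap a[0], which is what
   the walk below verifies for every statement in between.  */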
696 
697 static bool
698 vect_slp_analyze_node_dependences (slp_instance instance, slp_tree node,
699 				   vec<stmt_vec_info> stores,
700 				   stmt_vec_info last_store_info)
701 {
702   /* This walks over all stmts involved in the SLP load/store done
703      in NODE verifying we can sink them up to the last stmt in the
704      group.  */
705   stmt_vec_info last_access_info = vect_find_last_scalar_stmt_in_slp (node);
706   vec_info *vinfo = last_access_info->vinfo;
707   for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
708     {
709       stmt_vec_info access_info = SLP_TREE_SCALAR_STMTS (node)[k];
710       if (access_info == last_access_info)
711 	continue;
712       data_reference *dr_a = STMT_VINFO_DATA_REF (access_info);
713       ao_ref ref;
714       bool ref_initialized_p = false;
715       for (gimple_stmt_iterator gsi = gsi_for_stmt (access_info->stmt);
716 	   gsi_stmt (gsi) != last_access_info->stmt; gsi_next (&gsi))
717 	{
718 	  gimple *stmt = gsi_stmt (gsi);
719 	  if (! gimple_vuse (stmt)
720 	      || (DR_IS_READ (dr_a) && ! gimple_vdef (stmt)))
721 	    continue;
722 
723 	  /* If we couldn't record a (single) data reference for this
724 	     stmt we have to resort to the alias oracle.  */
725 	  stmt_vec_info stmt_info = vinfo->lookup_stmt (stmt);
726 	  data_reference *dr_b = STMT_VINFO_DATA_REF (stmt_info);
727 	  if (!dr_b)
728 	    {
729 	      /* We are moving a store or sinking a load - this means
730 	         we cannot use TBAA for disambiguation.  */
731 	      if (!ref_initialized_p)
732 		ao_ref_init (&ref, DR_REF (dr_a));
733 	      if (stmt_may_clobber_ref_p_1 (stmt, &ref, false)
734 		  || ref_maybe_used_by_stmt_p (stmt, &ref, false))
735 		return false;
736 	      continue;
737 	    }
738 
739 	  bool dependent = false;
740 	  /* If we run into a store of this same instance (we've just
741 	     marked those) then delay dependence checking until we run
742 	     into the last store because this is where it will have
743 	     been sunk to (and we verify if we can do that as well).  */
744 	  if (gimple_visited_p (stmt))
745 	    {
746 	      if (stmt_info != last_store_info)
747 		continue;
748 	      unsigned i;
749 	      stmt_vec_info store_info;
750 	      FOR_EACH_VEC_ELT (stores, i, store_info)
751 		{
752 		  data_reference *store_dr = STMT_VINFO_DATA_REF (store_info);
753 		  ddr_p ddr = initialize_data_dependence_relation
754 				(dr_a, store_dr, vNULL);
755 		  dependent
756 		    = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
757 		  free_dependence_relation (ddr);
758 		  if (dependent)
759 		    break;
760 		}
761 	    }
762 	  else
763 	    {
764 	      ddr_p ddr = initialize_data_dependence_relation (dr_a,
765 							       dr_b, vNULL);
766 	      dependent = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
767 	      free_dependence_relation (ddr);
768 	    }
769 	  if (dependent)
770 	    return false;
771 	}
772     }
773   return true;
774 }
775 
776 
777 /* Function vect_slp_analyze_instance_dependence.
778 
779    Examine all the data references in the SLP instance, and make sure there
780    do not exist any data dependences that would prevent the scalar loads and
781    stores from being vectorized at their insert locations.  */
782 
783 bool
784 vect_slp_analyze_instance_dependence (slp_instance instance)
785 {
786   DUMP_VECT_SCOPE ("vect_slp_analyze_instance_dependence");
787 
788   /* The stores of this instance are at the root of the SLP tree.  */
789   slp_tree store = SLP_INSTANCE_TREE (instance);
790   if (! STMT_VINFO_DATA_REF (SLP_TREE_SCALAR_STMTS (store)[0]))
791     store = NULL;
792 
793   /* Verify we can sink stores to the vectorized stmt insert location.  */
794   stmt_vec_info last_store_info = NULL;
795   if (store)
796     {
797       if (! vect_slp_analyze_node_dependences (instance, store, vNULL, NULL))
798 	return false;
799 
800       /* Mark stores in this instance and remember the last one.  */
801       last_store_info = vect_find_last_scalar_stmt_in_slp (store);
802       for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
803 	gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k]->stmt, true);
804     }
805 
806   bool res = true;
807 
808   /* Verify we can sink loads to the vectorized stmt insert location,
809      special-casing stores of this instance.  */
810   slp_tree load;
811   unsigned int i;
812   FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load)
813     if (! vect_slp_analyze_node_dependences (instance, load,
814 					     store
815 					     ? SLP_TREE_SCALAR_STMTS (store)
816 					     : vNULL, last_store_info))
817       {
818 	res = false;
819 	break;
820       }
821 
822   /* Unset the visited flag.  */
823   if (store)
824     for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
825       gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k]->stmt, false);
826 
827   return res;
828 }
829 
830 /* Record the base alignment guarantee given by DRB, which occurs
831    in STMT_INFO.  */
832 
833 static void
834 vect_record_base_alignment (stmt_vec_info stmt_info,
835 			    innermost_loop_behavior *drb)
836 {
837   vec_info *vinfo = stmt_info->vinfo;
838   bool existed;
839   innermost_loop_behavior *&entry
840     = vinfo->base_alignments.get_or_insert (drb->base_address, &existed);
841   if (!existed || entry->base_alignment < drb->base_alignment)
842     {
843       entry = drb;
844       if (dump_enabled_p ())
845 	dump_printf_loc (MSG_NOTE, vect_location,
846 			 "recording new base alignment for %T\n"
847 			 "  alignment:    %d\n"
848 			 "  misalignment: %d\n"
849 			 "  based on:     %G",
850 			 drb->base_address,
851 			 drb->base_alignment,
852 			 drb->base_misalignment,
853 			 stmt_info->stmt);
854     }
855 }
856 
857 /* If the region we're going to vectorize is reached, all unconditional
858    data references occur at least once.  We can therefore pool the base
859    alignment guarantees from each unconditional reference.  Do this by
860    going through all the data references in VINFO and checking whether
861    the containing statement makes the reference unconditionally.  If so,
862    record the alignment of the base address in VINFO so that it can be
863    used for all other references with the same base.  */
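/* For example (hypothetical): in

	... = a[i];		<- executes unconditionally
	if (c)
	  ... = a[i + 1];	<- conditional

   the first access is guaranteed to execute whenever the region is
   reached, so the alignment it establishes for A's base address can
   also be used for the conditional access to the same base.  */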
864 
865 void
866 vect_record_base_alignments (vec_info *vinfo)
867 {
868   loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
869   class loop *loop = loop_vinfo ? LOOP_VINFO_LOOP (loop_vinfo) : NULL;
870   data_reference *dr;
871   unsigned int i;
872   FOR_EACH_VEC_ELT (vinfo->shared->datarefs, i, dr)
873     {
874       dr_vec_info *dr_info = vinfo->lookup_dr (dr);
875       stmt_vec_info stmt_info = dr_info->stmt;
876       if (!DR_IS_CONDITIONAL_IN_STMT (dr)
877 	  && STMT_VINFO_VECTORIZABLE (stmt_info)
878 	  && !STMT_VINFO_GATHER_SCATTER_P (stmt_info))
879 	{
880 	  vect_record_base_alignment (stmt_info, &DR_INNERMOST (dr));
881 
882 	  /* If DR is nested in the loop that is being vectorized, we can also
883 	     record the alignment of the base wrt the outer loop.  */
884 	  if (loop && nested_in_vect_loop_p (loop, stmt_info))
885 	    vect_record_base_alignment
886 	      (stmt_info, &STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info));
887 	}
888     }
889 }
890 
891 /* Return the target alignment for the vectorized form of DR_INFO.  */
892 
893 static poly_uint64
894 vect_calculate_target_alignment (dr_vec_info *dr_info)
895 {
896   tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
897   return targetm.vectorize.preferred_vector_alignment (vectype);
898 }
899 
900 /* Function vect_compute_data_ref_alignment
901 
902    Compute the misalignment of the data reference DR_INFO.
903 
904    Output:
905    1. DR_MISALIGNMENT (DR_INFO) is defined.
906 
907    FOR NOW: No analysis is actually performed. Misalignment is calculated
908    only for trivial cases. TODO.  */
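/* For example (hypothetical), assuming the target prefers 16-byte vector
   alignment:

	float *p = __builtin_assume_aligned (q, 16);
	for (i = 0; i < n; i++)
	  p[i + 1] = 0.0f;

   The base is 16-byte aligned but the access starts 4 bytes past it
   (DR_INIT is 4), so the misalignment computed below is 4.  */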
909 
910 static void
911 vect_compute_data_ref_alignment (dr_vec_info *dr_info)
912 {
913   stmt_vec_info stmt_info = dr_info->stmt;
914   vec_base_alignments *base_alignments = &stmt_info->vinfo->base_alignments;
915   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
916   class loop *loop = NULL;
917   tree ref = DR_REF (dr_info->dr);
918   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
919 
920   if (dump_enabled_p ())
921     dump_printf_loc (MSG_NOTE, vect_location,
922                      "vect_compute_data_ref_alignment:\n");
923 
924   if (loop_vinfo)
925     loop = LOOP_VINFO_LOOP (loop_vinfo);
926 
927   /* Initialize misalignment to unknown.  */
928   SET_DR_MISALIGNMENT (dr_info, DR_MISALIGNMENT_UNKNOWN);
929 
930   if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
931     return;
932 
933   innermost_loop_behavior *drb = vect_dr_behavior (dr_info);
934   bool step_preserves_misalignment_p;
935 
936   poly_uint64 vector_alignment
937     = exact_div (vect_calculate_target_alignment (dr_info), BITS_PER_UNIT);
938   DR_TARGET_ALIGNMENT (dr_info) = vector_alignment;
939 
940   /* If the main loop has peeled for alignment we have no way of knowing
941      whether the data accesses in the epilogues are aligned.  We can't at
942      compile time answer the question whether we have entered the main loop or
943      not.  Fixes PR 92351.  */
944   if (loop_vinfo)
945     {
946       loop_vec_info orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
947       if (orig_loop_vinfo
948 	  && LOOP_VINFO_PEELING_FOR_ALIGNMENT (orig_loop_vinfo) != 0)
949 	return;
950     }
951 
952   unsigned HOST_WIDE_INT vect_align_c;
953   if (!vector_alignment.is_constant (&vect_align_c))
954     return;
955 
956   /* No step for BB vectorization.  */
957   if (!loop)
958     {
959       gcc_assert (integer_zerop (drb->step));
960       step_preserves_misalignment_p = true;
961     }
962 
963   /* In case the dataref is in an inner-loop of the loop that is being
964      vectorized (LOOP), we use the base and misalignment information
965      relative to the outer-loop (LOOP).  This is ok only if the misalignment
966      stays the same throughout the execution of the inner-loop, which is why
967      we have to check that the stride of the dataref in the inner-loop evenly
968      divides by the vector alignment.  */
969   else if (nested_in_vect_loop_p (loop, stmt_info))
970     {
971       step_preserves_misalignment_p
972 	= (DR_STEP_ALIGNMENT (dr_info->dr) % vect_align_c) == 0;
973 
974       if (dump_enabled_p ())
975 	{
976 	  if (step_preserves_misalignment_p)
977 	    dump_printf_loc (MSG_NOTE, vect_location,
978 			     "inner step divides the vector alignment.\n");
979 	  else
980 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
981 			     "inner step doesn't divide the vector"
982 			     " alignment.\n");
983 	}
984     }
985 
986   /* Similarly we can only use base and misalignment information relative to
987      an innermost loop if the misalignment stays the same throughout the
988      execution of the loop.  As above, this is the case if the stride of
989      the dataref evenly divides by the alignment.  */
990   else
991     {
992       poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
993       step_preserves_misalignment_p
994 	= multiple_p (DR_STEP_ALIGNMENT (dr_info->dr) * vf, vect_align_c);
995 
996       if (!step_preserves_misalignment_p && dump_enabled_p ())
997 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
998 			 "step doesn't divide the vector alignment.\n");
999     }
1000 
1001   unsigned int base_alignment = drb->base_alignment;
1002   unsigned int base_misalignment = drb->base_misalignment;
1003 
1004   /* Calculate the maximum of the pooled base address alignment and the
1005      alignment that we can compute for DR itself.  */
1006   innermost_loop_behavior **entry = base_alignments->get (drb->base_address);
1007   if (entry && base_alignment < (*entry)->base_alignment)
1008     {
1009       base_alignment = (*entry)->base_alignment;
1010       base_misalignment = (*entry)->base_misalignment;
1011     }
1012 
1013   if (drb->offset_alignment < vect_align_c
1014       || !step_preserves_misalignment_p
1015       /* We need to know whether the step wrt the vectorized loop is
1016 	 negative when computing the starting misalignment below.  */
1017       || TREE_CODE (drb->step) != INTEGER_CST)
1018     {
1019       if (dump_enabled_p ())
1020 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1021 			 "Unknown alignment for access: %T\n", ref);
1022       return;
1023     }
1024 
1025   if (base_alignment < vect_align_c)
1026     {
1027       unsigned int max_alignment;
1028       tree base = get_base_for_alignment (drb->base_address, &max_alignment);
1029       if (max_alignment < vect_align_c
1030 	  || !vect_can_force_dr_alignment_p (base,
1031 					     vect_align_c * BITS_PER_UNIT))
1032 	{
1033 	  if (dump_enabled_p ())
1034 	    dump_printf_loc (MSG_NOTE, vect_location,
1035 			     "can't force alignment of ref: %T\n", ref);
1036 	  return;
1037 	}
1038 
1039       /* Force the alignment of the decl.
1040 	 NOTE: This is the only change to the code we make during
1041 	 the analysis phase, before deciding to vectorize the loop.  */
1042       if (dump_enabled_p ())
1043 	dump_printf_loc (MSG_NOTE, vect_location,
1044 			 "force alignment of %T\n", ref);
1045 
1046       dr_info->base_decl = base;
1047       dr_info->base_misaligned = true;
1048       base_misalignment = 0;
1049     }
1050   poly_int64 misalignment
1051     = base_misalignment + wi::to_poly_offset (drb->init).force_shwi ();
1052 
1053   /* If this is a backward running DR then first access in the larger
1054      vectype actually is N-1 elements before the address in the DR.
1055      Adjust misalign accordingly.  */
1056   if (tree_int_cst_sgn (drb->step) < 0)
1057     /* PLUS because STEP is negative.  */
1058     misalignment += ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
1059 		     * -TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype))));
1060 
1061   unsigned int const_misalignment;
1062   if (!known_misalignment (misalignment, vect_align_c, &const_misalignment))
1063     {
1064       if (dump_enabled_p ())
1065 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1066 			 "Non-constant misalignment for access: %T\n", ref);
1067       return;
1068     }
1069 
1070   SET_DR_MISALIGNMENT (dr_info, const_misalignment);
1071 
1072   if (dump_enabled_p ())
1073     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1074 		     "misalign = %d bytes of ref %T\n",
1075 		     DR_MISALIGNMENT (dr_info), ref);
1076 
1077   return;
1078 }
1079 
1080 /* Function vect_update_misalignment_for_peel.
1081    Sets DR_INFO's misalignment
1082    - to 0 if it has the same alignment as DR_PEEL_INFO,
1083    - to the misalignment computed using NPEEL if DR_INFO's misalignment is known,
1084    - to -1 (unknown) otherwise.
1085 
1086    DR_INFO - the data reference whose misalignment is to be adjusted.
1087    DR_PEEL_INFO - the data reference whose misalignment is being made
1088 		  zero in the vector loop by the peel.
1089    NPEEL - the number of iterations in the peel loop if the misalignment
1090            of DR_PEEL_INFO is known at compile time.  */
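/* For example (hypothetical): with a known misalignment of 8 bytes, a
   step of 4 bytes, a 16-byte target alignment and NPEEL == 2, the peeled
   iterations advance the access by 2 * 4 bytes, so the new misalignment
   is (8 + 2 * 4) & (16 - 1) == 0.  */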
1091 
1092 static void
1093 vect_update_misalignment_for_peel (dr_vec_info *dr_info,
1094 				   dr_vec_info *dr_peel_info, int npeel)
1095 {
1096   unsigned int i;
1097   vec<dr_p> same_aligned_drs;
1098   struct data_reference *current_dr;
1099   stmt_vec_info peel_stmt_info = dr_peel_info->stmt;
1100 
1101   /* It can be assumed that if dr_info has the same alignment as dr_peel,
1102      it is aligned in the vector loop.  */
1103   same_aligned_drs = STMT_VINFO_SAME_ALIGN_REFS (peel_stmt_info);
1104   FOR_EACH_VEC_ELT (same_aligned_drs, i, current_dr)
1105     {
1106       if (current_dr != dr_info->dr)
1107         continue;
1108       gcc_assert (!known_alignment_for_access_p (dr_info)
1109 		  || !known_alignment_for_access_p (dr_peel_info)
1110 		  || (DR_MISALIGNMENT (dr_info)
1111 		      == DR_MISALIGNMENT (dr_peel_info)));
1112       SET_DR_MISALIGNMENT (dr_info, 0);
1113       return;
1114     }
1115 
1116   unsigned HOST_WIDE_INT alignment;
1117   if (DR_TARGET_ALIGNMENT (dr_info).is_constant (&alignment)
1118       && known_alignment_for_access_p (dr_info)
1119       && known_alignment_for_access_p (dr_peel_info))
1120     {
1121       int misal = DR_MISALIGNMENT (dr_info);
1122       misal += npeel * TREE_INT_CST_LOW (DR_STEP (dr_info->dr));
1123       misal &= alignment - 1;
1124       SET_DR_MISALIGNMENT (dr_info, misal);
1125       return;
1126     }
1127 
1128   if (dump_enabled_p ())
1129     dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment " \
1130 		     "to unknown (-1).\n");
1131   SET_DR_MISALIGNMENT (dr_info, DR_MISALIGNMENT_UNKNOWN);
1132 }
1133 
1134 
1135 /* Function verify_data_ref_alignment
1136 
1137    Return TRUE if DR_INFO can be handled with respect to alignment.  */
1138 
1139 static opt_result
1140 verify_data_ref_alignment (dr_vec_info *dr_info)
1141 {
1142   enum dr_alignment_support supportable_dr_alignment
1143     = vect_supportable_dr_alignment (dr_info, false);
1144   if (!supportable_dr_alignment)
1145     return opt_result::failure_at
1146       (dr_info->stmt->stmt,
1147        DR_IS_READ (dr_info->dr)
1148 	? "not vectorized: unsupported unaligned load: %T\n"
1149 	: "not vectorized: unsupported unaligned store: %T\n",
1150        DR_REF (dr_info->dr));
1151 
1152   if (supportable_dr_alignment != dr_aligned && dump_enabled_p ())
1153     dump_printf_loc (MSG_NOTE, vect_location,
1154 		     "Vectorizing an unaligned access.\n");
1155 
1156   return opt_result::success ();
1157 }
1158 
1159 /* Function vect_verify_datarefs_alignment
1160 
1161    Return TRUE if all data references in the loop can be
1162    handled with respect to alignment.  */
1163 
1164 opt_result
1165 vect_verify_datarefs_alignment (loop_vec_info vinfo)
1166 {
1167   vec<data_reference_p> datarefs = vinfo->shared->datarefs;
1168   struct data_reference *dr;
1169   unsigned int i;
1170 
1171   FOR_EACH_VEC_ELT (datarefs, i, dr)
1172     {
1173       dr_vec_info *dr_info = vinfo->lookup_dr (dr);
1174       stmt_vec_info stmt_info = dr_info->stmt;
1175 
1176       if (!STMT_VINFO_RELEVANT_P (stmt_info))
1177 	continue;
1178 
1179       /* For interleaving, only the alignment of the first access matters.   */
1180       if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1181 	  && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info)
1182 	continue;
1183 
1184       /* Strided accesses perform only component accesses, alignment is
1185 	 irrelevant for them.  */
1186       if (STMT_VINFO_STRIDED_P (stmt_info)
1187 	  && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1188 	continue;
1189 
1190       opt_result res = verify_data_ref_alignment (dr_info);
1191       if (!res)
1192 	return res;
1193     }
1194 
1195   return opt_result::success ();
1196 }
1197 
1198 /* Given a memory reference EXP, return whether its alignment is less
1199    than its size.  */
1200 
1201 static bool
1202 not_size_aligned (tree exp)
1203 {
1204   if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp))))
1205     return true;
1206 
1207   return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp)))
1208 	  > get_object_alignment (exp));
1209 }
1210 
1211 /* Function vector_alignment_reachable_p
1212 
1213    Return true if vector alignment for DR_INFO is reachable by peeling
1214    a few loop iterations.  Return false otherwise.  */
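/* For example (hypothetical): a contiguous access with 4-byte elements,
   a known misalignment of 8 bytes and a 16-byte target alignment becomes
   aligned after peeling (16 - 8) / 4 == 2 scalar iterations.  A
   misalignment of 6 bytes is not a multiple of the element size, so no
   number of peeled iterations can remove it.  */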
1215 
1216 static bool
1217 vector_alignment_reachable_p (dr_vec_info *dr_info)
1218 {
1219   stmt_vec_info stmt_info = dr_info->stmt;
1220   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1221 
1222   if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1223     {
1224       /* For interleaved access we peel only if number of iterations in
1225 	 the prolog loop ({VF - misalignment}), is a multiple of the
1226 	 number of the interleaved accesses.  */
1227       int elem_size, mis_in_elements;
1228 
1229       /* FORNOW: handle only known alignment.  */
1230       if (!known_alignment_for_access_p (dr_info))
1231 	return false;
1232 
1233       poly_uint64 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1234       poly_uint64 vector_size = GET_MODE_SIZE (TYPE_MODE (vectype));
1235       elem_size = vector_element_size (vector_size, nelements);
1236       mis_in_elements = DR_MISALIGNMENT (dr_info) / elem_size;
1237 
1238       if (!multiple_p (nelements - mis_in_elements, DR_GROUP_SIZE (stmt_info)))
1239 	return false;
1240     }
1241 
1242   /* If misalignment is known at the compile time then allow peeling
1243      only if natural alignment is reachable through peeling.  */
1244   if (known_alignment_for_access_p (dr_info) && !aligned_access_p (dr_info))
1245     {
1246       HOST_WIDE_INT elmsize =
1247 		int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1248       if (dump_enabled_p ())
1249 	{
1250 	  dump_printf_loc (MSG_NOTE, vect_location,
1251 	                   "data size = %wd. misalignment = %d.\n", elmsize,
1252 			   DR_MISALIGNMENT (dr_info));
1253 	}
1254       if (DR_MISALIGNMENT (dr_info) % elmsize)
1255 	{
1256 	  if (dump_enabled_p ())
1257 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1258 	                     "data size does not divide the misalignment.\n");
1259 	  return false;
1260 	}
1261     }
1262 
1263   if (!known_alignment_for_access_p (dr_info))
1264     {
1265       tree type = TREE_TYPE (DR_REF (dr_info->dr));
1266       bool is_packed = not_size_aligned (DR_REF (dr_info->dr));
1267       if (dump_enabled_p ())
1268 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1269 	                 "Unknown misalignment, %snaturally aligned\n",
1270 			 is_packed ? "not " : "");
1271       return targetm.vectorize.vector_alignment_reachable (type, is_packed);
1272     }
1273 
1274   return true;
1275 }
1276 
1277 
1278 /* Calculate the cost of the memory access represented by DR_INFO.  */
1279 
1280 static void
1281 vect_get_data_access_cost (dr_vec_info *dr_info,
1282                            unsigned int *inside_cost,
1283                            unsigned int *outside_cost,
1284 			   stmt_vector_for_cost *body_cost_vec,
1285 			   stmt_vector_for_cost *prologue_cost_vec)
1286 {
1287   stmt_vec_info stmt_info = dr_info->stmt;
1288   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1289   int ncopies;
1290 
1291   if (PURE_SLP_STMT (stmt_info))
1292     ncopies = 1;
1293   else
1294     ncopies = vect_get_num_copies (loop_vinfo, STMT_VINFO_VECTYPE (stmt_info));
1295 
1296   if (DR_IS_READ (dr_info->dr))
1297     vect_get_load_cost (stmt_info, ncopies, true, inside_cost, outside_cost,
1298 			prologue_cost_vec, body_cost_vec, false);
1299   else
1300     vect_get_store_cost (stmt_info, ncopies, inside_cost, body_cost_vec);
1301 
1302   if (dump_enabled_p ())
1303     dump_printf_loc (MSG_NOTE, vect_location,
1304                      "vect_get_data_access_cost: inside_cost = %d, "
1305                      "outside_cost = %d.\n", *inside_cost, *outside_cost);
1306 }
1307 
1308 
1309 typedef struct _vect_peel_info
1310 {
1311   dr_vec_info *dr_info;
1312   int npeel;
1313   unsigned int count;
1314 } *vect_peel_info;
1315 
1316 typedef struct _vect_peel_extended_info
1317 {
1318   struct _vect_peel_info peel_info;
1319   unsigned int inside_cost;
1320   unsigned int outside_cost;
1321 } *vect_peel_extended_info;
1322 
1323 
1324 /* Peeling hashtable helpers.  */
1325 
1326 struct peel_info_hasher : free_ptr_hash <_vect_peel_info>
1327 {
1328   static inline hashval_t hash (const _vect_peel_info *);
1329   static inline bool equal (const _vect_peel_info *, const _vect_peel_info *);
1330 };
1331 
1332 inline hashval_t
1333 peel_info_hasher::hash (const _vect_peel_info *peel_info)
1334 {
1335   return (hashval_t) peel_info->npeel;
1336 }
1337 
1338 inline bool
1339 peel_info_hasher::equal (const _vect_peel_info *a, const _vect_peel_info *b)
1340 {
1341   return (a->npeel == b->npeel);
1342 }
1343 
1344 
1345 /* Insert DR_INFO into peeling hash table with NPEEL as key.  */
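/* For example (hypothetical): if peeling 2 iterations would align three
   of the data references but peeling 5 would align only one, the table
   ends up with the counts { 2 -> 3, 5 -> 1 }; with the unlimited cost
   model the most frequent entry (npeel == 2) is chosen later.  */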
1346 
1347 static void
1348 vect_peeling_hash_insert (hash_table<peel_info_hasher> *peeling_htab,
1349 			  loop_vec_info loop_vinfo, dr_vec_info *dr_info,
1350                           int npeel)
1351 {
1352   struct _vect_peel_info elem, *slot;
1353   _vect_peel_info **new_slot;
1354   bool supportable_dr_alignment
1355     = vect_supportable_dr_alignment (dr_info, true);
1356 
1357   elem.npeel = npeel;
1358   slot = peeling_htab->find (&elem);
1359   if (slot)
1360     slot->count++;
1361   else
1362     {
1363       slot = XNEW (struct _vect_peel_info);
1364       slot->npeel = npeel;
1365       slot->dr_info = dr_info;
1366       slot->count = 1;
1367       new_slot = peeling_htab->find_slot (slot, INSERT);
1368       *new_slot = slot;
1369     }
1370 
1371   if (!supportable_dr_alignment
1372       && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1373     slot->count += VECT_MAX_COST;
1374 }
1375 
1376 
1377 /* Traverse peeling hash table to find peeling option that aligns maximum
1378    number of data accesses.  */
1379 
1380 int
1381 vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
1382 				     _vect_peel_extended_info *max)
1383 {
1384   vect_peel_info elem = *slot;
1385 
1386   if (elem->count > max->peel_info.count
1387       || (elem->count == max->peel_info.count
1388           && max->peel_info.npeel > elem->npeel))
1389     {
1390       max->peel_info.npeel = elem->npeel;
1391       max->peel_info.count = elem->count;
1392       max->peel_info.dr_info = elem->dr_info;
1393     }
1394 
1395   return 1;
1396 }
1397 
1398 /* Get the costs of peeling NPEEL iterations for LOOP_VINFO, checking
1399    data access costs for all data refs.  If UNKNOWN_MISALIGNMENT is true,
1400    we assume DR0_INFO's misalignment will be zero after peeling.  */
1401 
1402 static void
1403 vect_get_peeling_costs_all_drs (loop_vec_info loop_vinfo,
1404 				dr_vec_info *dr0_info,
1405 				unsigned int *inside_cost,
1406 				unsigned int *outside_cost,
1407 				stmt_vector_for_cost *body_cost_vec,
1408 				stmt_vector_for_cost *prologue_cost_vec,
1409 				unsigned int npeel,
1410 				bool unknown_misalignment)
1411 {
1412   vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1413   unsigned i;
1414   data_reference *dr;
1415 
1416   FOR_EACH_VEC_ELT (datarefs, i, dr)
1417     {
1418       dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1419       stmt_vec_info stmt_info = dr_info->stmt;
1420       if (!STMT_VINFO_RELEVANT_P (stmt_info))
1421 	continue;
1422 
1423       /* For interleaving, only the alignment of the first access
1424          matters.  */
1425       if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1426 	  && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info)
1427 	continue;
1428 
1429       /* Strided accesses perform only component accesses, alignment is
1430          irrelevant for them.  */
1431       if (STMT_VINFO_STRIDED_P (stmt_info)
1432 	  && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1433 	continue;
1434 
1435       int save_misalignment;
1436       save_misalignment = DR_MISALIGNMENT (dr_info);
1437       if (npeel == 0)
1438 	;
1439       else if (unknown_misalignment && dr_info == dr0_info)
1440 	SET_DR_MISALIGNMENT (dr_info, 0);
1441       else
1442 	vect_update_misalignment_for_peel (dr_info, dr0_info, npeel);
1443       vect_get_data_access_cost (dr_info, inside_cost, outside_cost,
1444 				 body_cost_vec, prologue_cost_vec);
1445       SET_DR_MISALIGNMENT (dr_info, save_misalignment);
1446     }
1447 }
1448 
1449 /* Traverse peeling hash table and calculate cost for each peeling option.
1450    Find the one with the lowest cost.  */
1451 
1452 int
1453 vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
1454 				   _vect_peel_extended_info *min)
1455 {
1456   vect_peel_info elem = *slot;
1457   int dummy;
1458   unsigned int inside_cost = 0, outside_cost = 0;
1459   stmt_vec_info stmt_info = elem->dr_info->stmt;
1460   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1461   stmt_vector_for_cost prologue_cost_vec, body_cost_vec,
1462 		       epilogue_cost_vec;
1463 
1464   prologue_cost_vec.create (2);
1465   body_cost_vec.create (2);
1466   epilogue_cost_vec.create (2);
1467 
1468   vect_get_peeling_costs_all_drs (loop_vinfo, elem->dr_info, &inside_cost,
1469 				  &outside_cost, &body_cost_vec,
1470 				  &prologue_cost_vec, elem->npeel, false);
1471 
1472   body_cost_vec.release ();
1473 
1474   outside_cost += vect_get_known_peeling_cost
1475     (loop_vinfo, elem->npeel, &dummy,
1476      &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1477      &prologue_cost_vec, &epilogue_cost_vec);
1478 
1479   /* Prologue and epilogue costs are added to the target model later.
1480      These costs depend only on the scalar iteration cost, the
1481      number of peeling iterations finally chosen, and the number of
1482      misaligned statements.  So discard the information found here.  */
1483   prologue_cost_vec.release ();
1484   epilogue_cost_vec.release ();
1485 
1486   if (inside_cost < min->inside_cost
1487       || (inside_cost == min->inside_cost
1488 	  && outside_cost < min->outside_cost))
1489     {
1490       min->inside_cost = inside_cost;
1491       min->outside_cost = outside_cost;
1492       min->peel_info.dr_info = elem->dr_info;
1493       min->peel_info.npeel = elem->npeel;
1494       min->peel_info.count = elem->count;
1495     }
1496 
1497   return 1;
1498 }
1499 
1500 
1501 /* Choose best peeling option by traversing peeling hash table and either
1502    choosing an option with the lowest cost (if cost model is enabled) or the
1503    option that aligns as many accesses as possible.  */
1504 
1505 static struct _vect_peel_extended_info
1506 vect_peeling_hash_choose_best_peeling (hash_table<peel_info_hasher> *peeling_htab,
1507 				       loop_vec_info loop_vinfo)
1508 {
1509    struct _vect_peel_extended_info res;
1510 
1511    res.peel_info.dr_info = NULL;
1512 
1513    if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1514      {
1515        res.inside_cost = INT_MAX;
1516        res.outside_cost = INT_MAX;
1517        peeling_htab->traverse <_vect_peel_extended_info *,
1518 	   		       vect_peeling_hash_get_lowest_cost> (&res);
1519      }
1520    else
1521      {
1522        res.peel_info.count = 0;
1523        peeling_htab->traverse <_vect_peel_extended_info *,
1524 	   		       vect_peeling_hash_get_most_frequent> (&res);
1525        res.inside_cost = 0;
1526        res.outside_cost = 0;
1527      }
1528 
1529    return res;
1530 }
1531 
1532 /* Return true if the new peeling NPEEL is supported.  */
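/* Conceptually (illustration only): for every data ref other than DR0 the
   function pretends the loop was peeled by NPEEL iterations, asks
   vect_supportable_dr_alignment whether the resulting (mis)alignment is
   still handled by the target, and restores the original misalignment
   before returning.  */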
1533 
1534 static bool
1535 vect_peeling_supportable (loop_vec_info loop_vinfo, dr_vec_info *dr0_info,
1536 			  unsigned npeel)
1537 {
1538   unsigned i;
1539   struct data_reference *dr = NULL;
1540   vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1541   enum dr_alignment_support supportable_dr_alignment;
1542 
1543   /* Ensure that all data refs can be vectorized after the peel.  */
1544   FOR_EACH_VEC_ELT (datarefs, i, dr)
1545     {
1546       int save_misalignment;
1547 
1548       if (dr == dr0_info->dr)
1549 	continue;
1550 
1551       dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1552       stmt_vec_info stmt_info = dr_info->stmt;
1553       /* For interleaving, only the alignment of the first access
1554 	 matters.  */
1555       if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1556 	  && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info)
1557 	continue;
1558 
1559       /* Strided accesses perform only component accesses, alignment is
1560 	 irrelevant for them.  */
1561       if (STMT_VINFO_STRIDED_P (stmt_info)
1562 	  && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1563 	continue;
1564 
1565       save_misalignment = DR_MISALIGNMENT (dr_info);
1566       vect_update_misalignment_for_peel (dr_info, dr0_info, npeel);
1567       supportable_dr_alignment
1568 	= vect_supportable_dr_alignment (dr_info, false);
1569       SET_DR_MISALIGNMENT (dr_info, save_misalignment);
1570 
1571       if (!supportable_dr_alignment)
1572 	return false;
1573     }
1574 
1575   return true;
1576 }
1577 
1578 /* Function vect_enhance_data_refs_alignment
1579 
1580    This pass will use loop versioning and loop peeling in order to enhance
1581    the alignment of data references in the loop.
1582 
1583    FOR NOW: we assume that whatever versioning/peeling takes place, only the
1584    original loop is to be vectorized.  Any other loops that are created by
1585    the transformations performed in this pass are not supposed to be
1586    vectorized.  This restriction will be relaxed.
1587 
1588    This pass will require a cost model to guide it whether to apply peeling
1589    or versioning or a combination of the two.  For example, the scheme that
1590    Intel uses when given a loop with several memory accesses is as follows:
1591    choose one memory access ('p') whose alignment you want to force by doing
1592    peeling.  Then, either (1) generate a loop in which 'p' is aligned and all
1593    other accesses are not necessarily aligned, or (2) use loop versioning to
1594    generate one loop in which all accesses are aligned, and another loop in
1595    which only 'p' is necessarily aligned.
1596 
1597    ("Automatic Intra-Register Vectorization for the Intel Architecture",
1598    Aart J.C. Bik, Milind Girkar, Paul M. Grey and Xinmin Tian, International
1599    Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
1600 
1601    Devising a cost model is the most critical aspect of this work.  It will
1602    guide us on which access to peel for, whether to use loop versioning, how
1603    many versions to create, etc.  The cost model will probably consist of
1604    generic considerations as well as target specific considerations (on
1605    powerpc for example, misaligned stores are more painful than misaligned
1606    loads).
1607 
1608    Here are the general steps involved in alignment enhancements:
1609 
1610      -- original loop, before alignment analysis:
1611 	for (i=0; i<N; i++){
1612 	  x = q[i];			# DR_MISALIGNMENT(q) = unknown
1613 	  p[i] = y;			# DR_MISALIGNMENT(p) = unknown
1614 	}
1615 
1616      -- After vect_compute_data_refs_alignment:
1617 	for (i=0; i<N; i++){
1618 	  x = q[i];			# DR_MISALIGNMENT(q) = 3
1619 	  p[i] = y;			# DR_MISALIGNMENT(p) = unknown
1620 	}
1621 
1622      -- Possibility 1: we do loop versioning:
1623      if (p is aligned) {
1624 	for (i=0; i<N; i++){	# loop 1A
1625 	  x = q[i];			# DR_MISALIGNMENT(q) = 3
1626 	  p[i] = y;			# DR_MISALIGNMENT(p) = 0
1627 	}
1628      }
1629      else {
1630 	for (i=0; i<N; i++){	# loop 1B
1631 	  x = q[i];			# DR_MISALIGNMENT(q) = 3
1632 	  p[i] = y;			# DR_MISALIGNMENT(p) = unaligned
1633 	}
1634      }
1635 
1636      -- Possibility 2: we do loop peeling:
1637      for (i = 0; i < 3; i++){	# (scalar loop, not to be vectorized).
1638 	x = q[i];
1639 	p[i] = y;
1640      }
1641      for (i = 3; i < N; i++){	# loop 2A
1642 	x = q[i];			# DR_MISALIGNMENT(q) = 0
1643 	p[i] = y;			# DR_MISALIGNMENT(p) = unknown
1644      }
1645 
1646      -- Possibility 3: combination of loop peeling and versioning:
1647      for (i = 0; i < 3; i++){	# (scalar loop, not to be vectorized).
1648 	x = q[i];
1649 	p[i] = y;
1650      }
1651      if (p is aligned) {
1652 	for (i = 3; i<N; i++){	# loop 3A
1653 	  x = q[i];			# DR_MISALIGNMENT(q) = 0
1654 	  p[i] = y;			# DR_MISALIGNMENT(p) = 0
1655 	}
1656      }
1657      else {
1658 	for (i = 3; i<N; i++){	# loop 3B
1659 	  x = q[i];			# DR_MISALIGNMENT(q) = 0
1660 	  p[i] = y;			# DR_MISALIGNMENT(p) = unaligned
1661 	}
1662      }
1663 
1664      These loops are later passed to loop_transform to be vectorized.  The
1665      vectorizer will use the alignment information to guide the transformation
1666      (whether to generate regular loads/stores, or with special handling for
1667      misalignment).  */
1668 
1669 opt_result
1670 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
1671 {
1672   vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1673   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1674   enum dr_alignment_support supportable_dr_alignment;
1675   dr_vec_info *first_store = NULL;
1676   dr_vec_info *dr0_info = NULL;
1677   struct data_reference *dr;
1678   unsigned int i, j;
1679   bool do_peeling = false;
1680   bool do_versioning = false;
1681   unsigned int npeel = 0;
1682   bool one_misalignment_known = false;
1683   bool one_misalignment_unknown = false;
1684   bool one_dr_unsupportable = false;
1685   dr_vec_info *unsupportable_dr_info = NULL;
1686   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1687   unsigned possible_npeel_number = 1;
1688   tree vectype;
1689   unsigned int mis, same_align_drs_max = 0;
1690   hash_table<peel_info_hasher> peeling_htab (1);
1691 
1692   DUMP_VECT_SCOPE ("vect_enhance_data_refs_alignment");
1693 
1694   /* Reset data so we can safely be called multiple times.  */
1695   LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
1696   LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = 0;
1697 
1698   /* While cost model enhancements are expected in the future, the high level
1699      view of the code at this time is as follows:
1700 
1701      A) If there is a misaligned access then see if peeling to align
1702         this access can make all data references satisfy
1703         vect_supportable_dr_alignment.  If so, update data structures
1704         as needed and return true.
1705 
1706      B) If peeling wasn't possible and there is a data reference with an
1707         unknown misalignment that does not satisfy vect_supportable_dr_alignment
1708         then see if loop versioning checks can be used to make all data
1709         references satisfy vect_supportable_dr_alignment.  If so, update
1710         data structures as needed and return true.
1711 
1712      C) If neither peeling nor versioning were successful then return false if
1713         any data reference does not satisfy vect_supportable_dr_alignment.
1714 
1715      D) Return true (all data references satisfy vect_supportable_dr_alignment).
1716 
1717      Note, Possibility 3 above (which is peeling and versioning together) is not
1718      being done at this time.  */
1719 
1720   /* (1) Peeling to force alignment.  */
1721 
1722   /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
1723      Considerations:
1724      + How many accesses will become aligned due to the peeling
1725      - How many accesses will become unaligned due to the peeling,
1726        and the cost of misaligned accesses.
1727      - The cost of peeling (the extra runtime checks, the increase
1728        in code size).  */
1729 
1730   FOR_EACH_VEC_ELT (datarefs, i, dr)
1731     {
1732       dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1733       stmt_vec_info stmt_info = dr_info->stmt;
1734 
1735       if (!STMT_VINFO_RELEVANT_P (stmt_info))
1736 	continue;
1737 
1738       /* For interleaving, only the alignment of the first access
1739          matters.  */
1740       if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1741 	  && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info)
1742 	continue;
1743 
1744       /* For scatter-gather or invariant accesses there is nothing
1745 	 to enhance.  */
1746       if (STMT_VINFO_GATHER_SCATTER_P (stmt_info)
1747 	  || integer_zerop (DR_STEP (dr)))
1748 	continue;
1749 
1750       /* Strided accesses perform only component accesses, alignment is
1751 	 irrelevant for them.  */
1752       if (STMT_VINFO_STRIDED_P (stmt_info)
1753 	  && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1754 	continue;
1755 
1756       supportable_dr_alignment = vect_supportable_dr_alignment (dr_info, true);
1757       do_peeling = vector_alignment_reachable_p (dr_info);
1758       if (do_peeling)
1759         {
1760           if (known_alignment_for_access_p (dr_info))
1761             {
1762 	      unsigned int npeel_tmp = 0;
1763 	      bool negative = tree_int_cst_compare (DR_STEP (dr),
1764 						    size_zero_node) < 0;
1765 
1766 	      vectype = STMT_VINFO_VECTYPE (stmt_info);
1767 	      /* If known_alignment_for_access_p then we have set
1768	         DR_MISALIGNMENT which is only done if we know it at compile
1769 	         time, so it is safe to assume target alignment is constant.
1770 	       */
1771 	      unsigned int target_align =
1772 		DR_TARGET_ALIGNMENT (dr_info).to_constant ();
1773 	      unsigned int dr_size = vect_get_scalar_dr_size (dr_info);
1774 	      mis = (negative
1775 		     ? DR_MISALIGNMENT (dr_info)
1776 		     : -DR_MISALIGNMENT (dr_info));
1777 	      if (DR_MISALIGNMENT (dr_info) != 0)
1778 		npeel_tmp = (mis & (target_align - 1)) / dr_size;
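	      /* Worked example with assumed numbers: for a 16-byte target
		 alignment, 4-byte elements and DR_MISALIGNMENT == 4 with a
		 positive step, MIS is -4, (-4 & 15) == 12, and NPEEL_TMP
		 == 12 / 4 == 3; peeling 3 scalar iterations moves the
		 access to a 16-byte boundary.  */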
1779 
1780               /* For multiple types, it is possible that the bigger type access
1781                  will have more than one peeling option.  E.g., a loop with two
1782                  types: one of size (vector size / 4), and the other one of
1783                  size (vector size / 8).  The vectorization factor will be 8.  If both
1784                  accesses are misaligned by 3, the first one needs one scalar
1785                  iteration to be aligned, and the second one needs 5.  But the
1786 		 first one will be aligned also by peeling 5 scalar
1787                  iterations, and in that case both accesses will be aligned.
1788                  Hence, except for the immediate peeling amount, we also want
1789                  to try to add full vector size, while we don't exceed
1790                  vectorization factor.
1791                  We do this automatically for cost model, since we calculate
1792 		 cost for every peeling option.  */
1793               if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1794 		{
1795 		  poly_uint64 nscalars = (STMT_SLP_TYPE (stmt_info)
1796 					  ? vf * DR_GROUP_SIZE (stmt_info) : vf);
1797 		  possible_npeel_number
1798 		    = vect_get_num_vectors (nscalars, vectype);
1799 
1800 		  /* NPEEL_TMP is 0 when there is no misalignment; in that
1801 		     case also allow peeling NELEMENTS.  */
1802 		  if (DR_MISALIGNMENT (dr_info) == 0)
1803 		    possible_npeel_number++;
1804 		}
1805 
1806 	      /* Save info about DR in the hash table.  Also include peeling
1807 	         amounts according to the explanation above.  */
1808               for (j = 0; j < possible_npeel_number; j++)
1809                 {
1810                   vect_peeling_hash_insert (&peeling_htab, loop_vinfo,
1811 					    dr_info, npeel_tmp);
1812 		  npeel_tmp += target_align / dr_size;
1813                 }
1814 
1815 	      one_misalignment_known = true;
1816             }
1817           else
1818             {
1819               /* If we don't know any misalignment values, we prefer
1820                  peeling for the data-ref that has the maximum number of data-refs
1821                  with the same alignment, unless the target prefers to align
1822                  stores over loads.  */
1823 	      unsigned same_align_drs
1824 		= STMT_VINFO_SAME_ALIGN_REFS (stmt_info).length ();
1825 	      if (!dr0_info
1826 		  || same_align_drs_max < same_align_drs)
1827 		{
1828 		  same_align_drs_max = same_align_drs;
1829 		  dr0_info = dr_info;
1830 		}
1831 	      /* For data-refs with the same number of related
1832 		 accesses prefer the one where the misalign
1833 		 computation will be invariant in the outermost loop.  */
1834 	      else if (same_align_drs_max == same_align_drs)
1835 		{
1836 		  class loop *ivloop0, *ivloop;
1837 		  ivloop0 = outermost_invariant_loop_for_expr
1838 		    (loop, DR_BASE_ADDRESS (dr0_info->dr));
1839 		  ivloop = outermost_invariant_loop_for_expr
1840 		    (loop, DR_BASE_ADDRESS (dr));
1841 		  if ((ivloop && !ivloop0)
1842 		      || (ivloop && ivloop0
1843 			  && flow_loop_nested_p (ivloop, ivloop0)))
1844 		    dr0_info = dr_info;
1845 		}
1846 
1847 	      one_misalignment_unknown = true;
1848 
1849 	      /* Check for data refs with unsupportable alignment that
1850 	         can be peeled.  */
1851 	      if (!supportable_dr_alignment)
1852 	      {
1853 		one_dr_unsupportable = true;
1854 		unsupportable_dr_info = dr_info;
1855 	      }
1856 
1857 	      if (!first_store && DR_IS_WRITE (dr))
1858 		first_store = dr_info;
1859             }
1860         }
1861       else
1862         {
1863           if (!aligned_access_p (dr_info))
1864             {
1865               if (dump_enabled_p ())
1866                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1867                                  "vector alignment may not be reachable\n");
1868               break;
1869             }
1870         }
1871     }
1872 
1873   /* Check if we can possibly peel the loop.  */
1874   if (!vect_can_advance_ivs_p (loop_vinfo)
1875       || !slpeel_can_duplicate_loop_p (loop, single_exit (loop))
1876       || loop->inner)
1877     do_peeling = false;
1878 
1879   struct _vect_peel_extended_info peel_for_known_alignment;
1880   struct _vect_peel_extended_info peel_for_unknown_alignment;
1881   struct _vect_peel_extended_info best_peel;
1882 
1883   peel_for_unknown_alignment.inside_cost = INT_MAX;
1884   peel_for_unknown_alignment.outside_cost = INT_MAX;
1885   peel_for_unknown_alignment.peel_info.count = 0;
1886 
1887   if (do_peeling
1888       && one_misalignment_unknown)
1889     {
1890       /* Check whether the target prefers stores over loads, i.e., whether
1891          misaligned stores are more expensive than misaligned loads (taking
1892          drs with same alignment into account).  */
1893       unsigned int load_inside_cost = 0;
1894       unsigned int load_outside_cost = 0;
1895       unsigned int store_inside_cost = 0;
1896       unsigned int store_outside_cost = 0;
1897       unsigned int estimated_npeels = vect_vf_for_cost (loop_vinfo) / 2;
1898 
1899       stmt_vector_for_cost dummy;
1900       dummy.create (2);
1901       vect_get_peeling_costs_all_drs (loop_vinfo, dr0_info,
1902 				      &load_inside_cost,
1903 				      &load_outside_cost,
1904 				      &dummy, &dummy, estimated_npeels, true);
1905       dummy.release ();
1906 
1907       if (first_store)
1908 	{
1909 	  dummy.create (2);
1910 	  vect_get_peeling_costs_all_drs (loop_vinfo, first_store,
1911 					  &store_inside_cost,
1912 					  &store_outside_cost,
1913 					  &dummy, &dummy,
1914 					  estimated_npeels, true);
1915 	  dummy.release ();
1916 	}
1917       else
1918 	{
1919 	  store_inside_cost = INT_MAX;
1920 	  store_outside_cost = INT_MAX;
1921 	}
1922 
1923       if (load_inside_cost > store_inside_cost
1924 	  || (load_inside_cost == store_inside_cost
1925 	      && load_outside_cost > store_outside_cost))
1926 	{
1927 	  dr0_info = first_store;
1928 	  peel_for_unknown_alignment.inside_cost = store_inside_cost;
1929 	  peel_for_unknown_alignment.outside_cost = store_outside_cost;
1930 	}
1931       else
1932 	{
1933 	  peel_for_unknown_alignment.inside_cost = load_inside_cost;
1934 	  peel_for_unknown_alignment.outside_cost = load_outside_cost;
1935 	}
1936 
1937       stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
1938       prologue_cost_vec.create (2);
1939       epilogue_cost_vec.create (2);
1940 
1941       int dummy2;
1942       peel_for_unknown_alignment.outside_cost += vect_get_known_peeling_cost
1943 	(loop_vinfo, estimated_npeels, &dummy2,
1944 	 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1945 	 &prologue_cost_vec, &epilogue_cost_vec);
1946 
1947       prologue_cost_vec.release ();
1948       epilogue_cost_vec.release ();
1949 
1950       peel_for_unknown_alignment.peel_info.count = 1
1951 	+ STMT_VINFO_SAME_ALIGN_REFS (dr0_info->stmt).length ();
1952     }
1953 
1954   peel_for_unknown_alignment.peel_info.npeel = 0;
1955   peel_for_unknown_alignment.peel_info.dr_info = dr0_info;
1956 
1957   best_peel = peel_for_unknown_alignment;
1958 
1959   peel_for_known_alignment.inside_cost = INT_MAX;
1960   peel_for_known_alignment.outside_cost = INT_MAX;
1961   peel_for_known_alignment.peel_info.count = 0;
1962   peel_for_known_alignment.peel_info.dr_info = NULL;
1963 
1964   if (do_peeling && one_misalignment_known)
1965     {
1966       /* Peeling is possible, but no data access is unsupportable when
1967          misaligned.  So we try to choose the best possible peeling from
1968 	 the hash table.  */
1969       peel_for_known_alignment = vect_peeling_hash_choose_best_peeling
1970 	(&peeling_htab, loop_vinfo);
1971     }
1972 
1973   /* Compare costs of peeling for known and unknown alignment. */
1974   if (peel_for_known_alignment.peel_info.dr_info != NULL
1975       && peel_for_unknown_alignment.inside_cost
1976       >= peel_for_known_alignment.inside_cost)
1977     {
1978       best_peel = peel_for_known_alignment;
1979 
1980       /* If the best peeling for known alignment has NPEEL == 0, perform no
1981          peeling at all except if there is an unsupportable dr that we can
1982          align.  */
1983       if (best_peel.peel_info.npeel == 0 && !one_dr_unsupportable)
1984 	do_peeling = false;
1985     }
1986 
1987   /* If there is an unsupportable data ref, prefer this over all choices so far
1988      since we'd have to discard a chosen peeling except when it accidentally
1989      aligned the unsupportable data ref.  */
1990   if (one_dr_unsupportable)
1991     dr0_info = unsupportable_dr_info;
1992   else if (do_peeling)
1993     {
1994       /* Calculate the penalty for no peeling, i.e. leaving everything as-is.
1995 	 TODO: Use nopeel_outside_cost or get rid of it?  */
1996       unsigned nopeel_inside_cost = 0;
1997       unsigned nopeel_outside_cost = 0;
1998 
1999       stmt_vector_for_cost dummy;
2000       dummy.create (2);
2001       vect_get_peeling_costs_all_drs (loop_vinfo, NULL, &nopeel_inside_cost,
2002 				      &nopeel_outside_cost, &dummy, &dummy,
2003 				      0, false);
2004       dummy.release ();
2005 
2006       /* Add epilogue costs.  As we do not peel for alignment here, no prologue
2007 	 costs will be recorded.  */
2008       stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
2009       prologue_cost_vec.create (2);
2010       epilogue_cost_vec.create (2);
2011 
2012       int dummy2;
2013       nopeel_outside_cost += vect_get_known_peeling_cost
2014 	(loop_vinfo, 0, &dummy2,
2015 	 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
2016 	 &prologue_cost_vec, &epilogue_cost_vec);
2017 
2018       prologue_cost_vec.release ();
2019       epilogue_cost_vec.release ();
2020 
2021       npeel = best_peel.peel_info.npeel;
2022       dr0_info = best_peel.peel_info.dr_info;
2023 
2024       /* If not peeling is no more expensive than the best peeling found
2025	 so far, don't perform any peeling.  */
2026       if (nopeel_inside_cost <= best_peel.inside_cost)
2027 	do_peeling = false;
2028     }
2029 
2030   if (do_peeling)
2031     {
2032       stmt_vec_info stmt_info = dr0_info->stmt;
2033       vectype = STMT_VINFO_VECTYPE (stmt_info);
2034 
2035       if (known_alignment_for_access_p (dr0_info))
2036         {
2037 	  bool negative = tree_int_cst_compare (DR_STEP (dr0_info->dr),
2038 						size_zero_node) < 0;
2039           if (!npeel)
2040             {
2041               /* Since it's known at compile time, compute the number of
2042                  iterations in the peeled loop (the peeling factor) for use in
2043                  updating DR_MISALIGNMENT values.  The peeling factor is the
2044                  target alignment minus the misalignment, expressed as an
2045                  element count.  */
2046 	      mis = (negative
2047 		     ? DR_MISALIGNMENT (dr0_info)
2048 		     : -DR_MISALIGNMENT (dr0_info));
2049 	      /* If known_alignment_for_access_p then we have set
2050	         DR_MISALIGNMENT which is only done if we know it at compile
2051 	         time, so it is safe to assume target alignment is constant.
2052 	       */
2053 	      unsigned int target_align =
2054 		DR_TARGET_ALIGNMENT (dr0_info).to_constant ();
2055 	      npeel = ((mis & (target_align - 1))
2056 		       / vect_get_scalar_dr_size (dr0_info));
2057             }
2058 
2059 	  /* For interleaved data access every iteration accesses all the
2060 	     members of the group, therefore we divide the number of iterations
2061 	     by the group size.  */
2062 	  if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2063 	    npeel /= DR_GROUP_SIZE (stmt_info);
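	  /* For example (assumed numbers): if peeling 6 scalar elements is
	     needed and the interleaving group has 2 members, NPEEL becomes
	     3, since each scalar iteration accesses both members of the
	     group.  */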
2064 
2065           if (dump_enabled_p ())
2066             dump_printf_loc (MSG_NOTE, vect_location,
2067                              "Try peeling by %d\n", npeel);
2068         }
2069 
2070       /* Ensure that all datarefs can be vectorized after the peel.  */
2071       if (!vect_peeling_supportable (loop_vinfo, dr0_info, npeel))
2072 	do_peeling = false;
2073 
2074       /* Check if all datarefs are supportable and log.  */
2075       if (do_peeling && known_alignment_for_access_p (dr0_info) && npeel == 0)
2076         {
2077           opt_result stat = vect_verify_datarefs_alignment (loop_vinfo);
2078           if (!stat)
2079             do_peeling = false;
2080           else
2081 	    return stat;
2082         }
2083 
2084       /* Cost model #1 - honor --param vect-max-peeling-for-alignment.  */
2085       if (do_peeling)
2086         {
2087           unsigned max_allowed_peel
2088 	    = param_vect_max_peeling_for_alignment;
2089 	  if (flag_vect_cost_model == VECT_COST_MODEL_CHEAP)
2090 	    max_allowed_peel = 0;
2091           if (max_allowed_peel != (unsigned)-1)
2092             {
2093               unsigned max_peel = npeel;
2094               if (max_peel == 0)
2095                 {
2096 		  poly_uint64 target_align = DR_TARGET_ALIGNMENT (dr0_info);
2097 		  unsigned HOST_WIDE_INT target_align_c;
2098 		  if (target_align.is_constant (&target_align_c))
2099 		    max_peel =
2100 		      target_align_c / vect_get_scalar_dr_size (dr0_info) - 1;
2101 		  else
2102 		    {
2103 		      do_peeling = false;
2104 		      if (dump_enabled_p ())
2105 			dump_printf_loc (MSG_NOTE, vect_location,
2106 			  "Disable peeling, max peels set and vector"
2107 			  " alignment unknown\n");
2108 		    }
2109                 }
2110               if (max_peel > max_allowed_peel)
2111                 {
2112                   do_peeling = false;
2113                   if (dump_enabled_p ())
2114                     dump_printf_loc (MSG_NOTE, vect_location,
2115                         "Disable peeling, max peels reached: %d\n", max_peel);
2116                 }
2117             }
2118         }
2119 
2120       /* Cost model #2 - if peeling may result in a remaining loop not
2121 	 iterating enough to be vectorized then do not peel.  Since this
2122 	 is a cost heuristic rather than a correctness decision, use the
2123 	 most likely runtime value for variable vectorization factors.  */
2124       if (do_peeling
2125 	  && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2126 	{
2127 	  unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
2128 	  unsigned int max_peel = npeel == 0 ? assumed_vf - 1 : npeel;
2129 	  if ((unsigned HOST_WIDE_INT) LOOP_VINFO_INT_NITERS (loop_vinfo)
2130 	      < assumed_vf + max_peel)
2131 	    do_peeling = false;
2132 	}
2133 
2134       if (do_peeling)
2135         {
2136           /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
2137              If the misalignment of DR_i is identical to that of dr0 then set
2138              DR_MISALIGNMENT (DR_i) to zero.  If the misalignment of DR_i and
2139              dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
2140              by the peeling factor times the element size of DR_i (MOD the
2141              vectorization factor times the size).  Otherwise, the
2142              misalignment of DR_i must be set to unknown.  */
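          /* Hypothetical illustration: with 4-byte elements, a peer DR_i
             whose known misalignment is 8 and NPEEL == 3 ends up with
             misalignment (8 + 3 * 4) mod 16 == 4, assuming a 16-byte
             target alignment; if its misalignment was unknown it stays
             unknown.  */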
2143 	  FOR_EACH_VEC_ELT (datarefs, i, dr)
2144 	    if (dr != dr0_info->dr)
2145 	      {
2146 		/* Strided accesses perform only component accesses, alignment
2147 		   is irrelevant for them.  */
2148 		dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
2149 		stmt_info = dr_info->stmt;
2150 		if (STMT_VINFO_STRIDED_P (stmt_info)
2151 		    && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
2152 		  continue;
2153 
2154 		vect_update_misalignment_for_peel (dr_info, dr0_info, npeel);
2155 	      }
2156 
2157           LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0_info;
2158           if (npeel)
2159             LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
2160           else
2161             LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2162 	      = DR_MISALIGNMENT (dr0_info);
2163 	  SET_DR_MISALIGNMENT (dr0_info, 0);
2164 	  if (dump_enabled_p ())
2165             {
2166               dump_printf_loc (MSG_NOTE, vect_location,
2167                                "Alignment of access forced using peeling.\n");
2168               dump_printf_loc (MSG_NOTE, vect_location,
2169                                "Peeling for alignment will be applied.\n");
2170             }
2171 
2172 	  /* The inside-loop cost will be accounted for in vectorizable_load
2173 	     and vectorizable_store correctly with adjusted alignments.
2174 	     Drop the body_cst_vec on the floor here.  */
2175 	  opt_result stat = vect_verify_datarefs_alignment (loop_vinfo);
2176 	  gcc_assert (stat);
2177           return stat;
2178         }
2179     }
2180 
2181   /* (2) Versioning to force alignment.  */
2182 
2183   /* Try versioning if:
2184      1) the loop is optimized for speed and the cost model is not cheap,
2185      2) there is at least one unsupported misaligned data ref with an unknown
2186         misalignment, and
2187      3) all misaligned data refs with a known misalignment are supported, and
2188      4) the number of runtime alignment checks is within reason.  */
2189 
2190   do_versioning
2191     = (optimize_loop_nest_for_speed_p (loop)
2192        && !loop->inner /* FORNOW */
2193        && flag_vect_cost_model != VECT_COST_MODEL_CHEAP);
2194 
2195   if (do_versioning)
2196     {
2197       FOR_EACH_VEC_ELT (datarefs, i, dr)
2198         {
2199 	  dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
2200 	  stmt_vec_info stmt_info = dr_info->stmt;
2201 
2202 	  /* For interleaving, only the alignment of the first access
2203 	     matters.  */
2204 	  if (aligned_access_p (dr_info)
2205 	      || (STMT_VINFO_GROUPED_ACCESS (stmt_info)
2206 		  && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info))
2207 	    continue;
2208 
2209 	  if (STMT_VINFO_STRIDED_P (stmt_info))
2210 	    {
2211 	      /* Strided loads perform only component accesses, alignment is
2212 		 irrelevant for them.  */
2213 	      if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
2214 		continue;
2215 	      do_versioning = false;
2216 	      break;
2217 	    }
2218 
2219 	  supportable_dr_alignment
2220 	    = vect_supportable_dr_alignment (dr_info, false);
2221 
2222           if (!supportable_dr_alignment)
2223             {
2224               int mask;
2225               tree vectype;
2226 
2227               if (known_alignment_for_access_p (dr_info)
2228                   || LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
2229 		  >= (unsigned) param_vect_max_version_for_alignment_checks)
2230                 {
2231                   do_versioning = false;
2232                   break;
2233                 }
2234 
2235 	      vectype = STMT_VINFO_VECTYPE (stmt_info);
2236 	      gcc_assert (vectype);
2237 
2238 	      /* At present we don't support versioning for alignment
2239 		 with variable VF, since there's no guarantee that the
2240 		 VF is a power of two.  We could relax this if we added
2241 		 a way of enforcing a power-of-two size.  */
2242 	      unsigned HOST_WIDE_INT size;
2243 	      if (!GET_MODE_SIZE (TYPE_MODE (vectype)).is_constant (&size))
2244 		{
2245 		  do_versioning = false;
2246 		  break;
2247 		}
2248 
2249 	      /* Forcing alignment in the first iteration is no good if
2250 		 we don't keep it across iterations.  For now, just disable
2251 		 versioning in this case.
2252 		 ?? We could actually unroll the loop to achieve the required
2253 		 overall step alignment, and forcing the alignment could be
2254 		 done by doing some iterations of the non-vectorized loop.  */
2255 	      if (!multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
2256 			       * DR_STEP_ALIGNMENT (dr),
2257 			       DR_TARGET_ALIGNMENT (dr_info)))
2258 		{
2259 		  do_versioning = false;
2260 		  break;
2261 		}
2262 
2263               /* The rightmost bits of an aligned address must be zeros.
2264                  Construct the mask needed for this test.  For example,
2265                  GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
2266                  mask must be 15 = 0xf. */
2267 	      mask = size - 1;
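	      /* Illustration (assumed vector size of 16 bytes): MASK is
		 0xf, and the runtime alignment check generated later for
		 the versioned loop is conceptually of the form
		 ((uintptr_t) addr & 0xf) == 0 for each recorded access.  */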
2268 
2269 	      /* FORNOW: use the same mask to test all potentially unaligned
2270 		 references in the loop.  */
2271 	      if (LOOP_VINFO_PTR_MASK (loop_vinfo)
2272 		  && LOOP_VINFO_PTR_MASK (loop_vinfo) != mask)
2273 		{
2274 		  do_versioning = false;
2275 		  break;
2276 		}
2277 
2278               LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
2279 	      LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (stmt_info);
2280             }
2281         }
2282 
2283       /* Versioning requires at least one misaligned data reference.  */
2284       if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2285         do_versioning = false;
2286       else if (!do_versioning)
2287         LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
2288     }
2289 
2290   if (do_versioning)
2291     {
2292       vec<stmt_vec_info> may_misalign_stmts
2293         = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
2294       stmt_vec_info stmt_info;
2295 
2296       /* It can now be assumed that the data references in the statements
2297          in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
2298          of the loop being vectorized.  */
2299       FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt_info)
2300         {
2301 	  dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
2302 	  SET_DR_MISALIGNMENT (dr_info, 0);
2303 	  if (dump_enabled_p ())
2304             dump_printf_loc (MSG_NOTE, vect_location,
2305                              "Alignment of access forced using versioning.\n");
2306         }
2307 
2308       if (dump_enabled_p ())
2309         dump_printf_loc (MSG_NOTE, vect_location,
2310                          "Versioning for alignment will be applied.\n");
2311 
2312       /* Peeling and versioning can't be done together at this time.  */
2313       gcc_assert (! (do_peeling && do_versioning));
2314 
2315       opt_result stat = vect_verify_datarefs_alignment (loop_vinfo);
2316       gcc_assert (stat);
2317       return stat;
2318     }
2319 
2320   /* This point is reached if neither peeling nor versioning is being done.  */
2321   gcc_assert (! (do_peeling || do_versioning));
2322 
2323   opt_result stat = vect_verify_datarefs_alignment (loop_vinfo);
2324   return stat;
2325 }
2326 
2327 
2328 /* Function vect_find_same_alignment_drs.
2329 
2330    Update group and alignment relations in VINFO according to the chosen
2331    vectorization factor.  */
2332 
2333 static void
2334 vect_find_same_alignment_drs (vec_info *vinfo, data_dependence_relation *ddr)
2335 {
2336   struct data_reference *dra = DDR_A (ddr);
2337   struct data_reference *drb = DDR_B (ddr);
2338   dr_vec_info *dr_info_a = vinfo->lookup_dr (dra);
2339   dr_vec_info *dr_info_b = vinfo->lookup_dr (drb);
2340   stmt_vec_info stmtinfo_a = dr_info_a->stmt;
2341   stmt_vec_info stmtinfo_b = dr_info_b->stmt;
2342 
2343   if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
2344     return;
2345 
2346   if (dra == drb)
2347     return;
2348 
2349   if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
2350       || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
2351     return;
2352 
2353   if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0)
2354       || !operand_equal_p (DR_OFFSET (dra), DR_OFFSET (drb), 0)
2355       || !operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2356     return;
2357 
2358   /* Two references with distance zero have the same alignment.  */
2359   poly_offset_int diff = (wi::to_poly_offset (DR_INIT (dra))
2360 			  - wi::to_poly_offset (DR_INIT (drb)));
2361   if (maybe_ne (diff, 0))
2362     {
2363       /* Get the wider of the two alignments.  */
2364       poly_uint64 align_a =
2365 	exact_div (vect_calculate_target_alignment (dr_info_a),
2366 		   BITS_PER_UNIT);
2367       poly_uint64 align_b =
2368 	exact_div (vect_calculate_target_alignment (dr_info_b),
2369 		   BITS_PER_UNIT);
2370       unsigned HOST_WIDE_INT align_a_c, align_b_c;
2371       if (!align_a.is_constant (&align_a_c)
2372 	  || !align_b.is_constant (&align_b_c))
2373 	return;
2374 
2375       unsigned HOST_WIDE_INT max_align = MAX (align_a_c, align_b_c);
2376 
2377       /* Require the gap to be a multiple of the larger vector alignment.  */
2378       if (!multiple_p (diff, max_align))
2379 	return;
2380     }
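  /* Example with assumed numbers: accesses a[i] and a[i+4] with 4-byte
     elements and a 16-byte target alignment differ by 16 bytes, a multiple
     of the larger alignment, so they are recorded below as having the same
     alignment.  */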
2381 
2382   STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_a).safe_push (drb);
2383   STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_b).safe_push (dra);
2384   if (dump_enabled_p ())
2385     dump_printf_loc (MSG_NOTE, vect_location,
2386 		     "accesses have the same alignment: %T and %T\n",
2387 		     DR_REF (dra), DR_REF (drb));
2388 }
2389 
2390 
2391 /* Function vect_analyze_data_refs_alignment
2392 
2393    Analyze the alignment of the data-references in the loop.
2394    Return FALSE if a data reference is found that cannot be vectorized.  */
2395 
2396 opt_result
2397 vect_analyze_data_refs_alignment (loop_vec_info vinfo)
2398 {
2399   DUMP_VECT_SCOPE ("vect_analyze_data_refs_alignment");
2400 
2401   /* Mark groups of data references with same alignment using
2402      data dependence information.  */
2403   vec<ddr_p> ddrs = vinfo->shared->ddrs;
2404   struct data_dependence_relation *ddr;
2405   unsigned int i;
2406 
2407   FOR_EACH_VEC_ELT (ddrs, i, ddr)
2408     vect_find_same_alignment_drs (vinfo, ddr);
2409 
2410   vec<data_reference_p> datarefs = vinfo->shared->datarefs;
2411   struct data_reference *dr;
2412 
2413   vect_record_base_alignments (vinfo);
2414   FOR_EACH_VEC_ELT (datarefs, i, dr)
2415     {
2416       dr_vec_info *dr_info = vinfo->lookup_dr (dr);
2417       if (STMT_VINFO_VECTORIZABLE (dr_info->stmt))
2418 	vect_compute_data_ref_alignment (dr_info);
2419     }
2420 
2421   return opt_result::success ();
2422 }
2423 
2424 
2425 /* Analyze alignment of DRs of stmts in NODE.  */
2426 
2427 static bool
2428 vect_slp_analyze_and_verify_node_alignment (slp_tree node)
2429 {
2430   /* We vectorize from the first scalar stmt in the node unless
2431      the node is permuted in which case we start from the first
2432      element in the group.  */
2433   stmt_vec_info first_stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
2434   dr_vec_info *first_dr_info = STMT_VINFO_DR_INFO (first_stmt_info);
2435   if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
2436     first_stmt_info = DR_GROUP_FIRST_ELEMENT (first_stmt_info);
2437 
2438   dr_vec_info *dr_info = STMT_VINFO_DR_INFO (first_stmt_info);
2439   vect_compute_data_ref_alignment (dr_info);
2440   /* For creating the data-ref pointer we need alignment of the
2441      first element anyway.  */
2442   if (dr_info != first_dr_info)
2443     vect_compute_data_ref_alignment (first_dr_info);
2444   if (! verify_data_ref_alignment (dr_info))
2445     {
2446       if (dump_enabled_p ())
2447 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2448 			 "not vectorized: bad data alignment in basic "
2449 			 "block.\n");
2450       return false;
2451     }
2452 
2453   return true;
2454 }
2455 
2456 /* Function vect_slp_analyze_instance_alignment
2457 
2458    Analyze the alignment of the data-references in the SLP instance.
2459    Return FALSE if a data reference is found that cannot be vectorized.  */
2460 
2461 bool
2462 vect_slp_analyze_and_verify_instance_alignment (slp_instance instance)
2463 {
2464   DUMP_VECT_SCOPE ("vect_slp_analyze_and_verify_instance_alignment");
2465 
2466   slp_tree node;
2467   unsigned i;
2468   FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, node)
2469     if (! vect_slp_analyze_and_verify_node_alignment (node))
2470       return false;
2471 
2472   node = SLP_INSTANCE_TREE (instance);
2473   if (STMT_VINFO_DATA_REF (SLP_TREE_SCALAR_STMTS (node)[0])
2474       && ! vect_slp_analyze_and_verify_node_alignment
2475 	     (SLP_INSTANCE_TREE (instance)))
2476     return false;
2477 
2478   return true;
2479 }
2480 
2481 
2482 /* Analyze groups of accesses: check that DR_INFO belongs to a group of
2483    accesses of legal size, step, etc.  Detect gaps, single element
2484    interleaving, and other special cases. Set grouped access info.
2485    Collect groups of strided stores for further use in SLP analysis.
2486    Worker for vect_analyze_group_access.  */
2487 
2488 static bool
2489 vect_analyze_group_access_1 (dr_vec_info *dr_info)
2490 {
2491   data_reference *dr = dr_info->dr;
2492   tree step = DR_STEP (dr);
2493   tree scalar_type = TREE_TYPE (DR_REF (dr));
2494   HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
2495   stmt_vec_info stmt_info = dr_info->stmt;
2496   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2497   bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
2498   HOST_WIDE_INT dr_step = -1;
2499   HOST_WIDE_INT groupsize, last_accessed_element = 1;
2500   bool slp_impossible = false;
2501 
2502   /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2503      size of the interleaving group (including gaps).  */
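  /* For instance (assumed types): loads a[4*i], a[4*i+1] and a[4*i+2] of
     4-byte ints share DR_STEP == 16, so GROUPSIZE is 4 even though only
     three members are present and one element of each group is skipped.  */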
2504   if (tree_fits_shwi_p (step))
2505     {
2506       dr_step = tree_to_shwi (step);
2507       /* Check that STEP is a multiple of type size.  Otherwise there is
2508          a non-element-sized gap at the end of the group which we
2509 	 cannot represent in DR_GROUP_GAP or DR_GROUP_SIZE.
2510 	 ???  As we can handle non-constant step fine here we should
2511 	 simply remove uses of DR_GROUP_GAP between the last and first
2512 	 element and instead rely on DR_STEP.  DR_GROUP_SIZE then would
2513 	 simply not include that gap.  */
2514       if ((dr_step % type_size) != 0)
2515 	{
2516 	  if (dump_enabled_p ())
2517 	    dump_printf_loc (MSG_NOTE, vect_location,
2518 			     "Step %T is not a multiple of the element size"
2519 			     " for %T\n",
2520 			     step, DR_REF (dr));
2521 	  return false;
2522 	}
2523       groupsize = absu_hwi (dr_step) / type_size;
2524     }
2525   else
2526     groupsize = 0;
2527 
2528   /* A non-consecutive access is possible only if it is part of an interleaving group.  */
2529   if (!DR_GROUP_FIRST_ELEMENT (stmt_info))
2530     {
2531       /* Check if this DR is part of an interleaving group, and is a single
2532 	 element of the group that is accessed in the loop.  */
2533 
2534       /* Gaps are supported only for loads. STEP must be a multiple of the type
2535 	 size.  */
2536       if (DR_IS_READ (dr)
2537 	  && (dr_step % type_size) == 0
2538 	  && groupsize > 0)
2539 	{
2540 	  DR_GROUP_FIRST_ELEMENT (stmt_info) = stmt_info;
2541 	  DR_GROUP_SIZE (stmt_info) = groupsize;
2542 	  DR_GROUP_GAP (stmt_info) = groupsize - 1;
2543 	  if (dump_enabled_p ())
2544 	    dump_printf_loc (MSG_NOTE, vect_location,
2545 			     "Detected single element interleaving %T"
2546 			     " step %T\n",
2547 			     DR_REF (dr), step);
2548 
2549 	  return true;
2550 	}
2551 
2552       if (dump_enabled_p ())
2553 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2554 			 "not consecutive access %G", stmt_info->stmt);
2555 
2556       if (bb_vinfo)
2557 	{
2558 	  /* Mark the statement as unvectorizable.  */
2559 	  STMT_VINFO_VECTORIZABLE (stmt_info) = false;
2560 	  return true;
2561 	}
2562 
2563       if (dump_enabled_p ())
2564 	dump_printf_loc (MSG_NOTE, vect_location, "using strided accesses\n");
2565       STMT_VINFO_STRIDED_P (stmt_info) = true;
2566       return true;
2567     }
2568 
2569   if (DR_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info)
2570     {
2571       /* First stmt in the interleaving chain. Check the chain.  */
2572       stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (stmt_info);
2573       struct data_reference *data_ref = dr;
2574       unsigned int count = 1;
2575       tree prev_init = DR_INIT (data_ref);
2576       HOST_WIDE_INT diff, gaps = 0;
2577 
2578       /* By construction, all group members have INTEGER_CST DR_INITs.  */
2579       while (next)
2580         {
2581           /* We never have the same DR multiple times.  */
2582           gcc_assert (tree_int_cst_compare (DR_INIT (data_ref),
2583 				DR_INIT (STMT_VINFO_DATA_REF (next))) != 0);
2584 
2585 	  data_ref = STMT_VINFO_DATA_REF (next);
2586 
2587 	  /* All group members have the same STEP by construction.  */
2588 	  gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0));
2589 
2590           /* Check that the distance between two accesses is equal to the type
2591              size. Otherwise, we have gaps.  */
2592           diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
2593                   - TREE_INT_CST_LOW (prev_init)) / type_size;
2594 	  if (diff != 1)
2595 	    {
2596 	      /* FORNOW: SLP of accesses with gaps is not supported.  */
2597 	      slp_impossible = true;
2598 	      if (DR_IS_WRITE (data_ref))
2599 		{
2600                   if (dump_enabled_p ())
2601                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2602                                      "interleaved store with gaps\n");
2603 		  return false;
2604 		}
2605 
2606               gaps += diff - 1;
2607 	    }
2608 
2609 	  last_accessed_element += diff;
2610 
2611           /* Store the gap from the previous member of the group. If there is no
2612              gap in the access, DR_GROUP_GAP is always 1.  */
2613 	  DR_GROUP_GAP (next) = diff;
2614 
2615 	  prev_init = DR_INIT (data_ref);
2616 	  next = DR_GROUP_NEXT_ELEMENT (next);
2617 	  /* Count the number of data-refs in the chain.  */
2618 	  count++;
2619         }
2620 
2621       if (groupsize == 0)
2622         groupsize = count + gaps;
2623 
2624       /* This could be UINT_MAX but as we are generating code in a very
2625          inefficient way we have to cap earlier.  See PR78699 for example.  */
2626       if (groupsize > 4096)
2627 	{
2628 	  if (dump_enabled_p ())
2629 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2630 			     "group is too large\n");
2631 	  return false;
2632 	}
2633 
2634       /* Check that the size of the interleaving is equal to count for stores,
2635          i.e., that there are no gaps.  */
2636       if (groupsize != count
2637 	  && !DR_IS_READ (dr))
2638         {
2639 	  groupsize = count;
2640 	  STMT_VINFO_STRIDED_P (stmt_info) = true;
2641 	}
2642 
2643       /* If there is a gap after the last load in the group it is the
2644 	 difference between the groupsize and the last accessed
2645 	 element.
2646 	 When there is no gap, this difference should be 0.  */
2647       DR_GROUP_GAP (stmt_info) = groupsize - last_accessed_element;
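      /* E.g. (assumed group): with GROUPSIZE == 4 and the last accessed
         element being the third one, DR_GROUP_GAP becomes 1, recording a
         trailing gap of one element after the last load of the group.  */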
2648 
2649       DR_GROUP_SIZE (stmt_info) = groupsize;
2650       if (dump_enabled_p ())
2651 	{
2652 	  dump_printf_loc (MSG_NOTE, vect_location,
2653 			   "Detected interleaving ");
2654 	  if (DR_IS_READ (dr))
2655 	    dump_printf (MSG_NOTE, "load ");
2656 	  else if (STMT_VINFO_STRIDED_P (stmt_info))
2657 	    dump_printf (MSG_NOTE, "strided store ");
2658 	  else
2659 	    dump_printf (MSG_NOTE, "store ");
2660 	  dump_printf (MSG_NOTE, "of size %u\n",
2661 		       (unsigned)groupsize);
2662 	  dump_printf_loc (MSG_NOTE, vect_location, "\t%G", stmt_info->stmt);
2663 	  next = DR_GROUP_NEXT_ELEMENT (stmt_info);
2664 	  while (next)
2665 	    {
2666 	      if (DR_GROUP_GAP (next) != 1)
2667 		dump_printf_loc (MSG_NOTE, vect_location,
2668 				 "\t<gap of %d elements>\n",
2669 				 DR_GROUP_GAP (next) - 1);
2670 	      dump_printf_loc (MSG_NOTE, vect_location, "\t%G", next->stmt);
2671 	      next = DR_GROUP_NEXT_ELEMENT (next);
2672 	    }
2673 	  if (DR_GROUP_GAP (stmt_info) != 0)
2674 	    dump_printf_loc (MSG_NOTE, vect_location,
2675 			     "\t<gap of %d elements>\n",
2676 			     DR_GROUP_GAP (stmt_info));
2677 	}
2678 
2679       /* SLP: create an SLP data structure for every interleaving group of
2680	 stores for further analysis in vect_analyze_slp.  */
2681       if (DR_IS_WRITE (dr) && !slp_impossible)
2682 	{
2683 	  if (loop_vinfo)
2684 	    LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt_info);
2685 	  if (bb_vinfo)
2686 	    BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt_info);
2687 	}
2688     }
2689 
2690   return true;
2691 }
2692 
2693 /* Analyze groups of accesses: check that DR_INFO belongs to a group of
2694    accesses of legal size, step, etc.  Detect gaps, single element
2695    interleaving, and other special cases. Set grouped access info.
2696    Collect groups of strided stores for further use in SLP analysis.  */
2697 
2698 static bool
2699 vect_analyze_group_access (dr_vec_info *dr_info)
2700 {
2701   if (!vect_analyze_group_access_1 (dr_info))
2702     {
2703       /* Dissolve the group if present.  */
2704       stmt_vec_info stmt_info = DR_GROUP_FIRST_ELEMENT (dr_info->stmt);
2705       while (stmt_info)
2706 	{
2707 	  stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (stmt_info);
2708 	  DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2709 	  DR_GROUP_NEXT_ELEMENT (stmt_info) = NULL;
2710 	  stmt_info = next;
2711 	}
2712       return false;
2713     }
2714   return true;
2715 }
2716 
2717 /* Analyze the access pattern of the data-reference DR_INFO.
2718    In case of non-consecutive accesses call vect_analyze_group_access() to
2719    analyze groups of accesses.  */
2720 
2721 static bool
2722 vect_analyze_data_ref_access (dr_vec_info *dr_info)
2723 {
2724   data_reference *dr = dr_info->dr;
2725   tree step = DR_STEP (dr);
2726   tree scalar_type = TREE_TYPE (DR_REF (dr));
2727   stmt_vec_info stmt_info = dr_info->stmt;
2728   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2729   class loop *loop = NULL;
2730 
2731   if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
2732     return true;
2733 
2734   if (loop_vinfo)
2735     loop = LOOP_VINFO_LOOP (loop_vinfo);
2736 
2737   if (loop_vinfo && !step)
2738     {
2739       if (dump_enabled_p ())
2740 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2741 	                 "bad data-ref access in loop\n");
2742       return false;
2743     }
2744 
2745   /* Allow loads with zero step in inner-loop vectorization.  */
2746   if (loop_vinfo && integer_zerop (step))
2747     {
2748       DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2749       if (!nested_in_vect_loop_p (loop, stmt_info))
2750 	return DR_IS_READ (dr);
2751       /* Allow references with zero step for outer loops marked
2752 	 with pragma omp simd only - it guarantees absence of
2753 	 loop-carried dependencies between inner loop iterations.  */
2754       if (loop->safelen < 2)
2755 	{
2756 	  if (dump_enabled_p ())
2757 	    dump_printf_loc (MSG_NOTE, vect_location,
2758 			     "zero step in inner loop of nest\n");
2759 	  return false;
2760 	}
2761     }
2762 
2763   if (loop && nested_in_vect_loop_p (loop, stmt_info))
2764     {
2765       /* Interleaved accesses are not yet supported within outer-loop
2766         vectorization for references in the inner-loop.  */
2767       DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2768 
2769       /* For the rest of the analysis we use the outer-loop step.  */
2770       step = STMT_VINFO_DR_STEP (stmt_info);
2771       if (integer_zerop (step))
2772 	{
2773 	  if (dump_enabled_p ())
2774 	    dump_printf_loc (MSG_NOTE, vect_location,
2775 	                     "zero step in outer loop.\n");
2776 	  return DR_IS_READ (dr);
2777 	}
2778     }
2779 
2780   /* Consecutive?  */
2781   if (TREE_CODE (step) == INTEGER_CST)
2782     {
2783       HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
2784       if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))
2785 	  || (dr_step < 0
2786 	      && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
2787 	{
2788 	  /* Mark that it is not interleaving.  */
2789 	  DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2790 	  return true;
2791 	}
2792     }
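  /* Illustration (assumed type): for a 4-byte int access, DR_STEP of 4 or
     -4 is treated as consecutive above, whereas DR_STEP of 8 falls through
     to the group/strided analysis below.  */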
2793 
2794   if (loop && nested_in_vect_loop_p (loop, stmt_info))
2795     {
2796       if (dump_enabled_p ())
2797 	dump_printf_loc (MSG_NOTE, vect_location,
2798 	                 "grouped access in outer loop.\n");
2799       return false;
2800     }
2801 
2802 
2803   /* Assume this is a DR handled by non-constant strided load case.  */
2804   if (TREE_CODE (step) != INTEGER_CST)
2805     return (STMT_VINFO_STRIDED_P (stmt_info)
2806 	    && (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
2807 		|| vect_analyze_group_access (dr_info)));
2808 
2809   /* Not a consecutive access - check if it is part of an interleaving group.  */
2810   return vect_analyze_group_access (dr_info);
2811 }
2812 
2813 /* Compare two data-references DRA and DRB to group them into chunks
2814    suitable for grouping.  */
2815 
2816 static int
2817 dr_group_sort_cmp (const void *dra_, const void *drb_)
2818 {
2819   data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_);
2820   data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_);
2821   int cmp;
2822 
2823   /* Stabilize sort.  */
2824   if (dra == drb)
2825     return 0;
2826 
2827   /* DRs in different loops never belong to the same group.  */
2828   loop_p loopa = gimple_bb (DR_STMT (dra))->loop_father;
2829   loop_p loopb = gimple_bb (DR_STMT (drb))->loop_father;
2830   if (loopa != loopb)
2831     return loopa->num < loopb->num ? -1 : 1;
2832 
2833   /* Ordering of DRs according to base.  */
2834   cmp = data_ref_compare_tree (DR_BASE_ADDRESS (dra),
2835 			       DR_BASE_ADDRESS (drb));
2836   if (cmp != 0)
2837     return cmp;
2838 
2839   /* And according to DR_OFFSET.  */
2840   cmp = data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
2841   if (cmp != 0)
2842     return cmp;
2843 
2844   /* Put reads before writes.  */
2845   if (DR_IS_READ (dra) != DR_IS_READ (drb))
2846     return DR_IS_READ (dra) ? -1 : 1;
2847 
2848   /* Then sort after access size.  */
2849   cmp = data_ref_compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2850 			       TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
2851   if (cmp != 0)
2852     return cmp;
2853 
2854   /* And after step.  */
2855   cmp = data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb));
2856   if (cmp != 0)
2857     return cmp;
2858 
2859   /* Then sort after DR_INIT.  In case of identical DRs sort after stmt UID.  */
2860   cmp = data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb));
2861   if (cmp == 0)
2862     return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
2863   return cmp;
2864 }
2865 
2866 /* If OP is the result of a conversion, return the unconverted value,
2867    otherwise return null.  */
2868 
2869 static tree
2870 strip_conversion (tree op)
2871 {
2872   if (TREE_CODE (op) != SSA_NAME)
2873     return NULL_TREE;
2874   gimple *stmt = SSA_NAME_DEF_STMT (op);
2875   if (!is_gimple_assign (stmt)
2876       || !CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (stmt)))
2877     return NULL_TREE;
2878   return gimple_assign_rhs1 (stmt);
2879 }
2880 
2881 /* Return true if vectorizable_* routines can handle statements STMT1_INFO
2882    and STMT2_INFO being in a single group.  When ALLOW_SLP_P, masked loads can
2883    be grouped in SLP mode.  */
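/* Illustrative GIMPLE (assumed, not taken from a real dump):

     .MASK_STORE (p_1, 32B, _7, val_3);   with _7 = (mask_t1) mask_5
     .MASK_STORE (q_2, 32B, _8, val_4);   with _8 = (mask_t2) mask_5

   Both masks strip back to mask_5 via strip_conversion, so the two stores
   may still be grouped even though their mask operands differ.  */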
2884 
2885 static bool
2886 can_group_stmts_p (stmt_vec_info stmt1_info, stmt_vec_info stmt2_info,
2887 		   bool allow_slp_p)
2888 {
2889   if (gimple_assign_single_p (stmt1_info->stmt))
2890     return gimple_assign_single_p (stmt2_info->stmt);
2891 
2892   gcall *call1 = dyn_cast <gcall *> (stmt1_info->stmt);
2893   if (call1 && gimple_call_internal_p (call1))
2894     {
2895       /* Check for two masked loads or two masked stores.  */
2896       gcall *call2 = dyn_cast <gcall *> (stmt2_info->stmt);
2897       if (!call2 || !gimple_call_internal_p (call2))
2898 	return false;
2899       internal_fn ifn = gimple_call_internal_fn (call1);
2900       if (ifn != IFN_MASK_LOAD && ifn != IFN_MASK_STORE)
2901 	return false;
2902       if (ifn != gimple_call_internal_fn (call2))
2903 	return false;
2904 
2905       /* Check that the masks are the same.  Cope with casts of masks,
2906 	 like those created by build_mask_conversion.  */
2907       tree mask1 = gimple_call_arg (call1, 2);
2908       tree mask2 = gimple_call_arg (call2, 2);
2909       if (!operand_equal_p (mask1, mask2, 0)
2910           && (ifn == IFN_MASK_STORE || !allow_slp_p))
2911 	{
2912 	  mask1 = strip_conversion (mask1);
2913 	  if (!mask1)
2914 	    return false;
2915 	  mask2 = strip_conversion (mask2);
2916 	  if (!mask2)
2917 	    return false;
2918 	  if (!operand_equal_p (mask1, mask2, 0))
2919 	    return false;
2920 	}
2921       return true;
2922     }
2923 
2924   return false;
2925 }
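
/* For instance, two IFN_MASK_LOAD calls whose masks are the same value,
   one of them possibly wrapped in a mask conversion created by
   build_mask_conversion, satisfy this check.  With ALLOW_SLP_P, load
   masks are allowed to differ altogether; such groups are later marked
   as usable for SLP vectorization only.  Masked stores always need the
   same (possibly converted) mask.  */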
2926 
2927 /* Function vect_analyze_data_ref_accesses.
2928 
2929    Analyze the access pattern of all the data references in the loop.
2930 
2931    FORNOW: the only access pattern that is considered vectorizable is a
2932 	   simple step 1 (consecutive) access.
2933 
2934    FORNOW: handle only arrays and pointer accesses.  */
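
/* For example, the four accesses a[4*i], a[4*i+1], a[4*i+2] and a[4*i+3]
   in one loop form a single interleaving chain: the first statement
   becomes the DR_GROUP_FIRST_ELEMENT of all four and the others are
   linked through DR_GROUP_NEXT_ELEMENT in DR_INIT order.  */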
2935 
2936 opt_result
2937 vect_analyze_data_ref_accesses (vec_info *vinfo)
2938 {
2939   unsigned int i;
2940   vec<data_reference_p> datarefs = vinfo->shared->datarefs;
2941   struct data_reference *dr;
2942 
2943   DUMP_VECT_SCOPE ("vect_analyze_data_ref_accesses");
2944 
2945   if (datarefs.is_empty ())
2946     return opt_result::success ();
2947 
2948   /* Sort the array of datarefs to make building the interleaving chains
2949      linear.  Don't modify the original vector's order; it is needed for
2950      determining what dependencies are reversed.  */
2951   vec<data_reference_p> datarefs_copy = datarefs.copy ();
2952   datarefs_copy.qsort (dr_group_sort_cmp);
2953   hash_set<stmt_vec_info> to_fixup;
2954 
2955   /* Build the interleaving chains.  */
2956   for (i = 0; i < datarefs_copy.length () - 1;)
2957     {
2958       data_reference_p dra = datarefs_copy[i];
2959       dr_vec_info *dr_info_a = vinfo->lookup_dr (dra);
2960       stmt_vec_info stmtinfo_a = dr_info_a->stmt;
2961       stmt_vec_info lastinfo = NULL;
2962       if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
2963 	  || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a))
2964 	{
2965 	  ++i;
2966 	  continue;
2967 	}
2968       for (i = i + 1; i < datarefs_copy.length (); ++i)
2969 	{
2970 	  data_reference_p drb = datarefs_copy[i];
2971 	  dr_vec_info *dr_info_b = vinfo->lookup_dr (drb);
2972 	  stmt_vec_info stmtinfo_b = dr_info_b->stmt;
2973 	  if (!STMT_VINFO_VECTORIZABLE (stmtinfo_b)
2974 	      || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
2975 	    break;
2976 
2977 	  /* ???  Imperfect sorting (non-compatible types, non-modulo
2978 	     accesses, same accesses) can lead to a group being artificially
2979 	     split here as we don't just skip over those.  If it really
2980 	     matters we can push those to a worklist and re-iterate
2981 	     over them.  Then we can just skip ahead to the next DR here.  */
2982 
2983 	  /* DRs in a different loop should not be put into the same
2984 	     interleaving group.  */
2985 	  if (gimple_bb (DR_STMT (dra))->loop_father
2986 	      != gimple_bb (DR_STMT (drb))->loop_father)
2987 	    break;
2988 
2989 	  /* Check that the data-refs have the same first location (except init)
2990 	     and they are both either store or load (not load and store,
2991 	     not masked loads or stores).  */
2992 	  if (DR_IS_READ (dra) != DR_IS_READ (drb)
2993 	      || data_ref_compare_tree (DR_BASE_ADDRESS (dra),
2994 					DR_BASE_ADDRESS (drb)) != 0
2995 	      || data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb)) != 0
2996 	      || !can_group_stmts_p (stmtinfo_a, stmtinfo_b, true))
2997 	    break;
2998 
2999 	  /* Check that the data-refs have the same constant size.  */
3000 	  tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
3001 	  tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
3002 	  if (!tree_fits_uhwi_p (sza)
3003 	      || !tree_fits_uhwi_p (szb)
3004 	      || !tree_int_cst_equal (sza, szb))
3005 	    break;
3006 
3007 	  /* Check that the data-refs have the same step.  */
3008 	  if (data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb)) != 0)
3009 	    break;
3010 
3011 	  /* Check the types are compatible.
3012 	     ???  We don't distinguish this during sorting.  */
3013 	  if (!types_compatible_p (TREE_TYPE (DR_REF (dra)),
3014 				   TREE_TYPE (DR_REF (drb))))
3015 	    break;
3016 
3017 	  /* Check that the DR_INITs are compile-time constants.  */
3018 	  if (TREE_CODE (DR_INIT (dra)) != INTEGER_CST
3019 	      || TREE_CODE (DR_INIT (drb)) != INTEGER_CST)
3020 	    break;
3021 
3022 	  /* Different .GOMP_SIMD_LANE calls still give the same lane,
3023 	     just hold extra information.  */
3024 	  if (STMT_VINFO_SIMD_LANE_ACCESS_P (stmtinfo_a)
3025 	      && STMT_VINFO_SIMD_LANE_ACCESS_P (stmtinfo_b)
3026 	      && data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb)) == 0)
3027 	    break;
3028 
3029 	  /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb).  */
3030 	  HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
3031 	  HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
3032 	  HOST_WIDE_INT init_prev
3033 	    = TREE_INT_CST_LOW (DR_INIT (datarefs_copy[i-1]));
3034 	  gcc_assert (init_a <= init_b
3035 		      && init_a <= init_prev
3036 		      && init_prev <= init_b);
3037 
3038 	  /* Do not place the same access in the interleaving chain twice.  */
3039 	  if (init_b == init_prev)
3040 	    {
3041 	      gcc_assert (gimple_uid (DR_STMT (datarefs_copy[i-1]))
3042 			  < gimple_uid (DR_STMT (drb)));
3043 	      /* Simply link in duplicates and fix up the chain below.  */
3044 	    }
3045 	  else
3046 	    {
3047 	      /* If init_b == init_a + the size of the type * k, we have an
3048 		 interleaving, and DRA is accessed before DRB.  */
3049 	      HOST_WIDE_INT type_size_a = tree_to_uhwi (sza);
3050 	      if (type_size_a == 0
3051 		  || (init_b - init_a) % type_size_a != 0)
3052 		break;
3053 
3054 	      /* If we have a store, the accesses are adjacent.  This splits
3055 		 groups into chunks we support (we don't support vectorization
3056 		 of stores with gaps).  */
3057 	      if (!DR_IS_READ (dra) && init_b - init_prev != type_size_a)
3058 		break;
3059 
3060 	      /* If the step is a known nonzero constant that does not exceed
3061 		 the difference between the data-refs' inits, start a new
3062 		 group here; this splits groups into suitable sizes.  */
3063 	      if (tree_fits_shwi_p (DR_STEP (dra)))
3064 		{
3065 		  HOST_WIDE_INT step = tree_to_shwi (DR_STEP (dra));
3066 		  if (step != 0 && step <= (init_b - init_a))
3067 		    break;
3068 		}
3069 	    }
3070 
3071 	  if (dump_enabled_p ())
3072 	    dump_printf_loc (MSG_NOTE, vect_location,
3073 			     DR_IS_READ (dra)
3074 			     ? "Detected interleaving load %T and %T\n"
3075 			     : "Detected interleaving store %T and %T\n",
3076 			     DR_REF (dra), DR_REF (drb));
3077 
3078 	  /* Link the found element into the group list.  */
3079 	  if (!DR_GROUP_FIRST_ELEMENT (stmtinfo_a))
3080 	    {
3081 	      DR_GROUP_FIRST_ELEMENT (stmtinfo_a) = stmtinfo_a;
3082 	      lastinfo = stmtinfo_a;
3083 	    }
3084 	  DR_GROUP_FIRST_ELEMENT (stmtinfo_b) = stmtinfo_a;
3085 	  DR_GROUP_NEXT_ELEMENT (lastinfo) = stmtinfo_b;
3086 	  lastinfo = stmtinfo_b;
3087 
3088 	  STMT_VINFO_SLP_VECT_ONLY (stmtinfo_a)
3089 	    = !can_group_stmts_p (stmtinfo_a, stmtinfo_b, false);
3090 
3091 	  if (dump_enabled_p () && STMT_VINFO_SLP_VECT_ONLY (stmtinfo_a))
3092 	    dump_printf_loc (MSG_NOTE, vect_location,
3093 			     "Load suitable for SLP vectorization only.\n");
3094 
3095 	  if (init_b == init_prev
3096 	      && !to_fixup.add (DR_GROUP_FIRST_ELEMENT (stmtinfo_a))
3097 	      && dump_enabled_p ())
3098 	    dump_printf_loc (MSG_NOTE, vect_location,
3099 			     "Queuing group with duplicate access for fixup\n");
3100 	}
3101     }
3102 
3103   /* Fix up groups with duplicate entries by splitting them.  */
3104   while (1)
3105     {
3106       hash_set<stmt_vec_info>::iterator it = to_fixup.begin ();
3107       if (!(it != to_fixup.end ()))
3108 	break;
3109       stmt_vec_info grp = *it;
3110       to_fixup.remove (grp);
3111 
3112       /* Find the earliest duplicate group member.  */
3113       unsigned first_duplicate = -1u;
3114       stmt_vec_info next, g = grp;
3115       while ((next = DR_GROUP_NEXT_ELEMENT (g)))
3116 	{
3117 	  if (tree_int_cst_equal (DR_INIT (STMT_VINFO_DR_INFO (next)->dr),
3118 				  DR_INIT (STMT_VINFO_DR_INFO (g)->dr))
3119 	      && gimple_uid (STMT_VINFO_STMT (next)) < first_duplicate)
3120 	    first_duplicate = gimple_uid (STMT_VINFO_STMT (next));
3121 	  g = next;
3122 	}
3123       if (first_duplicate == -1U)
3124 	continue;
3125 
3126       /* Then move all stmts after the first duplicate to a new group.
3127          Note this is a heuristic, but one with the property that GRP
3128 	 itself is fixed up completely.  */
3129       g = grp;
3130       stmt_vec_info newgroup = NULL, ng = grp;
3131       while ((next = DR_GROUP_NEXT_ELEMENT (g)))
3132 	{
3133 	  if (gimple_uid (STMT_VINFO_STMT (next)) >= first_duplicate)
3134 	    {
3135 	      DR_GROUP_NEXT_ELEMENT (g) = DR_GROUP_NEXT_ELEMENT (next);
3136 	      if (!newgroup)
3137 		newgroup = next;
3138 	      else
3139 		DR_GROUP_NEXT_ELEMENT (ng) = next;
3140 	      ng = next;
3141 	      DR_GROUP_FIRST_ELEMENT (ng) = newgroup;
3142 	    }
3143 	  else
3144 	    g = DR_GROUP_NEXT_ELEMENT (g);
3145 	}
3146       DR_GROUP_NEXT_ELEMENT (ng) = NULL;
3147 
3148       /* Fix up the new group, which may still contain duplicates.  */
3149       to_fixup.add (newgroup);
3150     }
3151 
3152   FOR_EACH_VEC_ELT (datarefs_copy, i, dr)
3153     {
3154       dr_vec_info *dr_info = vinfo->lookup_dr (dr);
3155       if (STMT_VINFO_VECTORIZABLE (dr_info->stmt)
3156 	  && !vect_analyze_data_ref_access (dr_info))
3157 	{
3158 	  if (dump_enabled_p ())
3159 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3160 			     "not vectorized: complicated access pattern.\n");
3161 
3162 	  if (is_a <bb_vec_info> (vinfo))
3163 	    {
3164 	      /* Mark the statement as not vectorizable.  */
3165 	      STMT_VINFO_VECTORIZABLE (dr_info->stmt) = false;
3166 	      continue;
3167 	    }
3168 	  else
3169 	    {
3170 	      datarefs_copy.release ();
3171 	      return opt_result::failure_at (dr_info->stmt->stmt,
3172 					     "not vectorized:"
3173 					     " complicated access pattern.\n");
3174 	    }
3175 	}
3176     }
3177 
3178   datarefs_copy.release ();
3179   return opt_result::success ();
3180 }
3181 
3182 /* Function vect_vfa_segment_size.
3183 
3184    Input:
3185      DR_INFO: The data reference.
3186      LENGTH_FACTOR: segment length to consider.
3187 
3188    Return a value suitable for the dr_with_seg_len::seg_len field.
3189    This is the "distance travelled" by the pointer from the first
3190    iteration in the segment to the last.  Note that it does not include
3191    the size of the access; in effect it only describes the first byte.  */
3192 
3193 static tree
3194 vect_vfa_segment_size (dr_vec_info *dr_info, tree length_factor)
3195 {
3196   length_factor = size_binop (MINUS_EXPR,
3197 			      fold_convert (sizetype, length_factor),
3198 			      size_one_node);
3199   return size_binop (MULT_EXPR, fold_convert (sizetype, DR_STEP (dr_info->dr)),
3200 		     length_factor);
3201 }
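
/* For example, with DR_STEP equal to 4 bytes and LENGTH_FACTOR equal to 8,
   the segment length is 4 * (8 - 1) = 28 bytes: the distance travelled by
   the pointer between the first and the last iteration of the segment.
   The size of the access itself is accounted for separately by
   vect_vfa_access_size.  */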
3202 
3203 /* Return a value that, when added to abs (vect_vfa_segment_size (DR_INFO)),
3204    gives the worst-case number of bytes covered by the segment.  */
3205 
3206 static unsigned HOST_WIDE_INT
3207 vect_vfa_access_size (dr_vec_info *dr_info)
3208 {
3209   stmt_vec_info stmt_vinfo = dr_info->stmt;
3210   tree ref_type = TREE_TYPE (DR_REF (dr_info->dr));
3211   unsigned HOST_WIDE_INT ref_size = tree_to_uhwi (TYPE_SIZE_UNIT (ref_type));
3212   unsigned HOST_WIDE_INT access_size = ref_size;
3213   if (DR_GROUP_FIRST_ELEMENT (stmt_vinfo))
3214     {
3215       gcc_assert (DR_GROUP_FIRST_ELEMENT (stmt_vinfo) == stmt_vinfo);
3216       access_size *= DR_GROUP_SIZE (stmt_vinfo) - DR_GROUP_GAP (stmt_vinfo);
3217     }
3218   if (STMT_VINFO_VEC_STMT (stmt_vinfo)
3219       && (vect_supportable_dr_alignment (dr_info, false)
3220 	  == dr_explicit_realign_optimized))
3221     {
3222       /* We might access a full vector's worth.  */
3223       tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
3224       access_size += tree_to_uhwi (TYPE_SIZE_UNIT (vectype)) - ref_size;
3225     }
3226   return access_size;
3227 }
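
/* For example, a 4-byte scalar access that is the first element of a group
   of size 8 with no gap yields an access size of 4 * 8 = 32 bytes; if the
   optimized explicit-realignment scheme is used with 16-byte vectors,
   another 16 - 4 = 12 bytes are added for the full vector that may be
   touched.  */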
3228 
3229 /* Get the minimum alignment for all the scalar accesses that DR_INFO
3230    describes.  */
3231 
3232 static unsigned int
3233 vect_vfa_align (dr_vec_info *dr_info)
3234 {
3235   return dr_alignment (dr_info->dr);
3236 }
3237 
3238 /* Function vect_no_alias_p.
3239 
3240    Given data references A and B with equal base and offset, see whether
3241    the alias relation can be decided at compilation time.  Return 1 if
3242    it can and the references alias, 0 if it can and the references do
3243    not alias, and -1 if we cannot decide at compile time.  SEGMENT_LENGTH_A,
3244    SEGMENT_LENGTH_B, ACCESS_SIZE_A and ACCESS_SIZE_B are the equivalent
3245    of dr_with_seg_len::{seg_len,access_size} for A and B.  */
3246 
3247 static int
3248 vect_compile_time_alias (dr_vec_info *a, dr_vec_info *b,
3249 			 tree segment_length_a, tree segment_length_b,
3250 			 unsigned HOST_WIDE_INT access_size_a,
3251 			 unsigned HOST_WIDE_INT access_size_b)
3252 {
3253   poly_offset_int offset_a = wi::to_poly_offset (DR_INIT (a->dr));
3254   poly_offset_int offset_b = wi::to_poly_offset (DR_INIT (b->dr));
3255   poly_uint64 const_length_a;
3256   poly_uint64 const_length_b;
3257 
3258   /* For negative step, we need to adjust address range by TYPE_SIZE_UNIT
3259      bytes, e.g., int a[3] -> a[1] range is [a+4, a+16) instead of
3260      [a, a+12) */
3261   if (tree_int_cst_compare (DR_STEP (a->dr), size_zero_node) < 0)
3262     {
3263       const_length_a = (-wi::to_poly_wide (segment_length_a)).force_uhwi ();
3264       offset_a -= const_length_a;
3265     }
3266   else
3267     const_length_a = tree_to_poly_uint64 (segment_length_a);
3268   if (tree_int_cst_compare (DR_STEP (b->dr), size_zero_node) < 0)
3269     {
3270       const_length_b = (-wi::to_poly_wide (segment_length_b)).force_uhwi ();
3271       offset_b -= const_length_b;
3272     }
3273   else
3274     const_length_b = tree_to_poly_uint64 (segment_length_b);
3275 
3276   const_length_a += access_size_a;
3277   const_length_b += access_size_b;
3278 
3279   if (ranges_known_overlap_p (offset_a, const_length_a,
3280 			      offset_b, const_length_b))
3281     return 1;
3282 
3283   if (!ranges_maybe_overlap_p (offset_a, const_length_a,
3284 			       offset_b, const_length_b))
3285     return 0;
3286 
3287   return -1;
3288 }
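
/* As a worked example: two accesses with DR_INITs 0 and 32, segment
   lengths of 16 bytes and access sizes of 4 bytes cover the ranges
   [0, 20) and [32, 52), so the function returns 0 (no alias).  If the
   second DR_INIT were 16 instead, the ranges [0, 20) and [16, 36) would
   be known to overlap and the function would return 1.  */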
3289 
3290 /* Return true if the minimum nonzero dependence distance for loop LOOP_DEPTH
3291    in DDR is >= VF.  */
3292 
3293 static bool
3294 dependence_distance_ge_vf (data_dependence_relation *ddr,
3295 			   unsigned int loop_depth, poly_uint64 vf)
3296 {
3297   if (DDR_ARE_DEPENDENT (ddr) != NULL_TREE
3298       || DDR_NUM_DIST_VECTS (ddr) == 0)
3299     return false;
3300 
3301   /* If the dependence is exact, we should have limited the VF instead.  */
3302   gcc_checking_assert (DDR_COULD_BE_INDEPENDENT_P (ddr));
3303 
3304   unsigned int i;
3305   lambda_vector dist_v;
3306   FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
3307     {
3308       HOST_WIDE_INT dist = dist_v[loop_depth];
3309       if (dist != 0
3310 	  && !(dist > 0 && DDR_REVERSED_P (ddr))
3311 	  && maybe_lt ((unsigned HOST_WIDE_INT) abs_hwi (dist), vf))
3312 	return false;
3313     }
3314 
3315   if (dump_enabled_p ())
3316     dump_printf_loc (MSG_NOTE, vect_location,
3317 		     "dependence distance between %T and %T is >= VF\n",
3318 		     DR_REF (DDR_A (ddr)), DR_REF (DDR_B (ddr)));
3319 
3320   return true;
3321 }
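
/* For example, a single distance vector whose entry at LOOP_DEPTH is 8
   satisfies this test for any VF of at most 8, whereas, for a dependence
   that is not reversed, a distance of 2 with a VF of 4 does not.  */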
3322 
3323 /* Dump LOWER_BOUND using flags DUMP_KIND.  Dumps are known to be enabled.  */
3324 
3325 static void
3326 dump_lower_bound (dump_flags_t dump_kind, const vec_lower_bound &lower_bound)
3327 {
3328   dump_printf (dump_kind, "%s (%T) >= ",
3329 	       lower_bound.unsigned_p ? "unsigned" : "abs",
3330 	       lower_bound.expr);
3331   dump_dec (dump_kind, lower_bound.min_value);
3332 }
3333 
3334 /* Record that the vectorized loop requires the vec_lower_bound described
3335    by EXPR, UNSIGNED_P and MIN_VALUE.  */
3336 
3337 static void
3338 vect_check_lower_bound (loop_vec_info loop_vinfo, tree expr, bool unsigned_p,
3339 			poly_uint64 min_value)
3340 {
3341   vec<vec_lower_bound> lower_bounds = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo);
3342   for (unsigned int i = 0; i < lower_bounds.length (); ++i)
3343     if (operand_equal_p (lower_bounds[i].expr, expr, 0))
3344       {
3345 	unsigned_p &= lower_bounds[i].unsigned_p;
3346 	min_value = upper_bound (lower_bounds[i].min_value, min_value);
3347 	if (lower_bounds[i].unsigned_p != unsigned_p
3348 	    || maybe_lt (lower_bounds[i].min_value, min_value))
3349 	  {
3350 	    lower_bounds[i].unsigned_p = unsigned_p;
3351 	    lower_bounds[i].min_value = min_value;
3352 	    if (dump_enabled_p ())
3353 	      {
3354 		dump_printf_loc (MSG_NOTE, vect_location,
3355 				 "updating run-time check to ");
3356 		dump_lower_bound (MSG_NOTE, lower_bounds[i]);
3357 		dump_printf (MSG_NOTE, "\n");
3358 	      }
3359 	  }
3360 	return;
3361       }
3362 
3363   vec_lower_bound lower_bound (expr, unsigned_p, min_value);
3364   if (dump_enabled_p ())
3365     {
3366       dump_printf_loc (MSG_NOTE, vect_location, "need a run-time check that ");
3367       dump_lower_bound (MSG_NOTE, lower_bound);
3368       dump_printf (MSG_NOTE, "\n");
3369     }
3370   LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).safe_push (lower_bound);
3371 }
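
/* For example, calling this with EXPR being some dataref's DR_STEP,
   UNSIGNED_P false and MIN_VALUE 16 records that the vectorized loop must
   be versioned on a run-time check that abs (step) >= 16; a second call
   for the same expression can only raise the recorded minimum value.  */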
3372 
3373 /* Return true if it's unlikely that the step of the vectorized form of DR_INFO
3374    will span fewer than GAP bytes.  */
3375 
3376 static bool
3377 vect_small_gap_p (loop_vec_info loop_vinfo, dr_vec_info *dr_info,
3378 		  poly_int64 gap)
3379 {
3380   stmt_vec_info stmt_info = dr_info->stmt;
3381   HOST_WIDE_INT count
3382     = estimated_poly_value (LOOP_VINFO_VECT_FACTOR (loop_vinfo));
3383   if (DR_GROUP_FIRST_ELEMENT (stmt_info))
3384     count *= DR_GROUP_SIZE (DR_GROUP_FIRST_ELEMENT (stmt_info));
3385   return (estimated_poly_value (gap)
3386 	  <= count * vect_get_scalar_dr_size (dr_info));
3387 }
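
/* For example, with an estimated vectorization factor of 4 and 8-byte
   scalar accesses, any gap of up to 4 * 8 = 32 bytes counts as small
   (proportionally more when the access is part of an interleaving
   group).  */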
3388 
3389 /* Return true if we know that there is no alias between DR_INFO_A and
3390    DR_INFO_B when abs (DR_STEP (DR_INFO_A->dr)) >= N for some N.
3391    When returning true, set *LOWER_BOUND_OUT to this N.  */
3392 
3393 static bool
3394 vectorizable_with_step_bound_p (dr_vec_info *dr_info_a, dr_vec_info *dr_info_b,
3395 				poly_uint64 *lower_bound_out)
3396 {
3397   /* Check that there is a constant gap of known sign between DR_A
3398      and DR_B.  */
3399   data_reference *dr_a = dr_info_a->dr;
3400   data_reference *dr_b = dr_info_b->dr;
3401   poly_int64 init_a, init_b;
3402   if (!operand_equal_p (DR_BASE_ADDRESS (dr_a), DR_BASE_ADDRESS (dr_b), 0)
3403       || !operand_equal_p (DR_OFFSET (dr_a), DR_OFFSET (dr_b), 0)
3404       || !operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0)
3405       || !poly_int_tree_p (DR_INIT (dr_a), &init_a)
3406       || !poly_int_tree_p (DR_INIT (dr_b), &init_b)
3407       || !ordered_p (init_a, init_b))
3408     return false;
3409 
3410   /* Sort DR_A and DR_B by the address they access.  */
3411   if (maybe_lt (init_b, init_a))
3412     {
3413       std::swap (init_a, init_b);
3414       std::swap (dr_info_a, dr_info_b);
3415       std::swap (dr_a, dr_b);
3416     }
3417 
3418   /* If the two accesses could be dependent within a scalar iteration,
3419      make sure that we'd retain their order.  */
3420   if (maybe_gt (init_a + vect_get_scalar_dr_size (dr_info_a), init_b)
3421       && !vect_preserves_scalar_order_p (dr_info_a, dr_info_b))
3422     return false;
3423 
3424   /* There is no alias if abs (DR_STEP) is greater than or equal to
3425      the bytes spanned by the combination of the two accesses.  */
3426   *lower_bound_out = init_b + vect_get_scalar_dr_size (dr_info_b) - init_a;
3427   return true;
3428 }
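
/* For example, for two 4-byte accesses with equal base, offset and step
   and DR_INITs 0 and 12, the accesses cannot alias once
   abs (DR_STEP) >= 12 + 4 = 16, so *LOWER_BOUND_OUT is set to 16.  */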
3429 
3430 /* Function vect_prune_runtime_alias_test_list.
3431 
3432    Prune a list of ddrs to be tested at run-time by versioning for alias.
3433    Merge several alias checks into one if possible.
3434    Return FALSE if resulting list of ddrs is longer then allowed by
3435    PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE.  */
3436 
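
/* For instance, in the hypothetical loop

     for (i = 0; i < n; i++)
       a[i] = b[i] + 1;

   with A and B being incoming pointers of unknown provenance, the
   store/load dependence cannot be decided at compile time; the pair is
   kept here (unless it can be proven independent or folded into a cheaper
   lower-bound check) and the loop is versioned with a run-time test that
   the two accessed segments do not overlap.  */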
3437 opt_result
3438 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
3439 {
3440   typedef pair_hash <tree_operand_hash, tree_operand_hash> tree_pair_hash;
3441   hash_set <tree_pair_hash> compared_objects;
3442 
3443   vec<ddr_p> may_alias_ddrs = LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
3444   vec<dr_with_seg_len_pair_t> &comp_alias_ddrs
3445     = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
3446   vec<vec_object_pair> &check_unequal_addrs
3447     = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo);
3448   poly_uint64 vect_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3449   tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
3450 
3451   ddr_p ddr;
3452   unsigned int i;
3453   tree length_factor;
3454 
3455   DUMP_VECT_SCOPE ("vect_prune_runtime_alias_test_list");
3456 
3457   /* Step values are irrelevant for aliasing if the number of vector
3458      iterations is equal to the number of scalar iterations (which can
3459      happen for fully-SLP loops).  */
3460   bool vf_one_p = known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U);
3461 
3462   if (!vf_one_p)
3463     {
3464       /* Convert the checks for nonzero steps into bound tests.  */
3465       tree value;
3466       FOR_EACH_VEC_ELT (LOOP_VINFO_CHECK_NONZERO (loop_vinfo), i, value)
3467 	vect_check_lower_bound (loop_vinfo, value, true, 1);
3468     }
3469 
3470   if (may_alias_ddrs.is_empty ())
3471     return opt_result::success ();
3472 
3473   comp_alias_ddrs.create (may_alias_ddrs.length ());
3474 
3475   unsigned int loop_depth
3476     = index_in_loop_nest (LOOP_VINFO_LOOP (loop_vinfo)->num,
3477 			  LOOP_VINFO_LOOP_NEST (loop_vinfo));
3478 
3479   /* First, we collect all data ref pairs for aliasing checks.  */
3480   FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr)
3481     {
3482       poly_uint64 lower_bound;
3483       tree segment_length_a, segment_length_b;
3484       unsigned HOST_WIDE_INT access_size_a, access_size_b;
3485       unsigned int align_a, align_b;
3486 
3487       /* Ignore the alias if the VF we chose ended up being no greater
3488 	 than the dependence distance.  */
3489       if (dependence_distance_ge_vf (ddr, loop_depth, vect_factor))
3490 	continue;
3491 
3492       if (DDR_OBJECT_A (ddr))
3493 	{
3494 	  vec_object_pair new_pair (DDR_OBJECT_A (ddr), DDR_OBJECT_B (ddr));
3495 	  if (!compared_objects.add (new_pair))
3496 	    {
3497 	      if (dump_enabled_p ())
3498 		dump_printf_loc (MSG_NOTE, vect_location,
3499 				 "checking that %T and %T"
3500 				 " have different addresses\n",
3501 				 new_pair.first, new_pair.second);
3502 	      LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).safe_push (new_pair);
3503 	    }
3504 	  continue;
3505 	}
3506 
3507       dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (DDR_A (ddr));
3508       stmt_vec_info stmt_info_a = dr_info_a->stmt;
3509 
3510       dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (DDR_B (ddr));
3511       stmt_vec_info stmt_info_b = dr_info_b->stmt;
3512 
3513       bool preserves_scalar_order_p
3514 	= vect_preserves_scalar_order_p (dr_info_a, dr_info_b);
3515       bool ignore_step_p
3516 	  = (vf_one_p
3517 	     && (preserves_scalar_order_p
3518 		 || operand_equal_p (DR_STEP (dr_info_a->dr),
3519 				     DR_STEP (dr_info_b->dr))));
3520 
3521       /* Skip the pair if inter-iteration dependencies are irrelevant
3522 	 and intra-iteration dependencies are guaranteed to be honored.  */
3523       if (ignore_step_p
3524 	  && (preserves_scalar_order_p
3525 	      || vectorizable_with_step_bound_p (dr_info_a, dr_info_b,
3526 						 &lower_bound)))
3527 	{
3528 	  if (dump_enabled_p ())
3529 	    dump_printf_loc (MSG_NOTE, vect_location,
3530 			     "no need for alias check between "
3531 			     "%T and %T when VF is 1\n",
3532 			     DR_REF (dr_info_a->dr), DR_REF (dr_info_b->dr));
3533 	  continue;
3534 	}
3535 
3536       /* See whether we can handle the alias using a bounds check on
3537 	 the step, and whether that's likely to be the best approach.
3538 	 (It might not be, for example, if the minimum step is much larger
3539 	 than the number of bytes handled by one vector iteration.)  */
3540       if (!ignore_step_p
3541 	  && TREE_CODE (DR_STEP (dr_info_a->dr)) != INTEGER_CST
3542 	  && vectorizable_with_step_bound_p (dr_info_a, dr_info_b,
3543 					     &lower_bound)
3544 	  && (vect_small_gap_p (loop_vinfo, dr_info_a, lower_bound)
3545 	      || vect_small_gap_p (loop_vinfo, dr_info_b, lower_bound)))
3546 	{
3547 	  bool unsigned_p = dr_known_forward_stride_p (dr_info_a->dr);
3548 	  if (dump_enabled_p ())
3549 	    {
3550 	      dump_printf_loc (MSG_NOTE, vect_location, "no alias between "
3551 			       "%T and %T when the step %T is outside ",
3552 			       DR_REF (dr_info_a->dr),
3553 			       DR_REF (dr_info_b->dr),
3554 			       DR_STEP (dr_info_a->dr));
3555 	      if (unsigned_p)
3556 		dump_printf (MSG_NOTE, "[0");
3557 	      else
3558 		{
3559 		  dump_printf (MSG_NOTE, "(");
3560 		  dump_dec (MSG_NOTE, poly_int64 (-lower_bound));
3561 		}
3562 	      dump_printf (MSG_NOTE, ", ");
3563 	      dump_dec (MSG_NOTE, lower_bound);
3564 	      dump_printf (MSG_NOTE, ")\n");
3565 	    }
3566 	  vect_check_lower_bound (loop_vinfo, DR_STEP (dr_info_a->dr),
3567 				  unsigned_p, lower_bound);
3568 	  continue;
3569 	}
3570 
3571       stmt_vec_info dr_group_first_a = DR_GROUP_FIRST_ELEMENT (stmt_info_a);
3572       if (dr_group_first_a)
3573 	{
3574 	  stmt_info_a = dr_group_first_a;
3575 	  dr_info_a = STMT_VINFO_DR_INFO (stmt_info_a);
3576 	}
3577 
3578       stmt_vec_info dr_group_first_b = DR_GROUP_FIRST_ELEMENT (stmt_info_b);
3579       if (dr_group_first_b)
3580 	{
3581 	  stmt_info_b = dr_group_first_b;
3582 	  dr_info_b = STMT_VINFO_DR_INFO (stmt_info_b);
3583 	}
3584 
3585       if (ignore_step_p)
3586 	{
3587 	  segment_length_a = size_zero_node;
3588 	  segment_length_b = size_zero_node;
3589 	}
3590       else
3591 	{
3592 	  if (!operand_equal_p (DR_STEP (dr_info_a->dr),
3593 				DR_STEP (dr_info_b->dr), 0))
3594 	    length_factor = scalar_loop_iters;
3595 	  else
3596 	    length_factor = size_int (vect_factor);
3597 	  segment_length_a = vect_vfa_segment_size (dr_info_a, length_factor);
3598 	  segment_length_b = vect_vfa_segment_size (dr_info_b, length_factor);
3599 	}
3600       access_size_a = vect_vfa_access_size (dr_info_a);
3601       access_size_b = vect_vfa_access_size (dr_info_b);
3602       align_a = vect_vfa_align (dr_info_a);
3603       align_b = vect_vfa_align (dr_info_b);
3604 
3605       /* See whether the alias is known at compilation time.  */
3606       if (operand_equal_p (DR_BASE_ADDRESS (dr_info_a->dr),
3607 			   DR_BASE_ADDRESS (dr_info_b->dr), 0)
3608 	  && operand_equal_p (DR_OFFSET (dr_info_a->dr),
3609 			      DR_OFFSET (dr_info_b->dr), 0)
3610 	  && TREE_CODE (DR_STEP (dr_info_a->dr)) == INTEGER_CST
3611 	  && TREE_CODE (DR_STEP (dr_info_b->dr)) == INTEGER_CST
3612 	  && poly_int_tree_p (segment_length_a)
3613 	  && poly_int_tree_p (segment_length_b))
3614 	{
3615 	  int res = vect_compile_time_alias (dr_info_a, dr_info_b,
3616 					     segment_length_a,
3617 					     segment_length_b,
3618 					     access_size_a,
3619 					     access_size_b);
3620 	  if (res >= 0 && dump_enabled_p ())
3621 	    {
3622 	      dump_printf_loc (MSG_NOTE, vect_location,
3623 			       "can tell at compile time that %T and %T",
3624 			       DR_REF (dr_info_a->dr), DR_REF (dr_info_b->dr));
3625 	      if (res == 0)
3626 		dump_printf (MSG_NOTE, " do not alias\n");
3627 	      else
3628 		dump_printf (MSG_NOTE, " alias\n");
3629 	    }
3630 
3631 	  if (res == 0)
3632 	    continue;
3633 
3634 	  if (res == 1)
3635 	    return opt_result::failure_at (stmt_info_b->stmt,
3636 					   "not vectorized:"
3637 					   " compilation time alias: %G%G",
3638 					   stmt_info_a->stmt,
3639 					   stmt_info_b->stmt);
3640 	}
3641 
3642       dr_with_seg_len dr_a (dr_info_a->dr, segment_length_a,
3643 			    access_size_a, align_a);
3644       dr_with_seg_len dr_b (dr_info_b->dr, segment_length_b,
3645 			    access_size_b, align_b);
3646       /* Canonicalize the order to be the one that's needed for accurate
3647 	 RAW, WAR and WAW flags, in cases where the data references are
3648 	 well-ordered.  The order doesn't really matter otherwise,
3649 	 but we might as well be consistent.  */
3650       if (get_later_stmt (stmt_info_a, stmt_info_b) == stmt_info_a)
3651 	std::swap (dr_a, dr_b);
3652 
3653       dr_with_seg_len_pair_t dr_with_seg_len_pair
3654 	(dr_a, dr_b, (preserves_scalar_order_p
3655 		      ? dr_with_seg_len_pair_t::WELL_ORDERED
3656 		      : dr_with_seg_len_pair_t::REORDERED));
3657 
3658       comp_alias_ddrs.safe_push (dr_with_seg_len_pair);
3659     }
3660 
3661   prune_runtime_alias_test_list (&comp_alias_ddrs, vect_factor);
3662 
3663   unsigned int count = (comp_alias_ddrs.length ()
3664 			+ check_unequal_addrs.length ());
3665 
3666   if (dump_enabled_p ())
3667     dump_printf_loc (MSG_NOTE, vect_location,
3668 		     "improved number of alias checks from %d to %d\n",
3669 		     may_alias_ddrs.length (), count);
3670   unsigned limit = param_vect_max_version_for_alias_checks;
3671   if (flag_simd_cost_model == VECT_COST_MODEL_CHEAP)
3672     limit = param_vect_max_version_for_alias_checks * 6 / 10;
3673   if (count > limit)
3674     return opt_result::failure_at
3675       (vect_location,
3676        "number of versioning for alias run-time tests exceeds %d "
3677        "(--param vect-max-version-for-alias-checks)\n", limit);
3678 
3679   return opt_result::success ();
3680 }
3681 
3682 /* Check whether we can use an internal function for a gather load
3683    or scatter store.  READ_P is true for loads and false for stores.
3684    MASKED_P is true if the load or store is conditional.  MEMORY_TYPE is
3685    the type of the memory elements being loaded or stored.  OFFSET_TYPE
3686    is the type of the offset that is being applied to the invariant
3687    base address.  SCALE is the amount by which the offset should
3688    be multiplied *after* it has been converted to address width.
3689 
3690    Return true if the function is supported, storing the function id in
3691    *IFN_OUT and the vector type for the offset in *OFFSET_VECTYPE_OUT.  */
3692 
3693 bool
3694 vect_gather_scatter_fn_p (vec_info *vinfo, bool read_p, bool masked_p,
3695 			  tree vectype, tree memory_type, tree offset_type,
3696 			  int scale, internal_fn *ifn_out,
3697 			  tree *offset_vectype_out)
3698 {
3699   unsigned int memory_bits = tree_to_uhwi (TYPE_SIZE (memory_type));
3700   unsigned int element_bits = tree_to_uhwi (TYPE_SIZE (TREE_TYPE (vectype)));
3701   if (element_bits != memory_bits)
3702     /* For now the vector elements must be the same width as the
3703        memory elements.  */
3704     return false;
3705 
3706   /* Work out which function we need.  */
3707   internal_fn ifn;
3708   if (read_p)
3709     ifn = masked_p ? IFN_MASK_GATHER_LOAD : IFN_GATHER_LOAD;
3710   else
3711     ifn = masked_p ? IFN_MASK_SCATTER_STORE : IFN_SCATTER_STORE;
3712 
3713   for (;;)
3714     {
3715       tree offset_vectype = get_vectype_for_scalar_type (vinfo, offset_type);
3716       if (!offset_vectype)
3717 	return false;
3718 
3719       /* Test whether the target supports this combination.  */
3720       if (internal_gather_scatter_fn_supported_p (ifn, vectype, memory_type,
3721 						  offset_vectype, scale))
3722 	{
3723 	  *ifn_out = ifn;
3724 	  *offset_vectype_out = offset_vectype;
3725 	  return true;
3726 	}
3727 
3728       if (TYPE_PRECISION (offset_type) >= POINTER_SIZE
3729 	  && TYPE_PRECISION (offset_type) >= element_bits)
3730 	return false;
3731 
3732       offset_type = build_nonstandard_integer_type
3733 	(TYPE_PRECISION (offset_type) * 2, TYPE_UNSIGNED (offset_type));
3734     }
3735 }
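
/* For example, if the target only provides gather loads whose offsets are
   64 bits wide, a query with a 32-bit OFFSET_TYPE first fails the
   internal_gather_scatter_fn_supported_p test above and is then retried
   with a 64-bit offset type of the same signedness before giving up.  */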
3736 
3737 /* STMT_INFO is a call to an internal gather load or scatter store function.
3738    Describe the operation in INFO.  */
3739 
3740 static void
3741 vect_describe_gather_scatter_call (stmt_vec_info stmt_info,
3742 				   gather_scatter_info *info)
3743 {
3744   gcall *call = as_a <gcall *> (stmt_info->stmt);
3745   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3746   data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3747 
3748   info->ifn = gimple_call_internal_fn (call);
3749   info->decl = NULL_TREE;
3750   info->base = gimple_call_arg (call, 0);
3751   info->offset = gimple_call_arg (call, 1);
3752   info->offset_dt = vect_unknown_def_type;
3753   info->offset_vectype = NULL_TREE;
3754   info->scale = TREE_INT_CST_LOW (gimple_call_arg (call, 2));
3755   info->element_type = TREE_TYPE (vectype);
3756   info->memory_type = TREE_TYPE (DR_REF (dr));
3757 }
3758 
3759 /* Return true if a non-affine read or write in STMT_INFO is suitable for a
3760    gather load or scatter store.  Describe the operation in *INFO if so.  */
3761 
3762 bool
3763 vect_check_gather_scatter (stmt_vec_info stmt_info, loop_vec_info loop_vinfo,
3764 			   gather_scatter_info *info)
3765 {
3766   HOST_WIDE_INT scale = 1;
3767   poly_int64 pbitpos, pbitsize;
3768   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3769   struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3770   tree offtype = NULL_TREE;
3771   tree decl = NULL_TREE, base, off;
3772   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3773   tree memory_type = TREE_TYPE (DR_REF (dr));
3774   machine_mode pmode;
3775   int punsignedp, reversep, pvolatilep = 0;
3776   internal_fn ifn;
3777   tree offset_vectype;
3778   bool masked_p = false;
3779 
3780   /* See whether this is already a call to a gather/scatter internal function.
3781      If not, see whether it's a masked load or store.  */
3782   gcall *call = dyn_cast <gcall *> (stmt_info->stmt);
3783   if (call && gimple_call_internal_p (call))
3784     {
3785       ifn = gimple_call_internal_fn (call);
3786       if (internal_gather_scatter_fn_p (ifn))
3787 	{
3788 	  vect_describe_gather_scatter_call (stmt_info, info);
3789 	  return true;
3790 	}
3791       masked_p = (ifn == IFN_MASK_LOAD || ifn == IFN_MASK_STORE);
3792     }
3793 
3794   /* True if we should aim to use internal functions rather than
3795      built-in functions.  */
3796   bool use_ifn_p = (DR_IS_READ (dr)
3797 		    ? supports_vec_gather_load_p ()
3798 		    : supports_vec_scatter_store_p ());
3799 
3800   base = DR_REF (dr);
3801   /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF;
3802      see if we can use the def stmt of the address.  */
3803   if (masked_p
3804       && TREE_CODE (base) == MEM_REF
3805       && TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME
3806       && integer_zerop (TREE_OPERAND (base, 1))
3807       && !expr_invariant_in_loop_p (loop, TREE_OPERAND (base, 0)))
3808     {
3809       gimple *def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (base, 0));
3810       if (is_gimple_assign (def_stmt)
3811 	  && gimple_assign_rhs_code (def_stmt) == ADDR_EXPR)
3812 	base = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
3813     }
3814 
3815   /* The gather and scatter builtins need address of the form
3816      loop_invariant + vector * {1, 2, 4, 8}
3817      or
3818      loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
3819      Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
3820      of loop invariants/SSA_NAMEs defined in the loop, with casts,
3821      multiplications and additions in it.  To get a vector, we need
3822      a single SSA_NAME that will be defined in the loop and will
3823      contain everything that is not loop invariant and that can be
3824      vectorized.  The following code attempts to find such a preexisting
3825      SSA_NAME OFF and put the loop invariants into a tree BASE
3826      that can be gimplified before the loop.  */
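  /* As a hypothetical example, for a gather-like read a[k[i]] with 4-byte
     elements this typically ends with BASE being the invariant address of
     A, OFF being the SSA_NAME holding k[i] (after the multiplication by
     the element size has been peeled off into SCALE = 4), and OFFTYPE
     being the type of that SSA_NAME.  */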
3827   base = get_inner_reference (base, &pbitsize, &pbitpos, &off, &pmode,
3828 			      &punsignedp, &reversep, &pvolatilep);
3829   if (reversep)
3830     return false;
3831 
3832   poly_int64 pbytepos = exact_div (pbitpos, BITS_PER_UNIT);
3833 
3834   if (TREE_CODE (base) == MEM_REF)
3835     {
3836       if (!integer_zerop (TREE_OPERAND (base, 1)))
3837 	{
3838 	  if (off == NULL_TREE)
3839 	    off = wide_int_to_tree (sizetype, mem_ref_offset (base));
3840 	  else
3841 	    off = size_binop (PLUS_EXPR, off,
3842 			      fold_convert (sizetype, TREE_OPERAND (base, 1)));
3843 	}
3844       base = TREE_OPERAND (base, 0);
3845     }
3846   else
3847     base = build_fold_addr_expr (base);
3848 
3849   if (off == NULL_TREE)
3850     off = size_zero_node;
3851 
3852   /* If BASE is not loop invariant and OFF is 0, start with just the
3853      constant offset in the loop-invariant BASE and continue with BASE
3854      as OFF; otherwise give up.
3855      We could handle that case by gimplifying the addition of BASE + OFF
3856      into some SSA_NAME and using that as OFF, but for now punt.  */
3857   if (!expr_invariant_in_loop_p (loop, base))
3858     {
3859       if (!integer_zerop (off))
3860 	return false;
3861       off = base;
3862       base = size_int (pbytepos);
3863     }
3864   /* Otherwise put base + constant offset into the loop invariant BASE
3865      and continue with OFF.  */
3866   else
3867     {
3868       base = fold_convert (sizetype, base);
3869       base = size_binop (PLUS_EXPR, base, size_int (pbytepos));
3870     }
3871 
3872   /* OFF at this point may be either a SSA_NAME or some tree expression
3873      from get_inner_reference.  Try to peel off loop invariants from it
3874      into BASE as long as possible.  */
3875   STRIP_NOPS (off);
3876   while (offtype == NULL_TREE)
3877     {
3878       enum tree_code code;
3879       tree op0, op1, add = NULL_TREE;
3880 
3881       if (TREE_CODE (off) == SSA_NAME)
3882 	{
3883 	  gimple *def_stmt = SSA_NAME_DEF_STMT (off);
3884 
3885 	  if (expr_invariant_in_loop_p (loop, off))
3886 	    return false;
3887 
3888 	  if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
3889 	    break;
3890 
3891 	  op0 = gimple_assign_rhs1 (def_stmt);
3892 	  code = gimple_assign_rhs_code (def_stmt);
3893 	  op1 = gimple_assign_rhs2 (def_stmt);
3894 	}
3895       else
3896 	{
3897 	  if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
3898 	    return false;
3899 	  code = TREE_CODE (off);
3900 	  extract_ops_from_tree (off, &code, &op0, &op1);
3901 	}
3902       switch (code)
3903 	{
3904 	case POINTER_PLUS_EXPR:
3905 	case PLUS_EXPR:
3906 	  if (expr_invariant_in_loop_p (loop, op0))
3907 	    {
3908 	      add = op0;
3909 	      off = op1;
3910 	    do_add:
3911 	      add = fold_convert (sizetype, add);
3912 	      if (scale != 1)
3913 		add = size_binop (MULT_EXPR, add, size_int (scale));
3914 	      base = size_binop (PLUS_EXPR, base, add);
3915 	      continue;
3916 	    }
3917 	  if (expr_invariant_in_loop_p (loop, op1))
3918 	    {
3919 	      add = op1;
3920 	      off = op0;
3921 	      goto do_add;
3922 	    }
3923 	  break;
3924 	case MINUS_EXPR:
3925 	  if (expr_invariant_in_loop_p (loop, op1))
3926 	    {
3927 	      add = fold_convert (sizetype, op1);
3928 	      add = size_binop (MINUS_EXPR, size_zero_node, add);
3929 	      off = op0;
3930 	      goto do_add;
3931 	    }
3932 	  break;
3933 	case MULT_EXPR:
3934 	  if (scale == 1 && tree_fits_shwi_p (op1))
3935 	    {
3936 	      int new_scale = tree_to_shwi (op1);
3937 	      /* Only treat this as a scaling operation if the target
3938 		 supports it for at least some offset type.  */
3939 	      if (use_ifn_p
3940 		  && !vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr),
3941 						masked_p, vectype, memory_type,
3942 						signed_char_type_node,
3943 						new_scale, &ifn,
3944 						&offset_vectype)
3945 		  && !vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr),
3946 						masked_p, vectype, memory_type,
3947 						unsigned_char_type_node,
3948 						new_scale, &ifn,
3949 						&offset_vectype))
3950 		break;
3951 	      scale = new_scale;
3952 	      off = op0;
3953 	      continue;
3954 	    }
3955 	  break;
3956 	case SSA_NAME:
3957 	  off = op0;
3958 	  continue;
3959 	CASE_CONVERT:
3960 	  if (!POINTER_TYPE_P (TREE_TYPE (op0))
3961 	      && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
3962 	    break;
3963 
3964 	  /* Don't include the conversion if the target is happy with
3965 	     the current offset type.  */
3966 	  if (use_ifn_p
3967 	      && vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr),
3968 					   masked_p, vectype, memory_type,
3969 					   TREE_TYPE (off), scale, &ifn,
3970 					   &offset_vectype))
3971 	    break;
3972 
3973 	  if (TYPE_PRECISION (TREE_TYPE (op0))
3974 	      == TYPE_PRECISION (TREE_TYPE (off)))
3975 	    {
3976 	      off = op0;
3977 	      continue;
3978 	    }
3979 
3980 	  if (TYPE_PRECISION (TREE_TYPE (op0))
3981 	      < TYPE_PRECISION (TREE_TYPE (off)))
3982 	    {
3983 	      off = op0;
3984 	      offtype = TREE_TYPE (off);
3985 	      STRIP_NOPS (off);
3986 	      continue;
3987 	    }
3988 	  break;
3989 	default:
3990 	  break;
3991 	}
3992       break;
3993     }
3994 
3995   /* If at the end OFF still isn't a SSA_NAME or isn't
3996      defined in the loop, punt.  */
3997   if (TREE_CODE (off) != SSA_NAME
3998       || expr_invariant_in_loop_p (loop, off))
3999     return false;
4000 
4001   if (offtype == NULL_TREE)
4002     offtype = TREE_TYPE (off);
4003 
4004   if (use_ifn_p)
4005     {
4006       if (!vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr), masked_p,
4007 				     vectype, memory_type, offtype, scale,
4008 				     &ifn, &offset_vectype))
4009 	return false;
4010     }
4011   else
4012     {
4013       if (DR_IS_READ (dr))
4014 	{
4015 	  if (targetm.vectorize.builtin_gather)
4016 	    decl = targetm.vectorize.builtin_gather (vectype, offtype, scale);
4017 	}
4018       else
4019 	{
4020 	  if (targetm.vectorize.builtin_scatter)
4021 	    decl = targetm.vectorize.builtin_scatter (vectype, offtype, scale);
4022 	}
4023 
4024       if (!decl)
4025 	return false;
4026 
4027       ifn = IFN_LAST;
4028       /* The offset vector type will be read from DECL when needed.  */
4029       offset_vectype = NULL_TREE;
4030     }
4031 
4032   info->ifn = ifn;
4033   info->decl = decl;
4034   info->base = base;
4035   info->offset = off;
4036   info->offset_dt = vect_unknown_def_type;
4037   info->offset_vectype = offset_vectype;
4038   info->scale = scale;
4039   info->element_type = TREE_TYPE (vectype);
4040   info->memory_type = memory_type;
4041   return true;
4042 }
4043 
4044 /* Find the data references in STMT, analyze them with respect to LOOP and
4045    append them to DATAREFS.  Return false if datarefs in this stmt cannot
4046    be handled.  */
4047 
4048 opt_result
4049 vect_find_stmt_data_reference (loop_p loop, gimple *stmt,
4050 			       vec<data_reference_p> *datarefs)
4051 {
4052   /* We can ignore clobbers for dataref analysis - they are removed during
4053      loop vectorization and BB vectorization checks dependences with a
4054      stmt walk.  */
4055   if (gimple_clobber_p (stmt))
4056     return opt_result::success ();
4057 
4058   if (gimple_has_volatile_ops (stmt))
4059     return opt_result::failure_at (stmt, "not vectorized: volatile type: %G",
4060 				   stmt);
4061 
4062   if (stmt_can_throw_internal (cfun, stmt))
4063     return opt_result::failure_at (stmt,
4064 				   "not vectorized:"
4065 				   " statement can throw an exception: %G",
4066 				   stmt);
4067 
4068   auto_vec<data_reference_p, 2> refs;
4069   opt_result res = find_data_references_in_stmt (loop, stmt, &refs);
4070   if (!res)
4071     return res;
4072 
4073   if (refs.is_empty ())
4074     return opt_result::success ();
4075 
4076   if (refs.length () > 1)
4077     return opt_result::failure_at (stmt,
4078 				   "not vectorized:"
4079 				   " more than one data ref in stmt: %G", stmt);
4080 
4081   if (gcall *call = dyn_cast <gcall *> (stmt))
4082     if (!gimple_call_internal_p (call)
4083 	|| (gimple_call_internal_fn (call) != IFN_MASK_LOAD
4084 	    && gimple_call_internal_fn (call) != IFN_MASK_STORE))
4085       return opt_result::failure_at (stmt,
4086 				     "not vectorized: dr in a call %G", stmt);
4087 
4088   data_reference_p dr = refs.pop ();
4089   if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
4090       && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
4091     return opt_result::failure_at (stmt,
4092 				   "not vectorized:"
4093 				   " statement is bitfield access %G", stmt);
4094 
4095   if (DR_BASE_ADDRESS (dr)
4096       && TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST)
4097     return opt_result::failure_at (stmt,
4098 				   "not vectorized:"
4099 				   " base addr of dr is a constant\n");
4100 
4101   /* Check whether this may be a SIMD lane access and adjust the
4102      DR to make it easier for us to handle it.  */
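  /* Such accesses typically come from OpenMP simd privatization, where an
     array created for the private variable is indexed by the result of a
     .GOMP_SIMD_LANE (simduid) call; the code below rewrites the DR to a
     zero offset and a step equal to the element size and marks it through
     dr->aux so it can be recognized as a simd-lane access later on.  */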
4103   if (loop
4104       && loop->simduid
4105       && (!DR_BASE_ADDRESS (dr)
4106 	  || !DR_OFFSET (dr)
4107 	  || !DR_INIT (dr)
4108 	  || !DR_STEP (dr)))
4109     {
4110       struct data_reference *newdr
4111 	= create_data_ref (NULL, loop_containing_stmt (stmt), DR_REF (dr), stmt,
4112 			   DR_IS_READ (dr), DR_IS_CONDITIONAL_IN_STMT (dr));
4113       if (DR_BASE_ADDRESS (newdr)
4114 	  && DR_OFFSET (newdr)
4115 	  && DR_INIT (newdr)
4116 	  && DR_STEP (newdr)
4117 	  && TREE_CODE (DR_INIT (newdr)) == INTEGER_CST
4118 	  && integer_zerop (DR_STEP (newdr)))
4119 	{
4120 	  tree base_address = DR_BASE_ADDRESS (newdr);
4121 	  tree off = DR_OFFSET (newdr);
4122 	  tree step = ssize_int (1);
4123 	  if (integer_zerop (off)
4124 	      && TREE_CODE (base_address) == POINTER_PLUS_EXPR)
4125 	    {
4126 	      off = TREE_OPERAND (base_address, 1);
4127 	      base_address = TREE_OPERAND (base_address, 0);
4128 	    }
4129 	  STRIP_NOPS (off);
4130 	  if (TREE_CODE (off) == MULT_EXPR
4131 	      && tree_fits_uhwi_p (TREE_OPERAND (off, 1)))
4132 	    {
4133 	      step = TREE_OPERAND (off, 1);
4134 	      off = TREE_OPERAND (off, 0);
4135 	      STRIP_NOPS (off);
4136 	    }
4137 	  if (CONVERT_EXPR_P (off)
4138 	      && (TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off, 0)))
4139 		  < TYPE_PRECISION (TREE_TYPE (off))))
4140 	    off = TREE_OPERAND (off, 0);
4141 	  if (TREE_CODE (off) == SSA_NAME)
4142 	    {
4143 	      gimple *def = SSA_NAME_DEF_STMT (off);
4144 	      /* Look through widening conversion.  */
4145 	      if (is_gimple_assign (def)
4146 		  && CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (def)))
4147 		{
4148 		  tree rhs1 = gimple_assign_rhs1 (def);
4149 		  if (TREE_CODE (rhs1) == SSA_NAME
4150 		      && INTEGRAL_TYPE_P (TREE_TYPE (rhs1))
4151 		      && (TYPE_PRECISION (TREE_TYPE (off))
4152 			  > TYPE_PRECISION (TREE_TYPE (rhs1))))
4153 		    def = SSA_NAME_DEF_STMT (rhs1);
4154 		}
4155 	      if (is_gimple_call (def)
4156 		  && gimple_call_internal_p (def)
4157 		  && (gimple_call_internal_fn (def) == IFN_GOMP_SIMD_LANE))
4158 		{
4159 		  tree arg = gimple_call_arg (def, 0);
4160 		  tree reft = TREE_TYPE (DR_REF (newdr));
4161 		  gcc_assert (TREE_CODE (arg) == SSA_NAME);
4162 		  arg = SSA_NAME_VAR (arg);
4163 		  if (arg == loop->simduid
4164 		      /* For now.  */
4165 		      && tree_int_cst_equal (TYPE_SIZE_UNIT (reft), step))
4166 		    {
4167 		      DR_BASE_ADDRESS (newdr) = base_address;
4168 		      DR_OFFSET (newdr) = ssize_int (0);
4169 		      DR_STEP (newdr) = step;
4170 		      DR_OFFSET_ALIGNMENT (newdr) = BIGGEST_ALIGNMENT;
4171 		      DR_STEP_ALIGNMENT (newdr) = highest_pow2_factor (step);
4172 		      /* Mark as simd-lane access.  */
4173 		      tree arg2 = gimple_call_arg (def, 1);
4174 		      newdr->aux = (void *) (-1 - tree_to_uhwi (arg2));
4175 		      free_data_ref (dr);
4176 		      datarefs->safe_push (newdr);
4177 		      return opt_result::success ();
4178 		    }
4179 		}
4180 	    }
4181 	}
4182       free_data_ref (newdr);
4183     }
4184 
4185   datarefs->safe_push (dr);
4186   return opt_result::success ();
4187 }
4188 
4189 /* Function vect_analyze_data_refs.
4190 
4191   Find all the data references in the loop or basic block.
4192 
4193    The general structure of the analysis of data refs in the vectorizer is as
4194    follows:
4195    1- vect_analyze_data_refs(loop/bb): call
4196       compute_data_dependences_for_loop/bb to find and analyze all data-refs
4197       in the loop/bb and their dependences.
4198    2- vect_analyze_dependences(): apply dependence testing using ddrs.
4199    3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
4200    4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
4201 
4202 */
4203 
4204 opt_result
4205 vect_analyze_data_refs (vec_info *vinfo, poly_uint64 *min_vf, bool *fatal)
4206 {
4207   class loop *loop = NULL;
4208   unsigned int i;
4209   struct data_reference *dr;
4210   tree scalar_type;
4211 
4212   DUMP_VECT_SCOPE ("vect_analyze_data_refs");
4213 
4214   if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
4215     loop = LOOP_VINFO_LOOP (loop_vinfo);
4216 
4217   /* Go through the data-refs, check that the analysis succeeded.  Update
4218      pointer from stmt_vec_info struct to DR and vectype.  */
4219 
4220   vec<data_reference_p> datarefs = vinfo->shared->datarefs;
4221   FOR_EACH_VEC_ELT (datarefs, i, dr)
4222     {
4223       enum { SG_NONE, GATHER, SCATTER } gatherscatter = SG_NONE;
4224       poly_uint64 vf;
4225 
4226       gcc_assert (DR_REF (dr));
4227       stmt_vec_info stmt_info = vinfo->lookup_stmt (DR_STMT (dr));
4228       gcc_assert (!stmt_info->dr_aux.dr);
4229       stmt_info->dr_aux.dr = dr;
4230       stmt_info->dr_aux.stmt = stmt_info;
4231 
4232       /* Check that analysis of the data-ref succeeded.  */
4233       if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
4234 	  || !DR_STEP (dr))
4235         {
4236 	  bool maybe_gather
4237 	    = DR_IS_READ (dr)
4238 	      && !TREE_THIS_VOLATILE (DR_REF (dr))
4239 	      && (targetm.vectorize.builtin_gather != NULL
4240 		  || supports_vec_gather_load_p ());
4241 	  bool maybe_scatter
4242 	    = DR_IS_WRITE (dr)
4243 	      && !TREE_THIS_VOLATILE (DR_REF (dr))
4244 	      && (targetm.vectorize.builtin_scatter != NULL
4245 		  || supports_vec_scatter_store_p ());
4246 
4247 	  /* If the target supports vector gather loads or scatter stores,
4248 	     see if they can be used.  */
4249 	  if (is_a <loop_vec_info> (vinfo)
4250 	      && !nested_in_vect_loop_p (loop, stmt_info))
4251 	    {
4252 	      if (maybe_gather || maybe_scatter)
4253 		{
4254 		  if (maybe_gather)
4255 		    gatherscatter = GATHER;
4256 		  else
4257 		    gatherscatter = SCATTER;
4258 		}
4259 	    }
4260 
4261 	  if (gatherscatter == SG_NONE)
4262 	    {
4263 	      if (dump_enabled_p ())
4264 		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4265 				 "not vectorized: data ref analysis "
4266 				 "failed %G", stmt_info->stmt);
4267 	      if (is_a <bb_vec_info> (vinfo))
4268 		{
4269 		  /* In BB vectorization the ref can still participate
4270 		     in dependence analysis, we just can't vectorize it.  */
4271 		  STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4272 		  continue;
4273 		}
4274 	      return opt_result::failure_at (stmt_info->stmt,
4275 					     "not vectorized:"
4276 					     " data ref analysis failed: %G",
4277 					     stmt_info->stmt);
4278 	    }
4279         }
4280 
4281       /* See if this was detected as SIMD lane access.  */
4282       if (dr->aux == (void *)-1
4283 	  || dr->aux == (void *)-2
4284 	  || dr->aux == (void *)-3
4285 	  || dr->aux == (void *)-4)
4286 	{
4287 	  if (nested_in_vect_loop_p (loop, stmt_info))
4288 	    return opt_result::failure_at (stmt_info->stmt,
4289 					   "not vectorized:"
4290 					   " data ref analysis failed: %G",
4291 					   stmt_info->stmt);
4292 	  STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info)
4293 	    = -(uintptr_t) dr->aux;
4294 	}
4295 
4296       tree base = get_base_address (DR_REF (dr));
4297       if (base && VAR_P (base) && DECL_NONALIASED (base))
4298 	{
4299           if (dump_enabled_p ())
4300 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4301 			     "not vectorized: base object not addressable "
4302 			     "for stmt: %G", stmt_info->stmt);
4303           if (is_a <bb_vec_info> (vinfo))
4304 	    {
4305 	      /* In BB vectorization the ref can still participate
4306 	         in dependence analysis, we just can't vectorize it.  */
4307 	      STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4308 	      continue;
4309 	    }
4310 	  return opt_result::failure_at (stmt_info->stmt,
4311 					 "not vectorized: base object not"
4312 					 " addressable for stmt: %G",
4313 					 stmt_info->stmt);
4314 	}
4315 
4316       if (is_a <loop_vec_info> (vinfo)
4317 	  && DR_STEP (dr)
4318 	  && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
4319 	{
4320 	  if (nested_in_vect_loop_p (loop, stmt_info))
4321 	    return opt_result::failure_at (stmt_info->stmt,
4322 					   "not vectorized: "
4323 					   "not suitable for strided load %G",
4324 					   stmt_info->stmt);
4325 	  STMT_VINFO_STRIDED_P (stmt_info) = true;
4326 	}
4327 
4328       /* Update DR field in stmt_vec_info struct.  */
4329 
4330       /* If the dataref is in an inner-loop of the loop that is considered
4331 	 for vectorization, we also want to analyze the access relative to
4332 	 the outer-loop (DR contains information only relative to the
4333 	 inner-most enclosing loop).  We do that by building a reference to the
4334 	 first location accessed by the inner-loop, and analyze it relative to
4335 	 the outer-loop.  */
4336       if (loop && nested_in_vect_loop_p (loop, stmt_info))
4337 	{
4338 	  /* Build a reference to the first location accessed by the
4339 	     inner loop: *(BASE + INIT + OFFSET).  By construction,
4340 	     this address must be invariant in the inner loop, so we
4341 	     can consider it as being used in the outer loop.  */
4342 	  tree base = unshare_expr (DR_BASE_ADDRESS (dr));
4343 	  tree offset = unshare_expr (DR_OFFSET (dr));
4344 	  tree init = unshare_expr (DR_INIT (dr));
4345 	  tree init_offset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset),
4346 					  init, offset);
4347 	  tree init_addr = fold_build_pointer_plus (base, init_offset);
4348 	  tree init_ref = build_fold_indirect_ref (init_addr);
4349 
4350 	  if (dump_enabled_p ())
4351 	    dump_printf_loc (MSG_NOTE, vect_location,
4352 			     "analyze in outer loop: %T\n", init_ref);
4353 
4354 	  opt_result res
4355 	    = dr_analyze_innermost (&STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info),
4356 				    init_ref, loop, stmt_info->stmt);
4357 	  if (!res)
4358 	    /* dr_analyze_innermost already explained the failure.  */
4359 	    return res;
4360 
4361           if (dump_enabled_p ())
4362 	    dump_printf_loc (MSG_NOTE, vect_location,
4363 			     "\touter base_address: %T\n"
4364 			     "\touter offset from base address: %T\n"
4365 			     "\touter constant offset from base address: %T\n"
4366 			     "\touter step: %T\n"
4367 			     "\touter base alignment: %d\n"
4368 			     "\touter base misalignment: %d\n"
4369 			     "\touter offset alignment: %d\n"
4370 			     "\touter step alignment: %d\n",
4371 			     STMT_VINFO_DR_BASE_ADDRESS (stmt_info),
4372 			     STMT_VINFO_DR_OFFSET (stmt_info),
4373 			     STMT_VINFO_DR_INIT (stmt_info),
4374 			     STMT_VINFO_DR_STEP (stmt_info),
4375 			     STMT_VINFO_DR_BASE_ALIGNMENT (stmt_info),
4376 			     STMT_VINFO_DR_BASE_MISALIGNMENT (stmt_info),
4377 			     STMT_VINFO_DR_OFFSET_ALIGNMENT (stmt_info),
4378 			     STMT_VINFO_DR_STEP_ALIGNMENT (stmt_info));
4379 	}
4380 
4381       /* Set vectype for STMT.  */
4382       scalar_type = TREE_TYPE (DR_REF (dr));
4383       tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type);
4384       if (!vectype)
4385         {
4386           if (dump_enabled_p ())
4387             {
4388               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4389                                "not vectorized: no vectype for stmt: %G",
4390 			       stmt_info->stmt);
4391               dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: ");
4392               dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS,
4393                                  scalar_type);
4394               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
4395             }
4396 
4397           if (is_a <bb_vec_info> (vinfo))
4398 	    {
4399 	      /* Having no vector type is fine - the ref can still participate
4400 	         in dependence analysis; we just can't vectorize it.  */
4401 	      STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4402 	      continue;
4403 	    }
4404 	  if (fatal)
4405 	    *fatal = false;
4406 	  return opt_result::failure_at (stmt_info->stmt,
4407 					 "not vectorized:"
4408 					 " no vectype for stmt: %G"
4409 					 " scalar_type: %T\n",
4410 					 stmt_info->stmt, scalar_type);
4411         }
4412       else
4413 	{
4414 	  if (dump_enabled_p ())
4415 	    dump_printf_loc (MSG_NOTE, vect_location,
4416 			     "got vectype for stmt: %G%T\n",
4417 			     stmt_info->stmt, vectype);
4418 	}
4419 
4420       /* Adjust the minimal vectorization factor according to the
4421 	 vector type.  */
4422       vf = TYPE_VECTOR_SUBPARTS (vectype);
4423       *min_vf = upper_bound (*min_vf, vf);
4424 
4425       /* Leave the BB vectorizer to pick the vector type later, based on
4426 	 the final dataref group size and SLP node size.  */
4427       if (is_a <loop_vec_info> (vinfo))
4428 	STMT_VINFO_VECTYPE (stmt_info) = vectype;
4429 
4430       if (gatherscatter != SG_NONE)
4431 	{
4432 	  gather_scatter_info gs_info;
4433 	  if (!vect_check_gather_scatter (stmt_info,
4434 					  as_a <loop_vec_info> (vinfo),
4435 					  &gs_info)
4436 	      || !get_vectype_for_scalar_type (vinfo,
4437 					       TREE_TYPE (gs_info.offset)))
4438 	    {
4439 	      if (fatal)
4440 		*fatal = false;
4441 	      return opt_result::failure_at
4442 			(stmt_info->stmt,
4443 			 (gatherscatter == GATHER)
4444 			 ? "not vectorized: not suitable for gather load %G"
4445 			 : "not vectorized: not suitable for scatter store %G",
4446 			 stmt_info->stmt);
4447 	    }
4448 	  STMT_VINFO_GATHER_SCATTER_P (stmt_info) = gatherscatter;
4449 	}
4450     }
4451 
4452   /* We used to stop processing and prune the list here.  Verify we no
4453      longer need to.  */
4454   gcc_assert (i == datarefs.length ());
4455 
4456   return opt_result::success ();
4457 }
4458 
4459 
4460 /* Function vect_get_new_vect_var.
4461 
4462    Returns a name for a new variable.  The current naming scheme uses the
4463    prefix "vect", "stmp", "mask" or "vectp" (depending on the value of
4464    VAR_KIND) for vectorizer generated variables, and appends "_" and NAME
4465    to that prefix if NAME is provided.  */
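/* For example (illustrative only): a call with VAR_KIND == vect_pointer_var
   and NAME == "a" creates a temporary register named "vectp_a"; with a NULL
   NAME only the bare prefix "vectp" is used.  */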
4466 
4467 tree
4468 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
4469 {
4470   const char *prefix;
4471   tree new_vect_var;
4472 
4473   switch (var_kind)
4474   {
4475   case vect_simple_var:
4476     prefix = "vect";
4477     break;
4478   case vect_scalar_var:
4479     prefix = "stmp";
4480     break;
4481   case vect_mask_var:
4482     prefix = "mask";
4483     break;
4484   case vect_pointer_var:
4485     prefix = "vectp";
4486     break;
4487   default:
4488     gcc_unreachable ();
4489   }
4490 
4491   if (name)
4492     {
4493       char* tmp = concat (prefix, "_", name, NULL);
4494       new_vect_var = create_tmp_reg (type, tmp);
4495       free (tmp);
4496     }
4497   else
4498     new_vect_var = create_tmp_reg (type, prefix);
4499 
4500   return new_vect_var;
4501 }
4502 
4503 /* Like vect_get_new_vect_var but return an SSA name.  */
4504 
4505 tree
4506 vect_get_new_ssa_name (tree type, enum vect_var_kind var_kind, const char *name)
4507 {
4508   const char *prefix;
4509   tree new_vect_var;
4510 
4511   switch (var_kind)
4512   {
4513   case vect_simple_var:
4514     prefix = "vect";
4515     break;
4516   case vect_scalar_var:
4517     prefix = "stmp";
4518     break;
4519   case vect_pointer_var:
4520     prefix = "vectp";
4521     break;
4522   default:
4523     gcc_unreachable ();
4524   }
4525 
4526   if (name)
4527     {
4528       char* tmp = concat (prefix, "_", name, NULL);
4529       new_vect_var = make_temp_ssa_name (type, NULL, tmp);
4530       free (tmp);
4531     }
4532   else
4533     new_vect_var = make_temp_ssa_name (type, NULL, prefix);
4534 
4535   return new_vect_var;
4536 }
4537 
4538 /* Duplicate ptr info and set alignment/misalignment on NAME from DR_INFO.  */
4539 
4540 static void
4541 vect_duplicate_ssa_name_ptr_info (tree name, dr_vec_info *dr_info)
4542 {
4543   duplicate_ssa_name_ptr_info (name, DR_PTR_INFO (dr_info->dr));
4544   int misalign = DR_MISALIGNMENT (dr_info);
4545   if (misalign == DR_MISALIGNMENT_UNKNOWN)
4546     mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name));
4547   else
4548     set_ptr_info_alignment (SSA_NAME_PTR_INFO (name),
4549 			    known_alignment (DR_TARGET_ALIGNMENT (dr_info)),
4550 			    misalign);
4551 }
4552 
4553 /* Function vect_create_addr_base_for_vector_ref.
4554 
4555    Create an expression that computes the address of the first memory location
4556    that will be accessed for a data reference.
4557 
4558    Input:
4559    STMT_INFO: The statement containing the data reference.
4560    NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
4561    OFFSET: Optional.  If supplied, it is added to the initial address.
4562    LOOP:    Specify relative to which loop-nest the address should be computed.
4563             For example, when the dataref is in an inner-loop nested in an
4564 	    outer-loop that is now being vectorized, LOOP can be either the
4565 	    outer-loop, or the inner-loop.  The first memory location accessed
4566 	    by the following dataref ('in' points to short):
4567 
4568 		for (i=0; i<N; i++)
4569 		   for (j=0; j<M; j++)
4570 		     s += in[i+j]
4571 
4572 	    is as follows:
4573 	    if LOOP=i_loop:	&in		(relative to i_loop)
4574 	    if LOOP=j_loop: 	&in+i*2B	(relative to j_loop)
4575    BYTE_OFFSET: Optional, defaulted to NULL.  If supplied, it is added to the
4576 	    initial address.  Unlike OFFSET, which is a number of elements to
4577 	    be added, BYTE_OFFSET is measured in bytes.
4578 
4579    Output:
4580    1. Return an SSA_NAME whose value is the address of the memory location of
4581       the first vector of the data reference.
4582    2. If new_stmt_list is not NULL_TREE after return then the caller must insert
4583       these statement(s) which define the returned SSA_NAME.
4584 
4585    FORNOW: We are only handling array accesses with step 1.  */
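/* For illustration only - a sketch of the address the code below builds in
   the loop-variant (LOOP_VINFO) case:

	addr_base = BASE_ADDRESS + (DR_OFFSET + DR_INIT)
		    + OFFSET * TYPE_SIZE_UNIT (scalar type)
		    + BYTE_OFFSET

   i.e. OFFSET is scaled from elements to bytes using the scalar element
   size, while BYTE_OFFSET is added as-is.  */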
4586 
4587 tree
4588 vect_create_addr_base_for_vector_ref (stmt_vec_info stmt_info,
4589 				      gimple_seq *new_stmt_list,
4590 				      tree offset,
4591 				      tree byte_offset)
4592 {
4593   dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
4594   struct data_reference *dr = dr_info->dr;
4595   const char *base_name;
4596   tree addr_base;
4597   tree dest;
4598   gimple_seq seq = NULL;
4599   tree vect_ptr_type;
4600   tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
4601   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4602   innermost_loop_behavior *drb = vect_dr_behavior (dr_info);
4603 
4604   tree data_ref_base = unshare_expr (drb->base_address);
4605   tree base_offset = unshare_expr (get_dr_vinfo_offset (dr_info, true));
4606   tree init = unshare_expr (drb->init);
4607 
4608   if (loop_vinfo)
4609     base_name = get_name (data_ref_base);
4610   else
4611     {
4612       base_offset = ssize_int (0);
4613       init = ssize_int (0);
4614       base_name = get_name (DR_REF (dr));
4615     }
4616 
4617   /* Create base_offset.  */
4618   base_offset = size_binop (PLUS_EXPR,
4619 			    fold_convert (sizetype, base_offset),
4620 			    fold_convert (sizetype, init));
4621 
4622   if (offset)
4623     {
4624       offset = fold_build2 (MULT_EXPR, sizetype,
4625 			    fold_convert (sizetype, offset), step);
4626       base_offset = fold_build2 (PLUS_EXPR, sizetype,
4627 				 base_offset, offset);
4628     }
4629   if (byte_offset)
4630     {
4631       byte_offset = fold_convert (sizetype, byte_offset);
4632       base_offset = fold_build2 (PLUS_EXPR, sizetype,
4633 				 base_offset, byte_offset);
4634     }
4635 
4636   /* base + base_offset */
4637   if (loop_vinfo)
4638     addr_base = fold_build_pointer_plus (data_ref_base, base_offset);
4639   else
4640     {
4641       addr_base = build1 (ADDR_EXPR,
4642 			  build_pointer_type (TREE_TYPE (DR_REF (dr))),
4643 			  unshare_expr (DR_REF (dr)));
4644     }
4645 
4646   vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));
4647   dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name);
4648   addr_base = force_gimple_operand (addr_base, &seq, true, dest);
4649   gimple_seq_add_seq (new_stmt_list, seq);
4650 
4651   if (DR_PTR_INFO (dr)
4652       && TREE_CODE (addr_base) == SSA_NAME
4653       /* We should only duplicate pointer info to newly created SSA names.  */
4654       && SSA_NAME_VAR (addr_base) == dest)
4655     {
4656       vect_duplicate_ssa_name_ptr_info (addr_base, dr_info);
4657       if (offset || byte_offset)
4658 	mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr_base));
4659     }
4660 
4661   if (dump_enabled_p ())
4662     dump_printf_loc (MSG_NOTE, vect_location, "created %T\n", addr_base);
4663 
4664   return addr_base;
4665 }
4666 
4667 
4668 /* Function vect_create_data_ref_ptr.
4669 
4670    Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
4671    location accessed in the loop by STMT_INFO, along with the def-use update
4672    chain to appropriately advance the pointer through the loop iterations.
4673    Also set aliasing information for the pointer.  This pointer is used by
4674    the callers to this function to create a memory reference expression for
4675    vector load/store access.
4676 
4677    Input:
4678    1. STMT_INFO: a stmt that references memory. Expected to be of the form
4679          GIMPLE_ASSIGN <name, data-ref> or
4680 	 GIMPLE_ASSIGN <data-ref, name>.
4681    2. AGGR_TYPE: the type of the reference, which should be either a vector
4682         or an array.
4683    3. AT_LOOP: the loop where the vector memref is to be created.
4684    4. OFFSET (optional): an offset to be added to the initial address accessed
4685 	by the data-ref in STMT_INFO.
4686    5. BSI: location where the new stmts are to be placed if there is no loop
4687    5. BSI: location where the new stmts are to be placed if there is no loop.
4688         pointing to the initial address.
4689    7. BYTE_OFFSET (optional, defaults to NULL): a byte offset to be added
4690 	to the initial address accessed by the data-ref in STMT_INFO.  This is
4691 	similar to OFFSET, but OFFSET is counted in elements, while BYTE_OFFSET
4692 	in bytes.
4693    8. IV_STEP (optional, defaults to NULL): the amount that should be added
4694 	to the IV during each iteration of the loop.  NULL says to move
4695 	by one copy of AGGR_TYPE up or down, depending on the step of the
4696 	data reference.
4697 
4698    Output:
4699    1. Declare a new ptr to vector_type, and have it point to the base of the
4700       data reference (initial address accessed by the data reference).
4701       For example, for vector of type V8HI, the following code is generated:
4702 
4703       v8hi *ap;
4704       ap = (v8hi *)initial_address;
4705 
4706       if OFFSET is not supplied:
4707          initial_address = &a[init];
4708       if OFFSET is supplied:
4709          initial_address = &a[init + OFFSET];
4710       if BYTE_OFFSET is supplied:
4711 	 initial_address = &a[init] + BYTE_OFFSET;
4712 
4713       Return the initial_address in INITIAL_ADDRESS.
4714 
4715    2. If ONLY_INIT is true, just return the initial pointer.  Otherwise, also
4716       update the pointer in each iteration of the loop.
4717 
4718       Return the increment stmt that updates the pointer in PTR_INCR.
4719 
4720    3. Return the pointer.  */
4721 
4722 tree
4723 vect_create_data_ref_ptr (stmt_vec_info stmt_info, tree aggr_type,
4724 			  class loop *at_loop, tree offset,
4725 			  tree *initial_address, gimple_stmt_iterator *gsi,
4726 			  gimple **ptr_incr, bool only_init,
4727 			  tree byte_offset, tree iv_step)
4728 {
4729   const char *base_name;
4730   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4731   class loop *loop = NULL;
4732   bool nested_in_vect_loop = false;
4733   class loop *containing_loop = NULL;
4734   tree aggr_ptr_type;
4735   tree aggr_ptr;
4736   tree new_temp;
4737   gimple_seq new_stmt_list = NULL;
4738   edge pe = NULL;
4739   basic_block new_bb;
4740   tree aggr_ptr_init;
4741   dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
4742   struct data_reference *dr = dr_info->dr;
4743   tree aptr;
4744   gimple_stmt_iterator incr_gsi;
4745   bool insert_after;
4746   tree indx_before_incr, indx_after_incr;
4747   gimple *incr;
4748   bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
4749 
4750   gcc_assert (iv_step != NULL_TREE
4751 	      || TREE_CODE (aggr_type) == ARRAY_TYPE
4752 	      || TREE_CODE (aggr_type) == VECTOR_TYPE);
4753 
4754   if (loop_vinfo)
4755     {
4756       loop = LOOP_VINFO_LOOP (loop_vinfo);
4757       nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt_info);
4758       containing_loop = (gimple_bb (stmt_info->stmt))->loop_father;
4759       pe = loop_preheader_edge (loop);
4760     }
4761   else
4762     {
4763       gcc_assert (bb_vinfo);
4764       only_init = true;
4765       *ptr_incr = NULL;
4766     }
4767 
4768   /* Create an expression for the first address accessed by this load
4769      in LOOP.  */
4770   base_name = get_name (DR_BASE_ADDRESS (dr));
4771 
4772   if (dump_enabled_p ())
4773     {
4774       tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
4775       dump_printf_loc (MSG_NOTE, vect_location,
4776                        "create %s-pointer variable to type: %T",
4777 		       get_tree_code_name (TREE_CODE (aggr_type)),
4778 		       aggr_type);
4779       if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
4780         dump_printf (MSG_NOTE, "  vectorizing an array ref: ");
4781       else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)
4782         dump_printf (MSG_NOTE, "  vectorizing a vector ref: ");
4783       else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
4784         dump_printf (MSG_NOTE, "  vectorizing a record based array ref: ");
4785       else
4786         dump_printf (MSG_NOTE, "  vectorizing a pointer ref: ");
4787       dump_printf (MSG_NOTE, "%T\n", DR_BASE_OBJECT (dr));
4788     }
4789 
4790   /* (1) Create the new aggregate-pointer variable.
4791      Vector and array types inherit the alias set of their component
4792      type by default so we need to use a ref-all pointer if the data
4793      reference does not conflict with the created aggregated data
4794      reference because it is not addressable.  */
4795   bool need_ref_all = false;
4796   if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4797 			      get_alias_set (DR_REF (dr))))
4798     need_ref_all = true;
4799   /* Likewise for any of the data references in the stmt group.  */
4800   else if (DR_GROUP_SIZE (stmt_info) > 1)
4801     {
4802       stmt_vec_info sinfo = DR_GROUP_FIRST_ELEMENT (stmt_info);
4803       do
4804 	{
4805 	  struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo);
4806 	  if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4807 				      get_alias_set (DR_REF (sdr))))
4808 	    {
4809 	      need_ref_all = true;
4810 	      break;
4811 	    }
4812 	  sinfo = DR_GROUP_NEXT_ELEMENT (sinfo);
4813 	}
4814       while (sinfo);
4815     }
4816   aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode,
4817 					       need_ref_all);
4818   aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name);
4819 
4820 
4821   /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
4822      vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
4823      def-use update cycles for the pointer: one relative to the outer-loop
4824      (LOOP), which is what steps (3) and (4) below do.  The other is relative
4825      to the inner-loop (which is the inner-most loop containing the dataref),
4826      and this is done by step (5) below.
4827 
4828      When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
4829      inner-most loop, and so steps (3),(4) work the same, and step (5) is
4830      redundant.  Steps (3),(4) create the following:
4831 
4832 	vp0 = &base_addr;
4833 	LOOP:	vp1 = phi(vp0,vp2)
4834 		...
4835 		...
4836 		vp2 = vp1 + step
4837 		goto LOOP
4838 
4839      If there is an inner-loop nested in loop, then step (5) will also be
4840      applied, and an additional update in the inner-loop will be created:
4841 
4842 	vp0 = &base_addr;
4843 	LOOP:   vp1 = phi(vp0,vp2)
4844 		...
4845         inner:     vp3 = phi(vp1,vp4)
4846 	           vp4 = vp3 + inner_step
4847 	           if () goto inner
4848 		...
4849 		vp2 = vp1 + step
4850 		if () goto LOOP   */
4851 
4852   /* (2) Calculate the initial address of the aggregate-pointer, and set
4853      the aggregate-pointer to point to it before the loop.  */
4854 
4855   /* Create: &(base[init_val+offset]) + byte_offset in the loop preheader.  */
4856 
4857   new_temp = vect_create_addr_base_for_vector_ref (stmt_info, &new_stmt_list,
4858 						   offset, byte_offset);
4859   if (new_stmt_list)
4860     {
4861       if (pe)
4862         {
4863           new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
4864           gcc_assert (!new_bb);
4865         }
4866       else
4867         gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);
4868     }
4869 
4870   *initial_address = new_temp;
4871   aggr_ptr_init = new_temp;
4872 
4873   /* (3) Handle the updating of the aggregate-pointer inside the loop.
4874      This is needed when ONLY_INIT is false, and also when AT_LOOP is the
4875      inner-loop nested in LOOP (during outer-loop vectorization).  */
4876 
4877   /* No update in loop is required.  */
4878   if (only_init && (!loop_vinfo || at_loop == loop))
4879     aptr = aggr_ptr_init;
4880   else
4881     {
4882       /* Accesses to invariant addresses should be handled specially
4883 	 by the caller.  */
4884       tree step = vect_dr_behavior (dr_info)->step;
4885       gcc_assert (!integer_zerop (step));
4886 
4887       if (iv_step == NULL_TREE)
4888 	{
4889 	  /* The step of the aggregate pointer is the type size,
4890 	     negated for downward accesses.  */
4891 	  iv_step = TYPE_SIZE_UNIT (aggr_type);
4892 	  if (tree_int_cst_sgn (step) == -1)
4893 	    iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
4894 	}
4895 
4896       standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4897 
4898       create_iv (aggr_ptr_init,
4899 		 fold_convert (aggr_ptr_type, iv_step),
4900 		 aggr_ptr, loop, &incr_gsi, insert_after,
4901 		 &indx_before_incr, &indx_after_incr);
4902       incr = gsi_stmt (incr_gsi);
4903       loop_vinfo->add_stmt (incr);
4904 
4905       /* Copy the points-to information if it exists. */
4906       if (DR_PTR_INFO (dr))
4907 	{
4908 	  vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr_info);
4909 	  vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr_info);
4910 	}
4911       if (ptr_incr)
4912 	*ptr_incr = incr;
4913 
4914       aptr = indx_before_incr;
4915     }
4916 
4917   if (!nested_in_vect_loop || only_init)
4918     return aptr;
4919 
4920 
4921   /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
4922      nested in LOOP, if it exists.  */
4923 
4924   gcc_assert (nested_in_vect_loop);
4925   if (!only_init)
4926     {
4927       standard_iv_increment_position (containing_loop, &incr_gsi,
4928 				      &insert_after);
4929       create_iv (aptr, fold_convert (aggr_ptr_type, DR_STEP (dr)), aggr_ptr,
4930 		 containing_loop, &incr_gsi, insert_after, &indx_before_incr,
4931 		 &indx_after_incr);
4932       incr = gsi_stmt (incr_gsi);
4933       loop_vinfo->add_stmt (incr);
4934 
4935       /* Copy the points-to information if it exists. */
4936       if (DR_PTR_INFO (dr))
4937 	{
4938 	  vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr_info);
4939 	  vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr_info);
4940 	}
4941       if (ptr_incr)
4942 	*ptr_incr = incr;
4943 
4944       return indx_before_incr;
4945     }
4946   else
4947     gcc_unreachable ();
4948 }
4949 
4950 
4951 /* Function bump_vector_ptr
4952 
4953    Increment a pointer (to a vector type) by vector-size. If requested,
4954    i.e. if PTR-INCR is given, then also connect the new increment stmt
4955    to the existing def-use update-chain of the pointer, by modifying
4956    the PTR_INCR as illustrated below:
4957 
4958    The pointer def-use update-chain before this function:
4959                         DATAREF_PTR = phi (p_0, p_2)
4960                         ....
4961         PTR_INCR:       p_2 = DATAREF_PTR + step
4962 
4963    The pointer def-use update-chain after this function:
4964                         DATAREF_PTR = phi (p_0, p_2)
4965                         ....
4966                         NEW_DATAREF_PTR = DATAREF_PTR + BUMP
4967                         ....
4968         PTR_INCR:       p_2 = NEW_DATAREF_PTR + step
4969 
4970    Input:
4971    DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
4972                  in the loop.
4973    PTR_INCR - optional. The stmt that updates the pointer in each iteration of
4974 	      the loop.  The increment amount across iterations is expected
4975 	      to be vector_size.
4976    BSI - location where the new update stmt is to be placed.
4977    STMT_INFO - the original scalar memory-access stmt that is being vectorized.
4978    BUMP - optional. The offset by which to bump the pointer. If not given,
4979 	  the offset is assumed to be vector_size.
4980 
4981    Output: Return NEW_DATAREF_PTR as illustrated above.
4982 
4983 */
4984 
4985 tree
4986 bump_vector_ptr (tree dataref_ptr, gimple *ptr_incr, gimple_stmt_iterator *gsi,
4987 		 stmt_vec_info stmt_info, tree bump)
4988 {
4989   struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4990   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4991   tree update = TYPE_SIZE_UNIT (vectype);
4992   gassign *incr_stmt;
4993   ssa_op_iter iter;
4994   use_operand_p use_p;
4995   tree new_dataref_ptr;
4996 
4997   if (bump)
4998     update = bump;
4999 
5000   if (TREE_CODE (dataref_ptr) == SSA_NAME)
5001     new_dataref_ptr = copy_ssa_name (dataref_ptr);
5002   else
5003     new_dataref_ptr = make_ssa_name (TREE_TYPE (dataref_ptr));
5004   incr_stmt = gimple_build_assign (new_dataref_ptr, POINTER_PLUS_EXPR,
5005 				   dataref_ptr, update);
5006   vect_finish_stmt_generation (stmt_info, incr_stmt, gsi);
5007 
5008   /* Copy the points-to information if it exists. */
5009   if (DR_PTR_INFO (dr))
5010     {
5011       duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
5012       mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
5013     }
5014 
5015   if (!ptr_incr)
5016     return new_dataref_ptr;
5017 
5018   /* Update the vector-pointer's cross-iteration increment.  */
5019   FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
5020     {
5021       tree use = USE_FROM_PTR (use_p);
5022 
5023       if (use == dataref_ptr)
5024         SET_USE (use_p, new_dataref_ptr);
5025       else
5026         gcc_assert (operand_equal_p (use, update, 0));
5027     }
5028 
5029   return new_dataref_ptr;
5030 }
5031 
5032 
5033 /* Copy memory reference info such as base/clique from the SRC reference
5034    to the DEST MEM_REF.  */
5035 
5036 void
5037 vect_copy_ref_info (tree dest, tree src)
5038 {
5039   if (TREE_CODE (dest) != MEM_REF)
5040     return;
5041 
5042   tree src_base = src;
5043   while (handled_component_p (src_base))
5044     src_base = TREE_OPERAND (src_base, 0);
5045   if (TREE_CODE (src_base) != MEM_REF
5046       && TREE_CODE (src_base) != TARGET_MEM_REF)
5047     return;
5048 
5049   MR_DEPENDENCE_CLIQUE (dest) = MR_DEPENDENCE_CLIQUE (src_base);
5050   MR_DEPENDENCE_BASE (dest) = MR_DEPENDENCE_BASE (src_base);
5051 }
5052 
5053 
5054 /* Function vect_create_destination_var.
5055 
5056    Create a new temporary of type VECTYPE.  */
5057 
5058 tree
5059 vect_create_destination_var (tree scalar_dest, tree vectype)
5060 {
5061   tree vec_dest;
5062   const char *name;
5063   char *new_name;
5064   tree type;
5065   enum vect_var_kind kind;
5066 
5067   kind = vectype
5068     ? VECTOR_BOOLEAN_TYPE_P (vectype)
5069     ? vect_mask_var
5070     : vect_simple_var
5071     : vect_scalar_var;
5072   type = vectype ? vectype : TREE_TYPE (scalar_dest);
5073 
5074   gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
5075 
5076   name = get_name (scalar_dest);
5077   if (name)
5078     new_name = xasprintf ("%s_%u", name, SSA_NAME_VERSION (scalar_dest));
5079   else
5080     new_name = xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest));
5081   vec_dest = vect_get_new_vect_var (type, kind, new_name);
5082   free (new_name);
5083 
5084   return vec_dest;
5085 }
5086 
5087 /* Function vect_grouped_store_supported.
5088 
5089    Returns TRUE if interleave high and interleave low permutations
5090    are supported, and FALSE otherwise.  */
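/* For example (an illustrative case, not an additional requirement): with a
   V4SI vector mode and COUNT == 2, the two permutations checked below are
   the interleave-high selector { 0, 4, 1, 5 } and the interleave-low
   selector { 2, 6, 3, 7 }.  */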
5091 
5092 bool
5093 vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
5094 {
5095   machine_mode mode = TYPE_MODE (vectype);
5096 
5097   /* vect_permute_store_chain requires the group size to be equal to 3 or
5098      be a power of two.  */
5099   if (count != 3 && exact_log2 (count) == -1)
5100     {
5101       if (dump_enabled_p ())
5102 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5103 			 "the size of the group of accesses"
5104 			 " is not a power of 2 or not equal to 3\n");
5105       return false;
5106     }
5107 
5108   /* Check that the permutation is supported.  */
5109   if (VECTOR_MODE_P (mode))
5110     {
5111       unsigned int i;
5112       if (count == 3)
5113 	{
5114 	  unsigned int j0 = 0, j1 = 0, j2 = 0;
5115 	  unsigned int i, j;
5116 
5117 	  unsigned int nelt;
5118 	  if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5119 	    {
5120 	      if (dump_enabled_p ())
5121 		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5122 				 "cannot handle groups of 3 stores for"
5123 				 " variable-length vectors\n");
5124 	      return false;
5125 	    }
5126 
5127 	  vec_perm_builder sel (nelt, nelt, 1);
5128 	  sel.quick_grow (nelt);
5129 	  vec_perm_indices indices;
5130 	  for (j = 0; j < 3; j++)
5131 	    {
5132 	      int nelt0 = ((3 - j) * nelt) % 3;
5133 	      int nelt1 = ((3 - j) * nelt + 1) % 3;
5134 	      int nelt2 = ((3 - j) * nelt + 2) % 3;
5135 	      for (i = 0; i < nelt; i++)
5136 		{
5137 		  if (3 * i + nelt0 < nelt)
5138 		    sel[3 * i + nelt0] = j0++;
5139 		  if (3 * i + nelt1 < nelt)
5140 		    sel[3 * i + nelt1] = nelt + j1++;
5141 		  if (3 * i + nelt2 < nelt)
5142 		    sel[3 * i + nelt2] = 0;
5143 		}
5144 	      indices.new_vector (sel, 2, nelt);
5145 	      if (!can_vec_perm_const_p (mode, indices))
5146 		{
5147 		  if (dump_enabled_p ())
5148 		    dump_printf (MSG_MISSED_OPTIMIZATION,
5149 				 "permutation op not supported by target.\n");
5150 		  return false;
5151 		}
5152 
5153 	      for (i = 0; i < nelt; i++)
5154 		{
5155 		  if (3 * i + nelt0 < nelt)
5156 		    sel[3 * i + nelt0] = 3 * i + nelt0;
5157 		  if (3 * i + nelt1 < nelt)
5158 		    sel[3 * i + nelt1] = 3 * i + nelt1;
5159 		  if (3 * i + nelt2 < nelt)
5160 		    sel[3 * i + nelt2] = nelt + j2++;
5161 		}
5162 	      indices.new_vector (sel, 2, nelt);
5163 	      if (!can_vec_perm_const_p (mode, indices))
5164 		{
5165 		  if (dump_enabled_p ())
5166 		    dump_printf (MSG_MISSED_OPTIMIZATION,
5167 				 "permutation op not supported by target.\n");
5168 		  return false;
5169 		}
5170 	    }
5171 	  return true;
5172 	}
5173       else
5174 	{
5175 	  /* If length is not equal to 3 then only a power of 2 is supported.  */
5176 	  gcc_assert (pow2p_hwi (count));
5177 	  poly_uint64 nelt = GET_MODE_NUNITS (mode);
5178 
5179 	  /* The encoding has 2 interleaved stepped patterns.  */
5180 	  vec_perm_builder sel (nelt, 2, 3);
5181 	  sel.quick_grow (6);
5182 	  for (i = 0; i < 3; i++)
5183 	    {
5184 	      sel[i * 2] = i;
5185 	      sel[i * 2 + 1] = i + nelt;
5186 	    }
5187 	  vec_perm_indices indices (sel, 2, nelt);
5188 	  if (can_vec_perm_const_p (mode, indices))
5189 	    {
5190 	      for (i = 0; i < 6; i++)
5191 		sel[i] += exact_div (nelt, 2);
5192 	      indices.new_vector (sel, 2, nelt);
5193 	      if (can_vec_perm_const_p (mode, indices))
5194 		return true;
5195 	    }
5196 	}
5197     }
5198 
5199   if (dump_enabled_p ())
5200     dump_printf (MSG_MISSED_OPTIMIZATION,
5201 		 "permutation op not supported by target.\n");
5202   return false;
5203 }
5204 
5205 
5206 /* Return TRUE if vec_{mask_}store_lanes is available for COUNT vectors of
5207    type VECTYPE.  MASKED_P says whether the masked form is needed.  */
5208 
5209 bool
5210 vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
5211 			    bool masked_p)
5212 {
5213   if (masked_p)
5214     return vect_lanes_optab_supported_p ("vec_mask_store_lanes",
5215 					 vec_mask_store_lanes_optab,
5216 					 vectype, count);
5217   else
5218     return vect_lanes_optab_supported_p ("vec_store_lanes",
5219 					 vec_store_lanes_optab,
5220 					 vectype, count);
5221 }
5222 
5223 
5224 /* Function vect_permute_store_chain.
5225 
5226    Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
5227    a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
5228    the data correctly for the stores.  Return the final references for stores
5229    in RESULT_CHAIN.
5230 
5231    E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5232    The input is 4 vectors each containing 8 elements.  We assign a number to
5233    each element, the input sequence is:
5234 
5235    1st vec:   0  1  2  3  4  5  6  7
5236    2nd vec:   8  9 10 11 12 13 14 15
5237    3rd vec:  16 17 18 19 20 21 22 23
5238    4th vec:  24 25 26 27 28 29 30 31
5239 
5240    The output sequence should be:
5241 
5242    1st vec:  0  8 16 24  1  9 17 25
5243    2nd vec:  2 10 18 26  3 11 19 27
5244    3rd vec:  4 12 20 28  5 13 21 29
5245    4th vec:  6 14 22 30  7 15 23 31
5246 
5247    i.e., we interleave the contents of the four vectors in their order.
5248 
5249    We use interleave_high/low instructions to create such output.  The input of
5250    each interleave_high/low operation is two vectors:
5251    1st vec    2nd vec
5252    0 1 2 3    4 5 6 7
5253    the even elements of the result vector are obtained left-to-right from the
5254    high/low elements of the first vector.  The odd elements of the result are
5255    obtained left-to-right from the high/low elements of the second vector.
5256    The output of interleave_high will be:   0 4 1 5
5257    and of interleave_low:                   2 6 3 7
5258 
5259 
5260    The permutation is done in log LENGTH stages.  In each stage interleave_high
5261    and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
5262    where the first argument is taken from the first half of DR_CHAIN and the
5263    second argument from its second half.
5264    In our example,
5265 
5266    I1: interleave_high (1st vec, 3rd vec)
5267    I2: interleave_low (1st vec, 3rd vec)
5268    I3: interleave_high (2nd vec, 4th vec)
5269    I4: interleave_low (2nd vec, 4th vec)
5270 
5271    The output for the first stage is:
5272 
5273    I1:  0 16  1 17  2 18  3 19
5274    I2:  4 20  5 21  6 22  7 23
5275    I3:  8 24  9 25 10 26 11 27
5276    I4: 12 28 13 29 14 30 15 31
5277 
5278    The output of the second stage, i.e. the final result is:
5279 
5280    I1:  0  8 16 24  1  9 17 25
5281    I2:  2 10 18 26  3 11 19 27
5282    I3:  4 12 20 28  5 13 21 29
5283    I4:  6 14 22 30  7 15 23 31.  */
5284 
5285 void
5286 vect_permute_store_chain (vec<tree> dr_chain,
5287 			  unsigned int length,
5288 			  stmt_vec_info stmt_info,
5289 			  gimple_stmt_iterator *gsi,
5290 			  vec<tree> *result_chain)
5291 {
5292   tree vect1, vect2, high, low;
5293   gimple *perm_stmt;
5294   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5295   tree perm_mask_low, perm_mask_high;
5296   tree data_ref;
5297   tree perm3_mask_low, perm3_mask_high;
5298   unsigned int i, j, n, log_length = exact_log2 (length);
5299 
5300   result_chain->quick_grow (length);
5301   memcpy (result_chain->address (), dr_chain.address (),
5302 	  length * sizeof (tree));
5303 
5304   if (length == 3)
5305     {
5306       /* vect_grouped_store_supported ensures that this is constant.  */
5307       unsigned int nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5308       unsigned int j0 = 0, j1 = 0, j2 = 0;
5309 
5310       vec_perm_builder sel (nelt, nelt, 1);
5311       sel.quick_grow (nelt);
5312       vec_perm_indices indices;
5313       for (j = 0; j < 3; j++)
5314         {
5315 	  int nelt0 = ((3 - j) * nelt) % 3;
5316 	  int nelt1 = ((3 - j) * nelt + 1) % 3;
5317 	  int nelt2 = ((3 - j) * nelt + 2) % 3;
5318 
5319 	  for (i = 0; i < nelt; i++)
5320 	    {
5321 	      if (3 * i + nelt0 < nelt)
5322 		sel[3 * i + nelt0] = j0++;
5323 	      if (3 * i + nelt1 < nelt)
5324 		sel[3 * i + nelt1] = nelt + j1++;
5325 	      if (3 * i + nelt2 < nelt)
5326 		sel[3 * i + nelt2] = 0;
5327 	    }
5328 	  indices.new_vector (sel, 2, nelt);
5329 	  perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5330 
5331 	  for (i = 0; i < nelt; i++)
5332 	    {
5333 	      if (3 * i + nelt0 < nelt)
5334 		sel[3 * i + nelt0] = 3 * i + nelt0;
5335 	      if (3 * i + nelt1 < nelt)
5336 		sel[3 * i + nelt1] = 3 * i + nelt1;
5337 	      if (3 * i + nelt2 < nelt)
5338 		sel[3 * i + nelt2] = nelt + j2++;
5339 	    }
5340 	  indices.new_vector (sel, 2, nelt);
5341 	  perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5342 
5343 	  vect1 = dr_chain[0];
5344 	  vect2 = dr_chain[1];
5345 
5346 	  /* Create interleaving stmt:
5347 	     low = VEC_PERM_EXPR <vect1, vect2,
5348 				  {j, nelt, *, j + 1, nelt + j + 1, *,
5349 				   j + 2, nelt + j + 2, *, ...}>  */
5350 	  data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5351 	  perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
5352 					   vect2, perm3_mask_low);
5353 	  vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5354 
5355 	  vect1 = data_ref;
5356 	  vect2 = dr_chain[2];
5357 	  /* Create interleaving stmt:
5358 	     low = VEC_PERM_EXPR <vect1, vect2,
5359 				  {0, 1, nelt + j, 3, 4, nelt + j + 1,
5360 				   6, 7, nelt + j + 2, ...}>  */
5361 	  data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5362 	  perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
5363 					   vect2, perm3_mask_high);
5364 	  vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5365 	  (*result_chain)[j] = data_ref;
5366 	}
5367     }
5368   else
5369     {
5370       /* If length is not equal to 3 then only a power of 2 is supported.  */
5371       gcc_assert (pow2p_hwi (length));
5372 
5373       /* The encoding has 2 interleaved stepped patterns.  */
5374       poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
5375       vec_perm_builder sel (nelt, 2, 3);
5376       sel.quick_grow (6);
5377       for (i = 0; i < 3; i++)
5378 	{
5379 	  sel[i * 2] = i;
5380 	  sel[i * 2 + 1] = i + nelt;
5381 	}
5382 	vec_perm_indices indices (sel, 2, nelt);
5383 	perm_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5384 
5385 	for (i = 0; i < 6; i++)
5386 	  sel[i] += exact_div (nelt, 2);
5387 	indices.new_vector (sel, 2, nelt);
5388 	perm_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5389 
5390 	for (i = 0, n = log_length; i < n; i++)
5391 	  {
5392 	    for (j = 0; j < length/2; j++)
5393 	      {
5394 		vect1 = dr_chain[j];
5395 		vect2 = dr_chain[j+length/2];
5396 
5397 		/* Create interleaving stmt:
5398 		   high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
5399 							...}>  */
5400 		high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
5401 		perm_stmt = gimple_build_assign (high, VEC_PERM_EXPR, vect1,
5402 						 vect2, perm_mask_high);
5403 		vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5404 		(*result_chain)[2*j] = high;
5405 
5406 		/* Create interleaving stmt:
5407 		   low = VEC_PERM_EXPR <vect1, vect2,
5408 					{nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
5409 					 ...}>  */
5410 		low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
5411 		perm_stmt = gimple_build_assign (low, VEC_PERM_EXPR, vect1,
5412 						 vect2, perm_mask_low);
5413 		vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5414 		(*result_chain)[2*j+1] = low;
5415 	      }
5416 	    memcpy (dr_chain.address (), result_chain->address (),
5417 		    length * sizeof (tree));
5418 	  }
5419     }
5420 }
5421 
5422 /* Function vect_setup_realignment
5423 
5424    This function is called when vectorizing an unaligned load using
5425    the dr_explicit_realign[_optimized] scheme.
5426    This function generates the following code at the loop prolog:
5427 
5428       p = initial_addr;
5429    x  msq_init = *(floor(p));   # prolog load
5430       realignment_token = call target_builtin;
5431     loop:
5432    x  msq = phi (msq_init, ---)
5433 
5434    The stmts marked with x are generated only for the case of
5435    dr_explicit_realign_optimized.
5436 
5437    The code above sets up a new (vector) pointer, pointing to the first
5438    location accessed by STMT_INFO, and a "floor-aligned" load using that
5439    pointer.  It also generates code to compute the "realignment-token"
5440    (if the relevant target hook was defined), and creates a phi-node at the
5441    loop-header bb whose arguments are the result of the prolog-load (created
5442    by this function) and the result of a load that takes place in the loop
5443    (to be created by the caller to this function).
5444 
5445    For the case of dr_explicit_realign_optimized:
5446    The caller to this function uses the phi-result (msq) to create the
5447    realignment code inside the loop, and sets up the missing phi argument,
5448    as follows:
5449     loop:
5450       msq = phi (msq_init, lsq)
5451       lsq = *(floor(p'));        # load in loop
5452       result = realign_load (msq, lsq, realignment_token);
5453 
5454    For the case of dr_explicit_realign:
5455     loop:
5456       msq = *(floor(p)); 	# load in loop
5457       p' = p + (VS-1);
5458       lsq = *(floor(p'));	# load in loop
5459       result = realign_load (msq, lsq, realignment_token);
5460 
5461    Input:
5462    STMT_INFO - (scalar) load stmt to be vectorized. This load accesses
5463 	       a memory location that may be unaligned.
5464    BSI - place where new code is to be inserted.
5465    ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
5466 			      is used.
5467 
5468    Output:
5469    REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
5470                        target hook, if defined.
5471    Return value - the result of the loop-header phi node.  */
5472 
5473 tree
5474 vect_setup_realignment (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
5475                         tree *realignment_token,
5476 			enum dr_alignment_support alignment_support_scheme,
5477 			tree init_addr,
5478 			class loop **at_loop)
5479 {
5480   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5481   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5482   dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
5483   struct data_reference *dr = dr_info->dr;
5484   class loop *loop = NULL;
5485   edge pe = NULL;
5486   tree scalar_dest = gimple_assign_lhs (stmt_info->stmt);
5487   tree vec_dest;
5488   gimple *inc;
5489   tree ptr;
5490   tree data_ref;
5491   basic_block new_bb;
5492   tree msq_init = NULL_TREE;
5493   tree new_temp;
5494   gphi *phi_stmt;
5495   tree msq = NULL_TREE;
5496   gimple_seq stmts = NULL;
5497   bool compute_in_loop = false;
5498   bool nested_in_vect_loop = false;
5499   class loop *containing_loop = (gimple_bb (stmt_info->stmt))->loop_father;
5500   class loop *loop_for_initial_load = NULL;
5501 
5502   if (loop_vinfo)
5503     {
5504       loop = LOOP_VINFO_LOOP (loop_vinfo);
5505       nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt_info);
5506     }
5507 
5508   gcc_assert (alignment_support_scheme == dr_explicit_realign
5509 	      || alignment_support_scheme == dr_explicit_realign_optimized);
5510 
5511   /* We need to generate three things:
5512      1. the misalignment computation
5513      2. the extra vector load (for the optimized realignment scheme).
5514      3. the phi node for the two vectors from which the realignment is
5515       done (for the optimized realignment scheme).  */
5516 
5517   /* 1. Determine where to generate the misalignment computation.
5518 
5519      If INIT_ADDR is NULL_TREE, this indicates that the misalignment
5520      calculation will be generated by this function, outside the loop (in the
5521      preheader).  Otherwise, INIT_ADDR had already been computed for us by the
5522      caller, inside the loop.
5523 
5524      Background: If the misalignment remains fixed throughout the iterations of
5525      the loop, then both realignment schemes are applicable, and also the
5526      misalignment computation can be done outside LOOP.  This is because we are
5527      vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
5528      are a multiple of VS (the Vector Size), and therefore the misalignment in
5529      different vectorized LOOP iterations is always the same.
5530      The problem arises only if the memory access is in an inner-loop nested
5531      inside LOOP, which is now being vectorized using outer-loop vectorization.
5532      This is the only case when the misalignment of the memory access may not
5533      remain fixed throughout the iterations of the inner-loop (as explained in
5534      detail in vect_supportable_dr_alignment).  In this case, not only is the
5535      optimized realignment scheme not applicable, but also the misalignment
5536      computation (and generation of the realignment token that is passed to
5537      REALIGN_LOAD) have to be done inside the loop.
5538 
5539      In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
5540      or not, which in turn determines if the misalignment is computed inside
5541      the inner-loop, or outside LOOP.  */
5542 
5543   if (init_addr != NULL_TREE || !loop_vinfo)
5544     {
5545       compute_in_loop = true;
5546       gcc_assert (alignment_support_scheme == dr_explicit_realign);
5547     }
5548 
5549 
5550   /* 2. Determine where to generate the extra vector load.
5551 
5552      For the optimized realignment scheme, instead of generating two vector
5553      loads in each iteration, we generate a single extra vector load in the
5554      preheader of the loop, and in each iteration reuse the result of the
5555      vector load from the previous iteration.  In case the memory access is in
5556      an inner-loop nested inside LOOP, which is now being vectorized using
5557      outer-loop vectorization, we need to determine whether this initial vector
5558      load should be generated at the preheader of the inner-loop, or can be
5559      generated at the preheader of LOOP.  If the memory access has no evolution
5560      in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
5561      to be generated inside LOOP (in the preheader of the inner-loop).  */
5562 
5563   if (nested_in_vect_loop)
5564     {
5565       tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
5566       bool invariant_in_outerloop =
5567             (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
5568       loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
5569     }
5570   else
5571     loop_for_initial_load = loop;
5572   if (at_loop)
5573     *at_loop = loop_for_initial_load;
5574 
5575   if (loop_for_initial_load)
5576     pe = loop_preheader_edge (loop_for_initial_load);
5577 
5578   /* 3. For the case of the optimized realignment, create the first vector
5579       load at the loop preheader.  */
5580 
5581   if (alignment_support_scheme == dr_explicit_realign_optimized)
5582     {
5583       /* Create msq_init = *(floor(p1)) in the loop preheader  */
5584       gassign *new_stmt;
5585 
5586       gcc_assert (!compute_in_loop);
5587       vec_dest = vect_create_destination_var (scalar_dest, vectype);
5588       ptr = vect_create_data_ref_ptr (stmt_info, vectype,
5589 				      loop_for_initial_load, NULL_TREE,
5590 				      &init_addr, NULL, &inc, true);
5591       if (TREE_CODE (ptr) == SSA_NAME)
5592 	new_temp = copy_ssa_name (ptr);
5593       else
5594 	new_temp = make_ssa_name (TREE_TYPE (ptr));
5595       poly_uint64 align = DR_TARGET_ALIGNMENT (dr_info);
5596       tree type = TREE_TYPE (ptr);
5597       new_stmt = gimple_build_assign
5598 		   (new_temp, BIT_AND_EXPR, ptr,
5599 		    fold_build2 (MINUS_EXPR, type,
5600 				 build_int_cst (type, 0),
5601 				 build_int_cst (type, align)));
5602       new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5603       gcc_assert (!new_bb);
5604       data_ref
5605 	= build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
5606 		  build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
5607       vect_copy_ref_info (data_ref, DR_REF (dr));
5608       new_stmt = gimple_build_assign (vec_dest, data_ref);
5609       new_temp = make_ssa_name (vec_dest, new_stmt);
5610       gimple_assign_set_lhs (new_stmt, new_temp);
5611       if (pe)
5612         {
5613           new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5614           gcc_assert (!new_bb);
5615         }
5616       else
5617          gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5618 
5619       msq_init = gimple_assign_lhs (new_stmt);
5620     }
5621 
5622   /* 4. Create realignment token using a target builtin, if available.
5623       It is done either inside the containing loop, or before LOOP (as
5624       determined above).  */
5625 
5626   if (targetm.vectorize.builtin_mask_for_load)
5627     {
5628       gcall *new_stmt;
5629       tree builtin_decl;
5630 
5631       /* Compute INIT_ADDR - the initial address accessed by this memref.  */
5632       if (!init_addr)
5633 	{
5634 	  /* Generate the INIT_ADDR computation outside LOOP.  */
5635 	  init_addr = vect_create_addr_base_for_vector_ref (stmt_info, &stmts,
5636 							    NULL_TREE);
5637           if (loop)
5638             {
5639    	      pe = loop_preheader_edge (loop);
5640 	      new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
5641 	      gcc_assert (!new_bb);
5642             }
5643           else
5644              gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
5645 	}
5646 
5647       builtin_decl = targetm.vectorize.builtin_mask_for_load ();
5648       new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
5649       vec_dest =
5650 	vect_create_destination_var (scalar_dest,
5651 				     gimple_call_return_type (new_stmt));
5652       new_temp = make_ssa_name (vec_dest, new_stmt);
5653       gimple_call_set_lhs (new_stmt, new_temp);
5654 
5655       if (compute_in_loop)
5656 	gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5657       else
5658 	{
5659 	  /* Generate the misalignment computation outside LOOP.  */
5660 	  pe = loop_preheader_edge (loop);
5661 	  new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5662 	  gcc_assert (!new_bb);
5663 	}
5664 
5665       *realignment_token = gimple_call_lhs (new_stmt);
5666 
5667       /* The result of the CALL_EXPR to this builtin is determined from
5668          the value of the parameter and no global variables are touched
5669          which makes the builtin a "const" function.  Requiring the
5670          builtin to have the "const" attribute makes it unnecessary
5671          to call mark_call_clobbered.  */
5672       gcc_assert (TREE_READONLY (builtin_decl));
5673     }
5674 
5675   if (alignment_support_scheme == dr_explicit_realign)
5676     return msq;
5677 
5678   gcc_assert (!compute_in_loop);
5679   gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
5680 
5681 
5682   /* 5. Create msq = phi <msq_init, lsq> in loop  */
5683 
5684   pe = loop_preheader_edge (containing_loop);
5685   vec_dest = vect_create_destination_var (scalar_dest, vectype);
5686   msq = make_ssa_name (vec_dest);
5687   phi_stmt = create_phi_node (msq, containing_loop->header);
5688   add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);
5689 
5690   return msq;
5691 }
5692 
5693 
5694 /* Function vect_grouped_load_supported.
5695 
5696    COUNT is the size of the load group (the number of statements plus the
5697    number of gaps).  SINGLE_ELEMENT_P is true if there is actually
5698    only one statement, with a gap of COUNT - 1.
5699 
5700    Returns true if a suitable permute exists.  */
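/* For example (an illustrative case, not an additional requirement): with a
   V4SI vector mode and COUNT == 2, the permutations checked below are the
   extract-even selector { 0, 2, 4, 6 } and the extract-odd selector
   { 1, 3, 5, 7 }.  */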
5701 
5702 bool
5703 vect_grouped_load_supported (tree vectype, bool single_element_p,
5704 			     unsigned HOST_WIDE_INT count)
5705 {
5706   machine_mode mode = TYPE_MODE (vectype);
5707 
5708   /* If this is single-element interleaving with an element distance
5709      that leaves unused vector loads around, punt - we at least create
5710      very sub-optimal code in that case (and blow up memory,
5711      see PR65518).  */
5712   if (single_element_p && maybe_gt (count, TYPE_VECTOR_SUBPARTS (vectype)))
5713     {
5714       if (dump_enabled_p ())
5715 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5716 			 "single-element interleaving not supported "
5717 			 "for non-adjacent vector loads\n");
5718       return false;
5719     }
5720 
5721   /* vect_permute_load_chain requires the group size to be equal to 3 or
5722      be a power of two.  */
5723   if (count != 3 && exact_log2 (count) == -1)
5724     {
5725       if (dump_enabled_p ())
5726 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5727 			 "the size of the group of accesses"
5728 			 " is not a power of 2 or not equal to 3\n");
5729       return false;
5730     }
5731 
5732   /* Check that the permutation is supported.  */
5733   if (VECTOR_MODE_P (mode))
5734     {
5735       unsigned int i, j;
5736       if (count == 3)
5737 	{
5738 	  unsigned int nelt;
5739 	  if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5740 	    {
5741 	      if (dump_enabled_p ())
5742 		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5743 				 "cannot handle groups of 3 loads for"
5744 				 " variable-length vectors\n");
5745 	      return false;
5746 	    }
5747 
5748 	  vec_perm_builder sel (nelt, nelt, 1);
5749 	  sel.quick_grow (nelt);
5750 	  vec_perm_indices indices;
5751 	  unsigned int k;
5752 	  for (k = 0; k < 3; k++)
5753 	    {
5754 	      for (i = 0; i < nelt; i++)
5755 		if (3 * i + k < 2 * nelt)
5756 		  sel[i] = 3 * i + k;
5757 		else
5758 		  sel[i] = 0;
5759 	      indices.new_vector (sel, 2, nelt);
5760 	      if (!can_vec_perm_const_p (mode, indices))
5761 		{
5762 		  if (dump_enabled_p ())
5763 		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5764 				     "shuffle of 3 loads is not supported by"
5765 				     " target\n");
5766 		  return false;
5767 		}
5768 	      for (i = 0, j = 0; i < nelt; i++)
5769 		if (3 * i + k < 2 * nelt)
5770 		  sel[i] = i;
5771 		else
5772 		  sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5773 	      indices.new_vector (sel, 2, nelt);
5774 	      if (!can_vec_perm_const_p (mode, indices))
5775 		{
5776 		  if (dump_enabled_p ())
5777 		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5778 				     "shuffle of 3 loads is not supported by"
5779 				     " target\n");
5780 		  return false;
5781 		}
5782 	    }
5783 	  return true;
5784 	}
5785       else
5786 	{
5787 	  /* If length is not equal to 3 then only a power of 2 is supported.  */
5788 	  gcc_assert (pow2p_hwi (count));
5789 	  poly_uint64 nelt = GET_MODE_NUNITS (mode);
5790 
5791 	  /* The encoding has a single stepped pattern.  */
5792 	  vec_perm_builder sel (nelt, 1, 3);
5793 	  sel.quick_grow (3);
5794 	  for (i = 0; i < 3; i++)
5795 	    sel[i] = i * 2;
5796 	  vec_perm_indices indices (sel, 2, nelt);
5797 	  if (can_vec_perm_const_p (mode, indices))
5798 	    {
5799 	      for (i = 0; i < 3; i++)
5800 		sel[i] = i * 2 + 1;
5801 	      indices.new_vector (sel, 2, nelt);
5802 	      if (can_vec_perm_const_p (mode, indices))
5803 		return true;
5804 	    }
5805         }
5806     }
5807 
5808   if (dump_enabled_p ())
5809     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5810 		     "extract even/odd not supported by target\n");
5811   return false;
5812 }
5813 
5814 /* Return TRUE if vec_{masked_}load_lanes is available for COUNT vectors of
5815    type VECTYPE.  MASKED_P says whether the masked form is needed.  */
5816 
5817 bool
5818 vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
5819 			   bool masked_p)
5820 {
5821   if (masked_p)
5822     return vect_lanes_optab_supported_p ("vec_mask_load_lanes",
5823 					 vec_mask_load_lanes_optab,
5824 					 vectype, count);
5825   else
5826     return vect_lanes_optab_supported_p ("vec_load_lanes",
5827 					 vec_load_lanes_optab,
5828 					 vectype, count);
5829 }
5830 
5831 /* Function vect_permute_load_chain.
5832 
5833    Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
5834    a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
5835    the input data correctly.  Return the final references for loads in
5836    RESULT_CHAIN.
5837 
5838    E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5839    The input is 4 vectors each containing 8 elements. We assign a number to each
5840    element, the input sequence is:
5841 
5842    1st vec:   0  1  2  3  4  5  6  7
5843    2nd vec:   8  9 10 11 12 13 14 15
5844    3rd vec:  16 17 18 19 20 21 22 23
5845    4th vec:  24 25 26 27 28 29 30 31
5846 
5847    The output sequence should be:
5848 
5849    1st vec:  0 4  8 12 16 20 24 28
5850    2nd vec:  1 5  9 13 17 21 25 29
5851    3rd vec:  2 6 10 14 18 22 26 30
5852    4th vec:  3 7 11 15 19 23 27 31
5853 
5854    i.e., the first output vector should contain the first elements of each
5855    interleaving group, etc.
5856 
5857    We use extract_even/odd instructions to create such output.  The input of
5858    each extract_even/odd operation is two vectors
5859    1st vec    2nd vec
5860    0 1 2 3    4 5 6 7
5861 
5862    and the output is the vector of extracted even/odd elements.  The output of
5863    extract_even will be:   0 2 4 6
5864    and of extract_odd:     1 3 5 7
5865 
5866 
5867    The permutation is done in log LENGTH stages.  In each stage extract_even
5868    and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
5869    their order.  In our example,
5870 
5871    E1: extract_even (1st vec, 2nd vec)
5872    E2: extract_odd (1st vec, 2nd vec)
5873    E3: extract_even (3rd vec, 4th vec)
5874    E4: extract_odd (3rd vec, 4th vec)
5875 
5876    The output for the first stage will be:
5877 
5878    E1:  0  2  4  6  8 10 12 14
5879    E2:  1  3  5  7  9 11 13 15
5880    E3: 16 18 20 22 24 26 28 30
5881    E4: 17 19 21 23 25 27 29 31
5882 
5883    In order to proceed and create the correct sequence for the next stage (or
5884    for the correct output, if the second stage is the last one, as in our
5885    example), we first put the output of extract_even operation and then the
5886    output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
5887    The input for the second stage is:
5888 
5889    1st vec (E1):  0  2  4  6  8 10 12 14
5890    2nd vec (E3): 16 18 20 22 24 26 28 30
5891    3rd vec (E2):  1  3  5  7  9 11 13 15
5892    4th vec (E4): 17 19 21 23 25 27 29 31
5893 
5894    The output of the second stage:
5895 
5896    E1: 0 4  8 12 16 20 24 28
5897    E2: 2 6 10 14 18 22 26 30
5898    E3: 1 5  9 13 17 21 25 29
5899    E4: 3 7 11 15 19 23 27 31
5900 
5901    And RESULT_CHAIN after reordering:
5902 
5903    1st vec (E1):  0 4  8 12 16 20 24 28
5904    2nd vec (E3):  1 5  9 13 17 21 25 29
5905    3rd vec (E2):  2 6 10 14 18 22 26 30
5906    4th vec (E4):  3 7 11 15 19 23 27 31.  */
5907 
5908 static void
vect_permute_load_chain(vec<tree> dr_chain,unsigned int length,stmt_vec_info stmt_info,gimple_stmt_iterator * gsi,vec<tree> * result_chain)5909 vect_permute_load_chain (vec<tree> dr_chain,
5910 			 unsigned int length,
5911 			 stmt_vec_info stmt_info,
5912 			 gimple_stmt_iterator *gsi,
5913 			 vec<tree> *result_chain)
5914 {
5915   tree data_ref, first_vect, second_vect;
5916   tree perm_mask_even, perm_mask_odd;
5917   tree perm3_mask_low, perm3_mask_high;
5918   gimple *perm_stmt;
5919   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5920   unsigned int i, j, log_length = exact_log2 (length);
5921 
5922   result_chain->quick_grow (length);
5923   memcpy (result_chain->address (), dr_chain.address (),
5924 	  length * sizeof (tree));
5925 
5926   if (length == 3)
5927     {
5928       /* vect_grouped_load_supported ensures that this is constant.  */
5929       unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5930       unsigned int k;
5931 
5932       vec_perm_builder sel (nelt, nelt, 1);
5933       sel.quick_grow (nelt);
5934       vec_perm_indices indices;
5935       for (k = 0; k < 3; k++)
5936 	{
5937 	  for (i = 0; i < nelt; i++)
5938 	    if (3 * i + k < 2 * nelt)
5939 	      sel[i] = 3 * i + k;
5940 	    else
5941 	      sel[i] = 0;
5942 	  indices.new_vector (sel, 2, nelt);
5943 	  perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5944 
5945 	  for (i = 0, j = 0; i < nelt; i++)
5946 	    if (3 * i + k < 2 * nelt)
5947 	      sel[i] = i;
5948 	    else
5949 	      sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5950 	  indices.new_vector (sel, 2, nelt);
5951 	  perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5952 
5953 	  first_vect = dr_chain[0];
5954 	  second_vect = dr_chain[1];
5955 
5956 	  /* Create interleaving stmt (low part of):
5957 	     low = VEC_PERM_EXPR <first_vect, second_vect, {k, 3 + k, 6 + k,
5958 							     ...}>  */
5959 	  data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5960 	  perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5961 					   second_vect, perm3_mask_low);
5962 	  vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5963 
5964 	  /* Create interleaving stmt (high part of):
5965 	     high = VEC_PERM_EXPR <low, third vect, {0, 1, 2, ..., then
5966 						      every 3rd element of third vect}>  */
5967 	  first_vect = data_ref;
5968 	  second_vect = dr_chain[2];
5969 	  data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5970 	  perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5971 					   second_vect, perm3_mask_high);
5972 	  vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5973 	  (*result_chain)[k] = data_ref;
5974 	}
5975     }
5976   else
5977     {
5978       /* If length is not equal to 3 then only power of 2 is supported.  */
5979       gcc_assert (pow2p_hwi (length));
5980 
5981       /* The encoding has a single stepped pattern.  */
5982       poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
5983       vec_perm_builder sel (nelt, 1, 3);
5984       sel.quick_grow (3);
5985       for (i = 0; i < 3; ++i)
5986 	sel[i] = i * 2;
5987       vec_perm_indices indices (sel, 2, nelt);
5988       perm_mask_even = vect_gen_perm_mask_checked (vectype, indices);
5989 
5990       for (i = 0; i < 3; ++i)
5991 	sel[i] = i * 2 + 1;
5992       indices.new_vector (sel, 2, nelt);
5993       perm_mask_odd = vect_gen_perm_mask_checked (vectype, indices);
5994 
5995       for (i = 0; i < log_length; i++)
5996 	{
5997 	  for (j = 0; j < length; j += 2)
5998 	    {
5999 	      first_vect = dr_chain[j];
6000 	      second_vect = dr_chain[j+1];
6001 
6002 	      /* data_ref = permute_even (first_data_ref, second_data_ref);  */
6003 	      data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
6004 	      perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6005 					       first_vect, second_vect,
6006 					       perm_mask_even);
6007 	      vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6008 	      (*result_chain)[j/2] = data_ref;
6009 
6010 	      /* data_ref = permute_odd (first_data_ref, second_data_ref);  */
6011 	      data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
6012 	      perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6013 					       first_vect, second_vect,
6014 					       perm_mask_odd);
6015 	      vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6016 	      (*result_chain)[j/2+length/2] = data_ref;
6017 	    }
6018 	  memcpy (dr_chain.address (), result_chain->address (),
6019 		  length * sizeof (tree));
6020 	}
6021     }
6022 }
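
/* Illustrative standalone sketch, guarded out with #if 0 and not part of GCC:
   a scalar model of the extract-even/odd recursion implemented above, for
   LENGTH == 4 vectors of NELT == 8 elements, matching the numbers worked
   through in the function comment.  All names here are hypothetical
   scaffolding.  */
#if 0
#include <stdio.h>
#include <string.h>

#define NELT 8
#define LENGTH 4

/* Model of the extract_even/extract_odd permutations on the concatenation
   of A and B: EVEN gets lanes 0, 2, 4, ... and ODD gets lanes 1, 3, 5, ...  */
static void
extract_even_odd (const int *a, const int *b, int *even, int *odd)
{
  for (int i = 0; i < NELT; i++)
    {
      int lane = 2 * i;
      even[i] = lane < NELT ? a[lane] : b[lane - NELT];
      lane = 2 * i + 1;
      odd[i] = lane < NELT ? a[lane] : b[lane - NELT];
    }
}

int
main (void)
{
  int chain[LENGTH][NELT], result[LENGTH][NELT];
  for (int v = 0; v < LENGTH; v++)
    for (int i = 0; i < NELT; i++)
      chain[v][i] = v * NELT + i;

  /* log2 (LENGTH) == 2 stages; evens are stored before odds, exactly as
     vect_permute_load_chain fills RESULT_CHAIN.  */
  for (int stage = 0; stage < 2; stage++)
    {
      for (int j = 0; j < LENGTH; j += 2)
	extract_even_odd (chain[j], chain[j + 1],
			  result[j / 2], result[j / 2 + LENGTH / 2]);
      memcpy (chain, result, sizeof chain);
    }

  /* Prints 0 4 8 ... / 1 5 9 ... / 2 6 10 ... / 3 7 11 ..., the final
     RESULT_CHAIN order derived in the comment above the function.  */
  for (int v = 0; v < LENGTH; v++)
    {
      for (int i = 0; i < NELT; i++)
	printf (" %2d", chain[v][i]);
      printf ("\n");
    }
  return 0;
}
#endif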
6023 
6024 /* Function vect_shift_permute_load_chain.
6025 
6026    Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate
6027    sequence of stmts to reorder the input data accordingly.
6028    Return the final references for loads in RESULT_CHAIN.
6029    Return true if successed, false otherwise.
6030 
6031    E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
6032    The input is 3 vectors each containing 8 elements.  We assign a
6033    number to each element, the input sequence is:
6034 
6035    1st vec:   0  1  2  3  4  5  6  7
6036    2nd vec:   8  9 10 11 12 13 14 15
6037    3rd vec:  16 17 18 19 20 21 22 23
6038 
6039    The output sequence should be:
6040 
6041    1st vec:  0 3 6  9 12 15 18 21
6042    2nd vec:  1 4 7 10 13 16 19 22
6043    3rd vec:  2 5 8 11 14 17 20 23
6044 
6045    We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.
6046 
6047    First we shuffle all 3 vectors to get correct elements order:
6048 
6049    1st vec:  ( 0  3  6) ( 1  4  7) ( 2  5)
6050    2nd vec:  ( 8 11 14) ( 9 12 15) (10 13)
6051    3rd vec:  (16 19 22) (17 20 23) (18 21)
6052 
6053    Next we unite and shift vector 3 times:
6054 
6055    1st step:
6056      shift right by 6 the concatenation of:
6057      "1st vec" and  "2nd vec"
6058        ( 0  3  6) ( 1  4  7) |( 2  5) _ ( 8 11 14) ( 9 12 15)| (10 13)
6059      "2nd vec" and  "3rd vec"
6060        ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
6061      "3rd vec" and  "1st vec"
6062        (16 19 22) (17 20 23) |(18 21) _ ( 0  3  6) ( 1  4  7)| ( 2  5)
6063 			     | New vectors                   |
6064 
6065      So that now new vectors are:
6066 
6067      1st vec:  ( 2  5) ( 8 11 14) ( 9 12 15)
6068      2nd vec:  (10 13) (16 19 22) (17 20 23)
6069      3rd vec:  (18 21) ( 0  3  6) ( 1  4  7)
6070 
6071    2nd step:
6072      shift right by 5 the concatenation of:
6073      "1st vec" and  "3rd vec"
6074        ( 2  5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0  3  6)| ( 1  4  7)
6075      "2nd vec" and  "1st vec"
6076        (10 13) (16 19 22) |(17 20 23) _ ( 2  5) ( 8 11 14)| ( 9 12 15)
6077      "3rd vec" and  "2nd vec"
6078        (18 21) ( 0  3  6) |( 1  4  7) _ (10 13) (16 19 22)| (17 20 23)
6079 			  | New vectors                   |
6080 
6081      So that now new vectors are:
6082 
6083      1st vec:  ( 9 12 15) (18 21) ( 0  3  6)
6084      2nd vec:  (17 20 23) ( 2  5) ( 8 11 14)
6085      3rd vec:  ( 1  4  7) (10 13) (16 19 22) READY
6086 
6087    3rd step:
6088      shift right by 5 the concatenation of:
6089      "1st vec" and  "1st vec"
6090        ( 9 12 15) (18 21) |( 0  3  6) _ ( 9 12 15) (18 21)| ( 0  3  6)
6091      shift right by 3 the concatenation of:
6092      "2nd vec" and  "2nd vec"
6093                (17 20 23) |( 2  5) ( 8 11 14) _ (17 20 23)| ( 2  5) ( 8 11 14)
6094 			  | New vectors                   |
6095 
6096      So that now all vectors are READY:
6097      1st vec:  ( 0  3  6) ( 9 12 15) (18 21)
6098      2nd vec:  ( 2  5) ( 8 11 14) (17 20 23)
6099      3rd vec:  ( 1  4  7) (10 13) (16 19 22)
6100 
6101    This algorithm is faster than the one in vect_permute_load_chain if:
6102      1.  "shift of a concatenation" is faster than general permutation.
6103 	 This is usually so.
6104      2.  The TARGET machine can't execute vector instructions in parallel.
6105 	 This is because each step of the algorithm depends on the previous one.
6106 	 The algorithm in vect_permute_load_chain is much more parallel.
6107 
6108    The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.
6109 */
6110 
6111 static bool
vect_shift_permute_load_chain(vec<tree> dr_chain,unsigned int length,stmt_vec_info stmt_info,gimple_stmt_iterator * gsi,vec<tree> * result_chain)6112 vect_shift_permute_load_chain (vec<tree> dr_chain,
6113 			       unsigned int length,
6114 			       stmt_vec_info stmt_info,
6115 			       gimple_stmt_iterator *gsi,
6116 			       vec<tree> *result_chain)
6117 {
6118   tree vect[3], vect_shift[3], data_ref, first_vect, second_vect;
6119   tree perm2_mask1, perm2_mask2, perm3_mask;
6120   tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask;
6121   gimple *perm_stmt;
6122 
6123   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6124   unsigned int i;
6125   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6126 
6127   unsigned HOST_WIDE_INT nelt, vf;
6128   if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nelt)
6129       || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&vf))
6130     /* Not supported for variable-length vectors.  */
6131     return false;
6132 
6133   vec_perm_builder sel (nelt, nelt, 1);
6134   sel.quick_grow (nelt);
6135 
6136   result_chain->quick_grow (length);
6137   memcpy (result_chain->address (), dr_chain.address (),
6138 	  length * sizeof (tree));
6139 
6140   if (pow2p_hwi (length) && vf > 4)
6141     {
6142       unsigned int j, log_length = exact_log2 (length);
6143       for (i = 0; i < nelt / 2; ++i)
6144 	sel[i] = i * 2;
6145       for (i = 0; i < nelt / 2; ++i)
6146 	sel[nelt / 2 + i] = i * 2 + 1;
6147       vec_perm_indices indices (sel, 2, nelt);
6148       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6149 	{
6150 	  if (dump_enabled_p ())
6151 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6152 			     "shuffle of 2 fields structure is not"
6153 			     " supported by target\n");
6154 	  return false;
6155 	}
6156       perm2_mask1 = vect_gen_perm_mask_checked (vectype, indices);
6157 
6158       for (i = 0; i < nelt / 2; ++i)
6159 	sel[i] = i * 2 + 1;
6160       for (i = 0; i < nelt / 2; ++i)
6161 	sel[nelt / 2 + i] = i * 2;
6162       indices.new_vector (sel, 2, nelt);
6163       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6164 	{
6165 	  if (dump_enabled_p ())
6166 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6167 			     "shuffle of 2 fields structure is not"
6168 			     " supported by target\n");
6169 	  return false;
6170 	}
6171       perm2_mask2 = vect_gen_perm_mask_checked (vectype, indices);
6172 
6173       /* Generating permutation constant to shift all elements.
6174 	 For vector length 8 it is {4 5 6 7 8 9 10 11}.  */
6175       for (i = 0; i < nelt; i++)
6176 	sel[i] = nelt / 2 + i;
6177       indices.new_vector (sel, 2, nelt);
6178       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6179 	{
6180 	  if (dump_enabled_p ())
6181 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6182 			     "shift permutation is not supported by target\n");
6183 	  return false;
6184 	}
6185       shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
6186 
6187       /* Generating permutation constant to select vector from 2.
6188 	 For vector length 8 it is {0 1 2 3 12 13 14 15}.  */
6189       for (i = 0; i < nelt / 2; i++)
6190 	sel[i] = i;
6191       for (i = nelt / 2; i < nelt; i++)
6192 	sel[i] = nelt + i;
6193       indices.new_vector (sel, 2, nelt);
6194       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6195 	{
6196 	  if (dump_enabled_p ())
6197 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6198 			     "select is not supported by target\n");
6199 	  return false;
6200 	}
6201       select_mask = vect_gen_perm_mask_checked (vectype, indices);
6202 
6203       for (i = 0; i < log_length; i++)
6204 	{
6205 	  for (j = 0; j < length; j += 2)
6206 	    {
6207 	      first_vect = dr_chain[j];
6208 	      second_vect = dr_chain[j + 1];
6209 
6210 	      data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
6211 	      perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6212 					       first_vect, first_vect,
6213 					       perm2_mask1);
6214 	      vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6215 	      vect[0] = data_ref;
6216 
6217 	      data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
6218 	      perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6219 					       second_vect, second_vect,
6220 					       perm2_mask2);
6221 	      vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6222 	      vect[1] = data_ref;
6223 
6224 	      data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift");
6225 	      perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6226 					       vect[0], vect[1], shift1_mask);
6227 	      vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6228 	      (*result_chain)[j/2 + length/2] = data_ref;
6229 
6230 	      data_ref = make_temp_ssa_name (vectype, NULL, "vect_select");
6231 	      perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6232 					       vect[0], vect[1], select_mask);
6233 	      vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6234 	      (*result_chain)[j/2] = data_ref;
6235 	    }
6236 	  memcpy (dr_chain.address (), result_chain->address (),
6237 		  length * sizeof (tree));
6238 	}
6239       return true;
6240     }
6241   if (length == 3 && vf > 2)
6242     {
6243       unsigned int k = 0, l = 0;
6244 
6245       /* Generating permutation constant to get all elements in right order.
6246 	 For vector length 8 it is {0 3 6 1 4 7 2 5}.  */
6247       for (i = 0; i < nelt; i++)
6248 	{
6249 	  if (3 * k + (l % 3) >= nelt)
6250 	    {
6251 	      k = 0;
6252 	      l += (3 - (nelt % 3));
6253 	    }
6254 	  sel[i] = 3 * k + (l % 3);
6255 	  k++;
6256 	}
6257       vec_perm_indices indices (sel, 2, nelt);
6258       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6259 	{
6260 	  if (dump_enabled_p ())
6261 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6262 			     "shuffle of 3 fields structure is not"
6263 			     " supported by target\n");
6264 	  return false;
6265 	}
6266       perm3_mask = vect_gen_perm_mask_checked (vectype, indices);
6267 
6268       /* Generating permutation constant to shift all elements.
6269 	 For vector length 8 it is {6 7 8 9 10 11 12 13}.  */
6270       for (i = 0; i < nelt; i++)
6271 	sel[i] = 2 * (nelt / 3) + (nelt % 3) + i;
6272       indices.new_vector (sel, 2, nelt);
6273       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6274 	{
6275 	  if (dump_enabled_p ())
6276 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6277 			     "shift permutation is not supported by target\n");
6278 	  return false;
6279 	}
6280       shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
6281 
6282       /* Generating permutation constant to shift all elements.
6283 	 For vector length 8 it is {5 6 7 8 9 10 11 12}.  */
6284       for (i = 0; i < nelt; i++)
6285 	sel[i] = 2 * (nelt / 3) + 1 + i;
6286       indices.new_vector (sel, 2, nelt);
6287       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6288 	{
6289 	  if (dump_enabled_p ())
6290 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6291 			     "shift permutation is not supported by target\n");
6292 	  return false;
6293 	}
6294       shift2_mask = vect_gen_perm_mask_checked (vectype, indices);
6295 
6296       /* Generating permutation constant to shift all elements.
6297 	 For vector length 8 it is {3 4 5 6 7 8 9 10}.  */
6298       for (i = 0; i < nelt; i++)
6299 	sel[i] = (nelt / 3) + (nelt % 3) / 2 + i;
6300       indices.new_vector (sel, 2, nelt);
6301       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6302 	{
6303 	  if (dump_enabled_p ())
6304 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6305 			     "shift permutation is not supported by target\n");
6306 	  return false;
6307 	}
6308       shift3_mask = vect_gen_perm_mask_checked (vectype, indices);
6309 
6310       /* Generating permutation constant to shift all elements.
6311 	 For vector length 8 it is {5 6 7 8 9 10 11 12}.  */
6312       for (i = 0; i < nelt; i++)
6313 	sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i;
6314       indices.new_vector (sel, 2, nelt);
6315       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6316 	{
6317 	  if (dump_enabled_p ())
6318 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6319 			     "shift permutation is not supported by target\n");
6320 	  return false;
6321 	}
6322       shift4_mask = vect_gen_perm_mask_checked (vectype, indices);
6323 
6324       for (k = 0; k < 3; k++)
6325 	{
6326 	  data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3");
6327 	  perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6328 					   dr_chain[k], dr_chain[k],
6329 					   perm3_mask);
6330 	  vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6331 	  vect[k] = data_ref;
6332 	}
6333 
6334       for (k = 0; k < 3; k++)
6335 	{
6336 	  data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1");
6337 	  perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6338 					   vect[k % 3], vect[(k + 1) % 3],
6339 					   shift1_mask);
6340 	  vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6341 	  vect_shift[k] = data_ref;
6342 	}
6343 
6344       for (k = 0; k < 3; k++)
6345 	{
6346 	  data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2");
6347 	  perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6348 					   vect_shift[(4 - k) % 3],
6349 					   vect_shift[(3 - k) % 3],
6350 					   shift2_mask);
6351 	  vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6352 	  vect[k] = data_ref;
6353 	}
6354 
6355       (*result_chain)[3 - (nelt % 3)] = vect[2];
6356 
6357       data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3");
6358       perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[0],
6359 				       vect[0], shift3_mask);
6360       vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6361       (*result_chain)[nelt % 3] = data_ref;
6362 
6363       data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4");
6364       perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[1],
6365 				       vect[1], shift4_mask);
6366       vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6367       (*result_chain)[0] = data_ref;
6368       return true;
6369     }
6370   return false;
6371 }
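
/* Illustrative standalone sketch, guarded out with #if 0 and not part of GCC:
   a scalar model of the LENGTH == 3 shift/permute sequence documented above,
   for NELT == 8.  It rebuilds the same masks ({0 3 6 1 4 7 2 5}, {6..13},
   {5..12}, {3..10}, {5..12}) and reproduces the final {0 3 6 ...} ordering
   from the function comment.  All names are hypothetical scaffolding.  */
#if 0
#include <stdio.h>

#define NELT 8

/* Scalar model of VEC_PERM_EXPR <A, B, SEL> on NELT-element vectors.  */
static void
vec_perm (const int *a, const int *b, const int *sel, int *out)
{
  for (int i = 0; i < NELT; i++)
    out[i] = sel[i] < NELT ? a[sel[i]] : b[sel[i] - NELT];
}

int
main (void)
{
  int dr_chain[3][NELT], vect[3][NELT], vect_shift[3][NELT];
  int result[3][NELT];
  int perm3[NELT], shift1[NELT], shift2[NELT], shift3[NELT], shift4[NELT];

  for (int k = 0; k < 3; k++)
    for (int i = 0; i < NELT; i++)
      dr_chain[k][i] = k * NELT + i;

  /* Same index formulas as vect_shift_permute_load_chain uses to build
     its permutation constants.  */
  for (int i = 0, k = 0, l = 0; i < NELT; i++, k++)
    {
      if (3 * k + (l % 3) >= NELT)
	{
	  k = 0;
	  l += 3 - (NELT % 3);
	}
      perm3[i] = 3 * k + (l % 3);
    }
  for (int i = 0; i < NELT; i++)
    {
      shift1[i] = 2 * (NELT / 3) + (NELT % 3) + i;
      shift2[i] = 2 * (NELT / 3) + 1 + i;
      shift3[i] = NELT / 3 + (NELT % 3) / 2 + i;
      shift4[i] = 2 * (NELT / 3) + (NELT % 3) / 2 + i;
    }

  /* Shuffle, then shift twice, then finish the two remaining vectors,
     mirroring the statement sequence generated above.  */
  for (int k = 0; k < 3; k++)
    vec_perm (dr_chain[k], dr_chain[k], perm3, vect[k]);
  for (int k = 0; k < 3; k++)
    vec_perm (vect[k % 3], vect[(k + 1) % 3], shift1, vect_shift[k]);
  for (int k = 0; k < 3; k++)
    vec_perm (vect_shift[(4 - k) % 3], vect_shift[(3 - k) % 3], shift2,
	      vect[k]);

  for (int i = 0; i < NELT; i++)
    result[3 - NELT % 3][i] = vect[2][i];
  vec_perm (vect[0], vect[0], shift3, result[NELT % 3]);
  vec_perm (vect[1], vect[1], shift4, result[0]);

  /* Prints 0 3 6 ... / 1 4 7 ... / 2 5 8 ..., the order derived in the
     function comment.  */
  for (int k = 0; k < 3; k++)
    {
      for (int i = 0; i < NELT; i++)
	printf (" %2d", result[k][i]);
      printf ("\n");
    }
  return 0;
}
#endif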
6372 
6373 /* Function vect_transform_grouped_load.
6374 
6375    Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
6376    to perform their permutation and ascribe the resulting vectorized statements to
6377    the scalar statements.
6378 */
6379 
6380 void
vect_transform_grouped_load(stmt_vec_info stmt_info,vec<tree> dr_chain,int size,gimple_stmt_iterator * gsi)6381 vect_transform_grouped_load (stmt_vec_info stmt_info, vec<tree> dr_chain,
6382 			     int size, gimple_stmt_iterator *gsi)
6383 {
6384   machine_mode mode;
6385   vec<tree> result_chain = vNULL;
6386 
6387   /* DR_CHAIN contains input data-refs that are a part of the interleaving.
6388      RESULT_CHAIN is the output of vect_permute_load_chain, which contains the
6389      permuted vectors that are ready for vector computation.  */
6390   result_chain.create (size);
6391 
6392   /* If the reassociation width for the vector type is 2 or greater, the target
6393      machine can execute 2 or more vector instructions in parallel.  Otherwise
6394      try to get the load-group chain using vect_shift_permute_load_chain.  */
6395   mode = TYPE_MODE (STMT_VINFO_VECTYPE (stmt_info));
6396   if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1
6397       || pow2p_hwi (size)
6398       || !vect_shift_permute_load_chain (dr_chain, size, stmt_info,
6399 					 gsi, &result_chain))
6400     vect_permute_load_chain (dr_chain, size, stmt_info, gsi, &result_chain);
6401   vect_record_grouped_load_vectors (stmt_info, result_chain);
6402   result_chain.release ();
6403 }
6404 
6405 /* RESULT_CHAIN contains the output of a group of grouped loads that were
6406    generated as part of the vectorization of STMT_INFO.  Assign the statement
6407    for each vector to the associated scalar statement.  */
6408 
6409 void
vect_record_grouped_load_vectors(stmt_vec_info stmt_info,vec<tree> result_chain)6410 vect_record_grouped_load_vectors (stmt_vec_info stmt_info,
6411 				  vec<tree> result_chain)
6412 {
6413   vec_info *vinfo = stmt_info->vinfo;
6414   stmt_vec_info first_stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
6415   unsigned int i, gap_count;
6416   tree tmp_data_ref;
6417 
6418   /* Put a permuted data-ref in the VECTORIZED_STMT field.
6419      Since we scan the chain starting from its first node, their order
6420      corresponds to the order of data-refs in RESULT_CHAIN.  */
6421   stmt_vec_info next_stmt_info = first_stmt_info;
6422   gap_count = 1;
6423   FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
6424     {
6425       if (!next_stmt_info)
6426 	break;
6427 
6428       /* Skip the gaps.  Loads created for the gaps will be removed by the dead
6429        code elimination pass later.  No need to check for the first stmt in
6430        the group, since it always exists.
6431        DR_GROUP_GAP is the number of steps in elements from the previous
6432        access (if there is no gap DR_GROUP_GAP is 1).  We skip loads that
6433        correspond to the gaps.  */
6434       if (next_stmt_info != first_stmt_info
6435 	  && gap_count < DR_GROUP_GAP (next_stmt_info))
6436 	{
6437 	  gap_count++;
6438 	  continue;
6439 	}
6440 
6441       /* ???  The following needs cleanup after the removal of
6442          DR_GROUP_SAME_DR_STMT.  */
6443       if (next_stmt_info)
6444         {
6445 	  stmt_vec_info new_stmt_info = vinfo->lookup_def (tmp_data_ref);
6446 	  /* We assume that if VEC_STMT is not NULL, this is a case of multiple
6447 	     copies, and we put the new vector statement in the first available
6448 	     RELATED_STMT.  */
6449 	  if (!STMT_VINFO_VEC_STMT (next_stmt_info))
6450 	    STMT_VINFO_VEC_STMT (next_stmt_info) = new_stmt_info;
6451 	  else
6452             {
6453 	      stmt_vec_info prev_stmt_info
6454 		= STMT_VINFO_VEC_STMT (next_stmt_info);
6455 	      stmt_vec_info rel_stmt_info
6456 		= STMT_VINFO_RELATED_STMT (prev_stmt_info);
6457 	      while (rel_stmt_info)
6458 		{
6459 		  prev_stmt_info = rel_stmt_info;
6460 		  rel_stmt_info = STMT_VINFO_RELATED_STMT (rel_stmt_info);
6461 		}
6462 
6463 	      STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt_info;
6464             }
6465 
6466 	  next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
6467 	  gap_count = 1;
6468         }
6469     }
6470 }
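
/* Illustrative standalone sketch, guarded out with #if 0 and not part of GCC:
   a scalar model of the gap-skipping walk above.  It assumes a hypothetical
   load group A, B, C whose DR_GROUP_GAP values are 1, 2 and 1 (one unused
   element sits between A and B) and four permuted vectors in RESULT_CHAIN,
   and prints which vector each member receives and which vector is a dead
   gap load.  */
#if 0
#include <stdio.h>

struct member
{
  const char *name;
  unsigned int gap;		/* DR_GROUP_GAP: steps from previous access.  */
};

int
main (void)
{
  struct member group[] = { { "A", 1 }, { "B", 2 }, { "C", 1 } };
  const unsigned int nmembers = 3, nvectors = 4;
  unsigned int next = 0, gap_count = 1;

  for (unsigned int i = 0; i < nvectors; i++)
    {
      if (next >= nmembers)
	break;
      /* Same test as the loop above: skip vectors that correspond to gaps
	 between group members (never for the first member).  */
      if (next != 0 && gap_count < group[next].gap)
	{
	  gap_count++;
	  printf ("vector %u: dead gap load\n", i);
	  continue;
	}
      printf ("vector %u: vectorized copy for %s\n", i, group[next].name);
      next++;
      gap_count = 1;
    }
  return 0;
}
#endif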
6471 
6472 /* Function vect_can_force_dr_alignment_p.
6473 
6474    Returns whether the alignment of a DECL can be forced to be aligned
6475    on an ALIGNMENT-bit boundary.  */
6476 
6477 bool
vect_can_force_dr_alignment_p(const_tree decl,poly_uint64 alignment)6478 vect_can_force_dr_alignment_p (const_tree decl, poly_uint64 alignment)
6479 {
6480   if (!VAR_P (decl))
6481     return false;
6482 
6483   if (decl_in_symtab_p (decl)
6484       && !symtab_node::get (decl)->can_increase_alignment_p ())
6485     return false;
6486 
6487   if (TREE_STATIC (decl))
6488     return (known_le (alignment,
6489 		      (unsigned HOST_WIDE_INT) MAX_OFILE_ALIGNMENT));
6490   else
6491     return (known_le (alignment, (unsigned HOST_WIDE_INT) MAX_STACK_ALIGNMENT));
6492 }
6493 
6494 
6495 /* Return whether the data reference DR_INFO is supported with respect to its
6496    alignment.
6497    If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even
6498    if it is aligned, i.e., check if it is possible to vectorize it with different
6499    alignment.  */
6500 
6501 enum dr_alignment_support
vect_supportable_dr_alignment(dr_vec_info * dr_info,bool check_aligned_accesses)6502 vect_supportable_dr_alignment (dr_vec_info *dr_info,
6503                                bool check_aligned_accesses)
6504 {
6505   data_reference *dr = dr_info->dr;
6506   stmt_vec_info stmt_info = dr_info->stmt;
6507   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6508   machine_mode mode = TYPE_MODE (vectype);
6509   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6510   class loop *vect_loop = NULL;
6511   bool nested_in_vect_loop = false;
6512 
6513   if (aligned_access_p (dr_info) && !check_aligned_accesses)
6514     return dr_aligned;
6515 
6516   /* For now assume all conditional loads/stores support unaligned
6517      access without any special code.  */
6518   if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
6519     if (gimple_call_internal_p (stmt)
6520 	&& (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
6521 	    || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
6522       return dr_unaligned_supported;
6523 
6524   if (loop_vinfo)
6525     {
6526       vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
6527       nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt_info);
6528     }
6529 
6530   /* Possibly unaligned access.  */
6531 
6532   /* We can choose between using the implicit realignment scheme (generating
6533      a misaligned_move stmt) and the explicit realignment scheme (generating
6534      aligned loads with a REALIGN_LOAD).  There are two variants to the
6535      explicit realignment scheme: optimized, and unoptimized.
6536      We can optimize the realignment only if the step between consecutive
6537      vector loads is equal to the vector size.  Since the vector memory
6538      accesses advance in steps of VS (Vector Size) in the vectorized loop, it
6539      is guaranteed that the misalignment amount remains the same throughout the
6540      execution of the vectorized loop.  Therefore, we can create the
6541      "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
6542      at the loop preheader.
6543 
6544      However, in the case of outer-loop vectorization, when vectorizing a
6545      memory access in the inner-loop nested within the LOOP that is now being
6546      vectorized, while it is guaranteed that the misalignment of the
6547      vectorized memory access will remain the same in different outer-loop
6548      iterations, it is *not* guaranteed that is will remain the same throughout
6549      the execution of the inner-loop.  This is because the inner-loop advances
6550      with the original scalar step (and not in steps of VS).  If the inner-loop
6551      step happens to be a multiple of VS, then the misalignment remains fixed
6552      and we can use the optimized realignment scheme.  For example:
6553 
6554       for (i=0; i<N; i++)
6555         for (j=0; j<M; j++)
6556           s += a[i+j];
6557 
6558      When vectorizing the i-loop in the above example, the step between
6559      consecutive vector loads is 1, and so the misalignment does not remain
6560      fixed across the execution of the inner-loop, and the realignment cannot
6561      be optimized (as illustrated in the following pseudo vectorized loop):
6562 
6563       for (i=0; i<N; i+=4)
6564         for (j=0; j<M; j++){
6565           vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
6566                          // when j is {0,1,2,3,4,5,6,7,...} respectively.
6567                          // (assuming that we start from an aligned address).
6568           }
6569 
6570      We therefore have to use the unoptimized realignment scheme:
6571 
6572       for (i=0; i<N; i+=4)
6573           for (j=k; j<M; j+=4)
6574           vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
6575                            // that the misalignment of the initial address is
6576                            // 0).
6577 
6578      The loop can then be vectorized as follows:
6579 
6580       for (k=0; k<4; k++){
6581         rt = get_realignment_token (&vp[k]);
6582         for (i=0; i<N; i+=4){
6583           v1 = vp[i+k];
6584           for (j=k; j<M; j+=4){
6585             v2 = vp[i+j+VS-1];
6586             va = REALIGN_LOAD <v1,v2,rt>;
6587             vs += va;
6588             v1 = v2;
6589           }
6590         }
6591     } */
6592 
6593   if (DR_IS_READ (dr))
6594     {
6595       bool is_packed = false;
6596       tree type = (TREE_TYPE (DR_REF (dr)));
6597 
6598       if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
6599 	  && (!targetm.vectorize.builtin_mask_for_load
6600 	      || targetm.vectorize.builtin_mask_for_load ()))
6601 	{
6602 	  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6603 
6604 	  /* If we are doing SLP then the accesses need not have the
6605 	     same alignment; instead it depends on the SLP group size.  */
6606 	  if (loop_vinfo
6607 	      && STMT_SLP_TYPE (stmt_info)
6608 	      && !multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
6609 			      * (DR_GROUP_SIZE
6610 				 (DR_GROUP_FIRST_ELEMENT (stmt_info))),
6611 			      TYPE_VECTOR_SUBPARTS (vectype)))
6612 	    ;
6613 	  else if (!loop_vinfo
6614 		   || (nested_in_vect_loop
6615 		       && maybe_ne (TREE_INT_CST_LOW (DR_STEP (dr)),
6616 				    GET_MODE_SIZE (TYPE_MODE (vectype)))))
6617 	    return dr_explicit_realign;
6618 	  else
6619 	    return dr_explicit_realign_optimized;
6620 	}
6621       if (!known_alignment_for_access_p (dr_info))
6622 	is_packed = not_size_aligned (DR_REF (dr));
6623 
6624       if (targetm.vectorize.support_vector_misalignment
6625 	    (mode, type, DR_MISALIGNMENT (dr_info), is_packed))
6626 	/* Can't software pipeline the loads, but can at least do them.  */
6627 	return dr_unaligned_supported;
6628     }
6629   else
6630     {
6631       bool is_packed = false;
6632       tree type = (TREE_TYPE (DR_REF (dr)));
6633 
6634       if (!known_alignment_for_access_p (dr_info))
6635 	is_packed = not_size_aligned (DR_REF (dr));
6636 
6637      if (targetm.vectorize.support_vector_misalignment
6638 	   (mode, type, DR_MISALIGNMENT (dr_info), is_packed))
6639        return dr_unaligned_supported;
6640     }
6641 
6642   /* Unsupported.  */
6643   return dr_unaligned_unsupported;
6644 }
6645
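/* Illustrative standalone sketch, guarded out with #if 0 and not part of GCC:
   a scalar model of the optimized explicit realignment scheme described in
   the comment inside vect_supportable_dr_alignment.  VS stands for the
   vector size, K for the loop-invariant misalignment; "aligned_load" and
   "realign" are hypothetical stand-ins for the aligned vector load and
   REALIGN_LOAD.  */
#if 0
#include <stdio.h>

#define VS 4	/* Elements per vector.  */
#define N 16	/* Elements to process; a multiple of VS.  */
#define K 3	/* Constant misalignment, 0 <= K < VS.  */

/* Aligned load of the vector starting at element POS (POS % VS == 0).  */
static void
aligned_load (const int *a, int pos, int *v)
{
  for (int i = 0; i < VS; i++)
    v[i] = a[pos + i];
}

/* REALIGN_LOAD <V1, V2, RT>: take VS elements of the concatenation of V1
   and V2 starting at offset RT, the "realignment token".  */
static void
realign (const int *v1, const int *v2, int rt, int *va)
{
  for (int i = 0; i < VS; i++)
    va[i] = rt + i < VS ? v1[rt + i] : v2[rt + i - VS];
}

int
main (void)
{
  int a[N + VS], v1[VS], v2[VS], va[VS];
  for (int i = 0; i < N + VS; i++)
    a[i] = i;

  /* Because the misalignment K is loop-invariant, V2 of one iteration is
     reused as V1 of the next: one aligned load per unaligned vector.  */
  aligned_load (a, 0, v1);
  for (int i = 0; i < N; i += VS)
    {
      aligned_load (a, i + VS, v2);
      realign (v1, v2, K, va);	/* VA now holds a[i + K .. i + K + VS - 1].  */
      for (int j = 0; j < VS; j++)
	printf (" %d", va[j]);
      printf ("\n");
      for (int j = 0; j < VS; j++)
	v1[j] = v2[j];
    }
  return 0;
}
#endif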