1 /* Data References Analysis and Manipulation Utilities for Vectorization.
2 Copyright (C) 2003-2022 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
5
6 This file is part of GCC.
7
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
12
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
21
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "predict.h"
31 #include "memmodel.h"
32 #include "tm_p.h"
33 #include "ssa.h"
34 #include "optabs-tree.h"
35 #include "cgraph.h"
36 #include "dumpfile.h"
37 #include "alias.h"
38 #include "fold-const.h"
39 #include "stor-layout.h"
40 #include "tree-eh.h"
41 #include "gimplify.h"
42 #include "gimple-iterator.h"
43 #include "gimplify-me.h"
44 #include "tree-ssa-loop-ivopts.h"
45 #include "tree-ssa-loop-manip.h"
46 #include "tree-ssa-loop.h"
47 #include "cfgloop.h"
48 #include "tree-scalar-evolution.h"
49 #include "tree-vectorizer.h"
50 #include "expr.h"
51 #include "builtins.h"
52 #include "tree-cfg.h"
53 #include "tree-hash-traits.h"
54 #include "vec-perm-indices.h"
55 #include "internal-fn.h"
56 #include "gimple-fold.h"
57
58 /* Return true if load- or store-lanes optab OPTAB is implemented for
59 COUNT vectors of type VECTYPE. NAME is the name of OPTAB. */
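
/* As an illustrative sketch (not tied to any particular target): with
   COUNT == 3 and a four-lane int VECTYPE this asks whether the target can
   load or store an array of three such vectors as one interleaving or
   de-interleaving operation, in the style of AArch64 LD3/ST3.  If the
   target defines no array mode for the pair, an integer mode of
   3 * GET_MODE_BITSIZE (V4SImode) bits is tried instead.  */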
60
61 static bool
62 vect_lanes_optab_supported_p (const char *name, convert_optab optab,
63 tree vectype, unsigned HOST_WIDE_INT count)
64 {
65 machine_mode mode, array_mode;
66 bool limit_p;
67
68 mode = TYPE_MODE (vectype);
69 if (!targetm.array_mode (mode, count).exists (&array_mode))
70 {
71 poly_uint64 bits = count * GET_MODE_BITSIZE (mode);
72 limit_p = !targetm.array_mode_supported_p (mode, count);
73 if (!int_mode_for_size (bits, limit_p).exists (&array_mode))
74 {
75 if (dump_enabled_p ())
76 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
77 "no array mode for %s[%wu]\n",
78 GET_MODE_NAME (mode), count);
79 return false;
80 }
81 }
82
83 if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
84 {
85 if (dump_enabled_p ())
86 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
87 "cannot use %s<%s><%s>\n", name,
88 GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
89 return false;
90 }
91
92 if (dump_enabled_p ())
93 dump_printf_loc (MSG_NOTE, vect_location,
94 "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode),
95 GET_MODE_NAME (mode));
96
97 return true;
98 }
99
100
101 /* Return the smallest scalar part of STMT_INFO.
102 This is used to determine the vectype of the stmt. We generally set the
103 vectype according to the type of the result (lhs). For stmts whose
104 result-type is different than the type of the arguments (e.g., demotion,
105 promotion), vectype will be reset appropriately (later). Note that we have
106 to visit the smallest datatype in this function, because that determines the
107 VF. If the smallest datatype in the loop is present only as the rhs of a
108 promotion operation - we'd miss it.
109 Such a case, where a variable of this datatype does not appear in the lhs
110 anywhere in the loop, can only occur if it's an invariant: e.g.:
111 'int_x = (int) short_inv', which we'd expect to have been optimized away by
112 invariant motion. However, we cannot rely on invariant motion to always
113 take invariants out of the loop, and so in the case of promotion we also
114 have to check the rhs. */
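
/* For example (an illustrative C loop, not taken from the testsuite):

     short *b; int *a;
     for (i = 0; i < n; i++)
       a[i] = (int) b[i];

   the smallest scalar type involved is 'short', so with 128-bit vectors
   the VF is 8, even though the result type 'int' on its own would only
   suggest 4.  */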
117
118 tree
119 vect_get_smallest_scalar_type (stmt_vec_info stmt_info, tree scalar_type)
120 {
121 HOST_WIDE_INT lhs, rhs;
122
123 /* During the analysis phase, this function is called on arbitrary
124 statements that might not have scalar results. */
125 if (!tree_fits_uhwi_p (TYPE_SIZE_UNIT (scalar_type)))
126 return scalar_type;
127
128 lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
129
130 gassign *assign = dyn_cast <gassign *> (stmt_info->stmt);
131 if (assign)
132 {
133 scalar_type = TREE_TYPE (gimple_assign_lhs (assign));
134 if (gimple_assign_cast_p (assign)
135 || gimple_assign_rhs_code (assign) == DOT_PROD_EXPR
136 || gimple_assign_rhs_code (assign) == WIDEN_SUM_EXPR
137 || gimple_assign_rhs_code (assign) == WIDEN_MULT_EXPR
138 || gimple_assign_rhs_code (assign) == WIDEN_LSHIFT_EXPR
139 || gimple_assign_rhs_code (assign) == WIDEN_PLUS_EXPR
140 || gimple_assign_rhs_code (assign) == WIDEN_MINUS_EXPR
141 || gimple_assign_rhs_code (assign) == FLOAT_EXPR)
142 {
143 tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (assign));
144
145 rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
146 if (rhs < lhs)
147 scalar_type = rhs_type;
148 }
149 }
150 else if (gcall *call = dyn_cast <gcall *> (stmt_info->stmt))
151 {
152 unsigned int i = 0;
153 if (gimple_call_internal_p (call))
154 {
155 internal_fn ifn = gimple_call_internal_fn (call);
156 if (internal_load_fn_p (ifn))
157 /* For loads the LHS type does the trick. */
158 i = ~0U;
159 else if (internal_store_fn_p (ifn))
160 {
161 /* For stores use the type of the stored value. */
162 i = internal_fn_stored_value_index (ifn);
163 scalar_type = TREE_TYPE (gimple_call_arg (call, i));
164 i = ~0U;
165 }
166 else if (internal_fn_mask_index (ifn) == 0)
167 i = 1;
168 }
169 if (i < gimple_call_num_args (call))
170 {
171 tree rhs_type = TREE_TYPE (gimple_call_arg (call, i));
172 if (tree_fits_uhwi_p (TYPE_SIZE_UNIT (rhs_type)))
173 {
174 rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
175 if (rhs < lhs)
176 scalar_type = rhs_type;
177 }
178 }
179 }
180
181 return scalar_type;
182 }
183
184
185 /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
186 tested at run-time. Return TRUE if DDR was successfully inserted.
187 Return false if versioning is not supported. */
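
/* A hedged sketch of the effect: for

     void f (int *p, int *q, int n)
     {
       for (int i = 0; i < n; i++)
         p[i] = q[i] + 1;
     }

   the DDR between *p and *q is recorded here, and the loop is later
   versioned under a runtime test along the lines of
   p + n <= q || q + n <= p, with the vectorized copy guarded by it.  */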
188
189 static opt_result
190 vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
191 {
192 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
193
194 if ((unsigned) param_vect_max_version_for_alias_checks == 0)
195 return opt_result::failure_at (vect_location,
196 "will not create alias checks, as"
197 " --param vect-max-version-for-alias-checks"
198 " == 0\n");
199
200 opt_result res
201 = runtime_alias_check_p (ddr, loop,
202 optimize_loop_nest_for_speed_p (loop));
203 if (!res)
204 return res;
205
206 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
207 return opt_result::success ();
208 }
209
210 /* Record that loop LOOP_VINFO needs to check that VALUE is nonzero. */
211
212 static void
213 vect_check_nonzero_value (loop_vec_info loop_vinfo, tree value)
214 {
215 const vec<tree> &checks = LOOP_VINFO_CHECK_NONZERO (loop_vinfo);
216 for (unsigned int i = 0; i < checks.length(); ++i)
217 if (checks[i] == value)
218 return;
219
220 if (dump_enabled_p ())
221 dump_printf_loc (MSG_NOTE, vect_location,
222 "need run-time check that %T is nonzero\n",
223 value);
224 LOOP_VINFO_CHECK_NONZERO (loop_vinfo).safe_push (value);
225 }
226
227 /* Return true if we know that the order of vectorized DR_INFO_A and
228 vectorized DR_INFO_B will be the same as the order of DR_INFO_A and
229 DR_INFO_B. At least one of the accesses is a write. */
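
/* Illustrative example (not from the testsuite): for the scalar sequence

     a[i]   = x;    <-- first scalar store of the group
     t      = b[i];
     a[i+1] = y;    <-- last scalar store of the group

   the vectorized store of { a[i], a[i+1] } is emitted at the position of
   the last scalar store, i.e. after the load of b[i].  The scalar a[i]
   store precedes the load but its vectorized form does not, so for that
   pair the function below returns false.  */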
230
231 static bool
232 vect_preserves_scalar_order_p (dr_vec_info *dr_info_a, dr_vec_info *dr_info_b)
233 {
234 stmt_vec_info stmtinfo_a = dr_info_a->stmt;
235 stmt_vec_info stmtinfo_b = dr_info_b->stmt;
236
237 /* Single statements are always kept in their original order. */
238 if (!STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
239 && !STMT_VINFO_GROUPED_ACCESS (stmtinfo_b))
240 return true;
241
242 /* STMT_A and STMT_B belong to overlapping groups. All loads are
243 emitted at the position of the first scalar load.
244 Stores in a group are emitted at the position of the last scalar store.
245 Compute that position and check whether the resulting order matches
246 the current one. */
247 stmt_vec_info il_a = DR_GROUP_FIRST_ELEMENT (stmtinfo_a);
248 if (il_a)
249 {
250 if (DR_IS_WRITE (STMT_VINFO_DATA_REF (stmtinfo_a)))
251 for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_a); s;
252 s = DR_GROUP_NEXT_ELEMENT (s))
253 il_a = get_later_stmt (il_a, s);
254 else /* DR_IS_READ */
255 for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_a); s;
256 s = DR_GROUP_NEXT_ELEMENT (s))
257 if (get_later_stmt (il_a, s) == il_a)
258 il_a = s;
259 }
260 else
261 il_a = stmtinfo_a;
262 stmt_vec_info il_b = DR_GROUP_FIRST_ELEMENT (stmtinfo_b);
263 if (il_b)
264 {
265 if (DR_IS_WRITE (STMT_VINFO_DATA_REF (stmtinfo_b)))
266 for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_b); s;
267 s = DR_GROUP_NEXT_ELEMENT (s))
268 il_b = get_later_stmt (il_b, s);
269 else /* DR_IS_READ */
270 for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_b); s;
271 s = DR_GROUP_NEXT_ELEMENT (s))
272 if (get_later_stmt (il_b, s) == il_b)
273 il_b = s;
274 }
275 else
276 il_b = stmtinfo_b;
277 bool a_after_b = (get_later_stmt (stmtinfo_a, stmtinfo_b) == stmtinfo_a);
278 return (get_later_stmt (il_a, il_b) == il_a) == a_after_b;
279 }
280
281 /* A subroutine of vect_analyze_data_ref_dependence. Handle
282 DDR_COULD_BE_INDEPENDENT_P ddr DDR that has a known set of dependence
283 distances. These distances are conservatively correct but they don't
284 reflect a guaranteed dependence.
285
286 Return true if this function does all the work necessary to avoid
287 an alias or false if the caller should use the dependence distances
288 to limit the vectorization factor in the usual way. LOOP_DEPTH is
289 the depth of the loop described by LOOP_VINFO and the other arguments
290 are as for vect_analyze_data_ref_dependence. */
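
/* A sketch of the policy implemented below: if the loop has safelen >= 2
   (e.g. from "#pragma omp simd safelen(8)") and the conservative distance
   is at most that safelen, the distance is ignored and *MAX_VF is merely
   capped at the safelen.  Otherwise the function prefers to queue a
   runtime alias check via vect_mark_for_runtime_alias_test rather than
   shrink the VF, unless either access is a gather/scatter.  */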
291
292 static bool
293 vect_analyze_possibly_independent_ddr (data_dependence_relation *ddr,
294 loop_vec_info loop_vinfo,
295 int loop_depth, unsigned int *max_vf)
296 {
297 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
298 for (lambda_vector &dist_v : DDR_DIST_VECTS (ddr))
299 {
300 int dist = dist_v[loop_depth];
301 if (dist != 0 && !(dist > 0 && DDR_REVERSED_P (ddr)))
302 {
303 /* If the user asserted safelen >= DIST consecutive iterations
304 can be executed concurrently, assume independence.
305
306 ??? An alternative would be to add the alias check even
307 in this case, and vectorize the fallback loop with the
308 maximum VF set to safelen. However, if the user has
309 explicitly given a length, it's less likely that that
310 would be a win. */
311 if (loop->safelen >= 2 && abs_hwi (dist) <= loop->safelen)
312 {
313 if ((unsigned int) loop->safelen < *max_vf)
314 *max_vf = loop->safelen;
315 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
316 continue;
317 }
318
319 /* For dependence distances of 2 or more, we have the option
320 of limiting VF or checking for an alias at runtime.
321 Prefer to check at runtime if we can, to avoid limiting
322 the VF unnecessarily when the bases are in fact independent.
323
324 Note that the alias checks will be removed if the VF ends up
325 being small enough. */
326 dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (DDR_A (ddr));
327 dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (DDR_B (ddr));
328 return (!STMT_VINFO_GATHER_SCATTER_P (dr_info_a->stmt)
329 && !STMT_VINFO_GATHER_SCATTER_P (dr_info_b->stmt)
330 && vect_mark_for_runtime_alias_test (ddr, loop_vinfo));
331 }
332 }
333 return true;
334 }
335
336
337 /* Function vect_analyze_data_ref_dependence.
338
339 FIXME: I needed to change the sense of the returned flag.
340
341 Return FALSE if there (might) exist a dependence between a memory-reference
342 DRA and a memory-reference DRB. When versioning for alias may check a
343 dependence at run-time, return TRUE. Adjust *MAX_VF according to
344 the data dependence. */
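
/* Worked example (illustrative only): for

     for (i = 0; i < n; i++)
       a[i + 3] = a[i] * 2;

   the dependence distance is 3, so at most 3 iterations can safely be
   executed as one vector: *MAX_VF is lowered to 3 and vectorization can
   still proceed with a smaller VF.  A distance of 1, by contrast, makes
   the references truly serial and the analysis fails.  A distance of 0
   (read and write of the same element in one iteration) is acceptable as
   long as the scalar order is preserved by the grouped accesses.  */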
345
346 static opt_result
347 vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
348 loop_vec_info loop_vinfo,
349 unsigned int *max_vf)
350 {
351 unsigned int i;
352 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
353 struct data_reference *dra = DDR_A (ddr);
354 struct data_reference *drb = DDR_B (ddr);
355 dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (dra);
356 dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (drb);
357 stmt_vec_info stmtinfo_a = dr_info_a->stmt;
358 stmt_vec_info stmtinfo_b = dr_info_b->stmt;
359 lambda_vector dist_v;
360 unsigned int loop_depth;
361
362 /* If user asserted safelen consecutive iterations can be
363 executed concurrently, assume independence. */
364 auto apply_safelen = [&]()
365 {
366 if (loop->safelen >= 2)
367 {
368 if ((unsigned int) loop->safelen < *max_vf)
369 *max_vf = loop->safelen;
370 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
371 return true;
372 }
373 return false;
374 };
375
376 /* In loop analysis all data references should be vectorizable. */
377 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
378 || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
379 gcc_unreachable ();
380
381 /* Independent data accesses. */
382 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
383 return opt_result::success ();
384
385 if (dra == drb
386 || (DR_IS_READ (dra) && DR_IS_READ (drb)))
387 return opt_result::success ();
388
389 /* We do not have to consider dependences between accesses that belong
390 to the same group, unless the stride could be smaller than the
391 group size. */
392 if (DR_GROUP_FIRST_ELEMENT (stmtinfo_a)
393 && (DR_GROUP_FIRST_ELEMENT (stmtinfo_a)
394 == DR_GROUP_FIRST_ELEMENT (stmtinfo_b))
395 && !STMT_VINFO_STRIDED_P (stmtinfo_a))
396 return opt_result::success ();
397
398 /* Even if we have an anti-dependence then, as the vectorized loop covers at
399 least two scalar iterations, there is always also a true dependence.
400 As the vectorizer does not re-order loads and stores we can ignore
401 the anti-dependence if TBAA can disambiguate both DRs similar to the
402 case with known negative distance anti-dependences (positive
403 distance anti-dependences would violate TBAA constraints). */
404 if (((DR_IS_READ (dra) && DR_IS_WRITE (drb))
405 || (DR_IS_WRITE (dra) && DR_IS_READ (drb)))
406 && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)),
407 get_alias_set (DR_REF (drb))))
408 return opt_result::success ();
409
410 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
411 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
412 {
413 if (apply_safelen ())
414 return opt_result::success ();
415
416 return opt_result::failure_at
417 (stmtinfo_a->stmt,
418 "possible alias involving gather/scatter between %T and %T\n",
419 DR_REF (dra), DR_REF (drb));
420 }
421
422 /* Unknown data dependence. */
423 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
424 {
425 if (apply_safelen ())
426 return opt_result::success ();
427
428 if (dump_enabled_p ())
429 dump_printf_loc (MSG_MISSED_OPTIMIZATION, stmtinfo_a->stmt,
430 "versioning for alias required: "
431 "can't determine dependence between %T and %T\n",
432 DR_REF (dra), DR_REF (drb));
433
434 /* Add to list of ddrs that need to be tested at run-time. */
435 return vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
436 }
437
438 /* Known data dependence. */
439 if (DDR_NUM_DIST_VECTS (ddr) == 0)
440 {
441 if (apply_safelen ())
442 return opt_result::success ();
443
444 if (dump_enabled_p ())
445 dump_printf_loc (MSG_MISSED_OPTIMIZATION, stmtinfo_a->stmt,
446 "versioning for alias required: "
447 "bad dist vector for %T and %T\n",
448 DR_REF (dra), DR_REF (drb));
449 /* Add to list of ddrs that need to be tested at run-time. */
450 return vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
451 }
452
453 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
454
455 if (DDR_COULD_BE_INDEPENDENT_P (ddr)
456 && vect_analyze_possibly_independent_ddr (ddr, loop_vinfo,
457 loop_depth, max_vf))
458 return opt_result::success ();
459
460 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
461 {
462 int dist = dist_v[loop_depth];
463
464 if (dump_enabled_p ())
465 dump_printf_loc (MSG_NOTE, vect_location,
466 "dependence distance = %d.\n", dist);
467
468 if (dist == 0)
469 {
470 if (dump_enabled_p ())
471 dump_printf_loc (MSG_NOTE, vect_location,
472 "dependence distance == 0 between %T and %T\n",
473 DR_REF (dra), DR_REF (drb));
474
475 /* When we perform grouped accesses and perform implicit CSE
476 by detecting equal accesses and doing disambiguation with
477 runtime alias tests like for
478 .. = a[i];
479 .. = a[i+1];
480 a[i] = ..;
481 a[i+1] = ..;
482 *p = ..;
483 .. = a[i];
484 .. = a[i+1];
485 where we will end up loading { a[i], a[i+1] } once, make
486 sure that inserting group loads before the first load and
487 stores after the last store will do the right thing.
488 Similar for groups like
489 a[i] = ...;
490 ... = a[i];
491 a[i+1] = ...;
492 where loads from the group interleave with the store. */
493 if (!vect_preserves_scalar_order_p (dr_info_a, dr_info_b))
494 return opt_result::failure_at (stmtinfo_a->stmt,
495 "READ_WRITE dependence"
496 " in interleaving.\n");
497
498 if (loop->safelen < 2)
499 {
500 tree indicator = dr_zero_step_indicator (dra);
501 if (!indicator || integer_zerop (indicator))
502 return opt_result::failure_at (stmtinfo_a->stmt,
503 "access also has a zero step\n");
504 else if (TREE_CODE (indicator) != INTEGER_CST)
505 vect_check_nonzero_value (loop_vinfo, indicator);
506 }
507 continue;
508 }
509
510 if (dist > 0 && DDR_REVERSED_P (ddr))
511 {
512 /* If DDR_REVERSED_P the order of the data-refs in DDR was
513 reversed (to make distance vector positive), and the actual
514 distance is negative. */
515 if (dump_enabled_p ())
516 dump_printf_loc (MSG_NOTE, vect_location,
517 "dependence distance negative.\n");
518 /* When doing outer loop vectorization, we need to check if there is
519 a backward dependence at the inner loop level if the dependence
520 at the outer loop is reversed. See PR81740. */
521 if (nested_in_vect_loop_p (loop, stmtinfo_a)
522 || nested_in_vect_loop_p (loop, stmtinfo_b))
523 {
524 unsigned inner_depth = index_in_loop_nest (loop->inner->num,
525 DDR_LOOP_NEST (ddr));
526 if (dist_v[inner_depth] < 0)
527 return opt_result::failure_at (stmtinfo_a->stmt,
528 "not vectorized, dependence "
529 "between data-refs %T and %T\n",
530 DR_REF (dra), DR_REF (drb));
531 }
532 /* Record a negative dependence distance to later limit the
533 amount of stmt copying / unrolling we can perform.
534 Only need to handle read-after-write dependence. */
535 if (DR_IS_READ (drb)
536 && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0
537 || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist))
538 STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist;
539 continue;
540 }
541
542 unsigned int abs_dist = abs (dist);
543 if (abs_dist >= 2 && abs_dist < *max_vf)
544 {
545 /* The dependence distance requires reduction of the maximal
546 vectorization factor. */
547 *max_vf = abs_dist;
548 if (dump_enabled_p ())
549 dump_printf_loc (MSG_NOTE, vect_location,
550 "adjusting maximal vectorization factor to %i\n",
551 *max_vf);
552 }
553
554 if (abs_dist >= *max_vf)
555 {
556 /* Dependence distance does not create dependence, as far as
557 vectorization is concerned, in this case. */
558 if (dump_enabled_p ())
559 dump_printf_loc (MSG_NOTE, vect_location,
560 "dependence distance >= VF.\n");
561 continue;
562 }
563
564 return opt_result::failure_at (stmtinfo_a->stmt,
565 "not vectorized, possible dependence "
566 "between data-refs %T and %T\n",
567 DR_REF (dra), DR_REF (drb));
568 }
569
570 return opt_result::success ();
571 }
572
573 /* Function vect_analyze_data_ref_dependences.
574
575 Examine all the data references in the loop, and make sure there do not
576 exist any data dependences between them. Set *MAX_VF according to
577 the maximum vectorization factor the data dependences allow. */
578
579 opt_result
580 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo,
581 unsigned int *max_vf)
582 {
583 unsigned int i;
584 struct data_dependence_relation *ddr;
585
586 DUMP_VECT_SCOPE ("vect_analyze_data_ref_dependences");
587
588 if (!LOOP_VINFO_DDRS (loop_vinfo).exists ())
589 {
590 LOOP_VINFO_DDRS (loop_vinfo)
591 .create (LOOP_VINFO_DATAREFS (loop_vinfo).length ()
592 * LOOP_VINFO_DATAREFS (loop_vinfo).length ());
593 /* We do not need read-read dependences. */
594 bool res = compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
595 &LOOP_VINFO_DDRS (loop_vinfo),
596 LOOP_VINFO_LOOP_NEST (loop_vinfo),
597 false);
598 gcc_assert (res);
599 }
600
601 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true;
602
603 /* For epilogues we either have no aliases or alias versioning
604 was applied to original loop. Therefore we may just get max_vf
605 using VF of original loop. */
606 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
607 *max_vf = LOOP_VINFO_ORIG_MAX_VECT_FACTOR (loop_vinfo);
608 else
609 FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
610 {
611 opt_result res
612 = vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf);
613 if (!res)
614 return res;
615 }
616
617 return opt_result::success ();
618 }
619
620
621 /* Function vect_slp_analyze_data_ref_dependence.
622
623 Return TRUE if there (might) exist a dependence between a memory-reference
624 DRA and a memory-reference DRB for VINFO. When versioning for alias
625 may check a dependence at run-time, return FALSE. */
627
628 static bool
629 vect_slp_analyze_data_ref_dependence (vec_info *vinfo,
630 struct data_dependence_relation *ddr)
631 {
632 struct data_reference *dra = DDR_A (ddr);
633 struct data_reference *drb = DDR_B (ddr);
634 dr_vec_info *dr_info_a = vinfo->lookup_dr (dra);
635 dr_vec_info *dr_info_b = vinfo->lookup_dr (drb);
636
637 /* We need to check dependences of statements marked as unvectorizable
638 as well, they still can prohibit vectorization. */
639
640 /* Independent data accesses. */
641 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
642 return false;
643
644 if (dra == drb)
645 return false;
646
647 /* Read-read is OK. */
648 if (DR_IS_READ (dra) && DR_IS_READ (drb))
649 return false;
650
651 /* If dra and drb are part of the same interleaving chain consider
652 them independent. */
653 if (STMT_VINFO_GROUPED_ACCESS (dr_info_a->stmt)
654 && (DR_GROUP_FIRST_ELEMENT (dr_info_a->stmt)
655 == DR_GROUP_FIRST_ELEMENT (dr_info_b->stmt)))
656 return false;
657
658 /* Unknown data dependence. */
659 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
660 {
661 if (dump_enabled_p ())
662 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
663 "can't determine dependence between %T and %T\n",
664 DR_REF (dra), DR_REF (drb));
665 }
666 else if (dump_enabled_p ())
667 dump_printf_loc (MSG_NOTE, vect_location,
668 "determined dependence between %T and %T\n",
669 DR_REF (dra), DR_REF (drb));
670
671 return true;
672 }
673
674
675 /* Analyze dependences involved in the transform of a store SLP NODE. */
676
677 static bool
678 vect_slp_analyze_store_dependences (vec_info *vinfo, slp_tree node)
679 {
680 /* This walks over all stmts involved in the SLP store done
681 in NODE verifying we can sink them up to the last stmt in the
682 group. */
683 stmt_vec_info last_access_info = vect_find_last_scalar_stmt_in_slp (node);
684 gcc_assert (DR_IS_WRITE (STMT_VINFO_DATA_REF (last_access_info)));
685
686 for (unsigned k = 0; k < SLP_TREE_SCALAR_STMTS (node).length (); ++k)
687 {
688 stmt_vec_info access_info
689 = vect_orig_stmt (SLP_TREE_SCALAR_STMTS (node)[k]);
690 if (access_info == last_access_info)
691 continue;
692 data_reference *dr_a = STMT_VINFO_DATA_REF (access_info);
693 ao_ref ref;
694 bool ref_initialized_p = false;
695 for (gimple_stmt_iterator gsi = gsi_for_stmt (access_info->stmt);
696 gsi_stmt (gsi) != last_access_info->stmt; gsi_next (&gsi))
697 {
698 gimple *stmt = gsi_stmt (gsi);
699 if (! gimple_vuse (stmt))
700 continue;
701
702 /* If we couldn't record a (single) data reference for this
703 stmt we have to resort to the alias oracle. */
704 stmt_vec_info stmt_info = vinfo->lookup_stmt (stmt);
705 data_reference *dr_b = STMT_VINFO_DATA_REF (stmt_info);
706 if (!dr_b)
707 {
708 /* We are moving a store - this means
709 we cannot use TBAA for disambiguation. */
710 if (!ref_initialized_p)
711 ao_ref_init (&ref, DR_REF (dr_a));
712 if (stmt_may_clobber_ref_p_1 (stmt, &ref, false)
713 || ref_maybe_used_by_stmt_p (stmt, &ref, false))
714 return false;
715 continue;
716 }
717
718 gcc_assert (!gimple_visited_p (stmt));
719
720 ddr_p ddr = initialize_data_dependence_relation (dr_a,
721 dr_b, vNULL);
722 bool dependent = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
723 free_dependence_relation (ddr);
724 if (dependent)
725 return false;
726 }
727 }
728 return true;
729 }
730
731 /* Analyze dependences involved in the transform of a load SLP NODE. STORES
732 contain the vector of scalar stores of this instance if we are
733 disambiguating the loads. */
734
735 static bool
736 vect_slp_analyze_load_dependences (vec_info *vinfo, slp_tree node,
737 vec<stmt_vec_info> stores,
738 stmt_vec_info last_store_info)
739 {
740 /* This walks over all stmts involved in the SLP load done
741 in NODE verifying we can hoist them up to the first stmt in the
742 group. */
743 stmt_vec_info first_access_info = vect_find_first_scalar_stmt_in_slp (node);
744 gcc_assert (DR_IS_READ (STMT_VINFO_DATA_REF (first_access_info)));
745
746 for (unsigned k = 0; k < SLP_TREE_SCALAR_STMTS (node).length (); ++k)
747 {
748 stmt_vec_info access_info
749 = vect_orig_stmt (SLP_TREE_SCALAR_STMTS (node)[k]);
750 if (access_info == first_access_info)
751 continue;
752 data_reference *dr_a = STMT_VINFO_DATA_REF (access_info);
753 ao_ref ref;
754 bool ref_initialized_p = false;
755 hash_set<stmt_vec_info> grp_visited;
756 for (gimple_stmt_iterator gsi = gsi_for_stmt (access_info->stmt);
757 gsi_stmt (gsi) != first_access_info->stmt; gsi_prev (&gsi))
758 {
759 gimple *stmt = gsi_stmt (gsi);
760 if (! gimple_vdef (stmt))
761 continue;
762
763 stmt_vec_info stmt_info = vinfo->lookup_stmt (stmt);
764
765 /* If we run into a store of this same instance (we've just
766 marked those) then delay dependence checking until we run
767 into the last store because this is where it will have
768 been sunk to (and we verified that we can do that already). */
769 if (gimple_visited_p (stmt))
770 {
771 if (stmt_info != last_store_info)
772 continue;
773
774 for (stmt_vec_info &store_info : stores)
775 {
776 data_reference *store_dr = STMT_VINFO_DATA_REF (store_info);
777 ddr_p ddr = initialize_data_dependence_relation
778 (dr_a, store_dr, vNULL);
779 bool dependent
780 = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
781 free_dependence_relation (ddr);
782 if (dependent)
783 return false;
784 }
785 continue;
786 }
787
788 auto check_hoist = [&] (stmt_vec_info stmt_info) -> bool
789 {
790 /* We are hoisting a load - this means we can use TBAA for
791 disambiguation. */
792 if (!ref_initialized_p)
793 ao_ref_init (&ref, DR_REF (dr_a));
794 if (stmt_may_clobber_ref_p_1 (stmt_info->stmt, &ref, true))
795 {
796 /* If we couldn't record a (single) data reference for this
797 stmt we have to give up now. */
798 data_reference *dr_b = STMT_VINFO_DATA_REF (stmt_info);
799 if (!dr_b)
800 return false;
801 ddr_p ddr = initialize_data_dependence_relation (dr_a,
802 dr_b, vNULL);
803 bool dependent
804 = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
805 free_dependence_relation (ddr);
806 if (dependent)
807 return false;
808 }
809 /* No dependence. */
810 return true;
811 };
812 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
813 {
814 /* When we run into a store group we have to honor
815 that earlier stores might be moved here. We don't
816 know exactly which and where to since we lack a
817 back-mapping from DR to SLP node, so assume all
818 earlier stores are sunk here. It's enough to
819 consider the last stmt of a group for this.
820 ??? Both this and the fact that we disregard that
821 the conflicting instance might be removed later
822 is overly conservative. */
823 if (!grp_visited.add (DR_GROUP_FIRST_ELEMENT (stmt_info)))
824 for (auto store_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
825 store_info != NULL;
826 store_info = DR_GROUP_NEXT_ELEMENT (store_info))
827 if ((store_info == stmt_info
828 || get_later_stmt (store_info, stmt_info) == stmt_info)
829 && !check_hoist (store_info))
830 return false;
831 }
832 else
833 {
834 if (!check_hoist (stmt_info))
835 return false;
836 }
837 }
838 }
839 return true;
840 }
841
842
843 /* Function vect_slp_analyze_instance_dependence.
844
845 Examine all the data references in the SLP instance INSTANCE and make sure
846 there do not exist any data dependences between them that would prevent
847 the transform. Return true if the dependences allow vectorization. */
848
849 bool
850 vect_slp_analyze_instance_dependence (vec_info *vinfo, slp_instance instance)
851 {
852 DUMP_VECT_SCOPE ("vect_slp_analyze_instance_dependence");
853
854 /* The stores of this instance are at the root of the SLP tree. */
855 slp_tree store = NULL;
856 if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_store)
857 store = SLP_INSTANCE_TREE (instance);
858
859 /* Verify we can sink stores to the vectorized stmt insert location. */
860 stmt_vec_info last_store_info = NULL;
861 if (store)
862 {
863 if (! vect_slp_analyze_store_dependences (vinfo, store))
864 return false;
865
866 /* Mark stores in this instance and remember the last one. */
867 last_store_info = vect_find_last_scalar_stmt_in_slp (store);
868 for (unsigned k = 0; k < SLP_TREE_SCALAR_STMTS (store).length (); ++k)
869 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k]->stmt, true);
870 }
871
872 bool res = true;
873
874 /* Verify we can sink loads to the vectorized stmt insert location,
875 special-casing stores of this instance. */
876 for (slp_tree &load : SLP_INSTANCE_LOADS (instance))
877 if (! vect_slp_analyze_load_dependences (vinfo, load,
878 store
879 ? SLP_TREE_SCALAR_STMTS (store)
880 : vNULL, last_store_info))
881 {
882 res = false;
883 break;
884 }
885
886 /* Unset the visited flag. */
887 if (store)
888 for (unsigned k = 0; k < SLP_TREE_SCALAR_STMTS (store).length (); ++k)
889 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k]->stmt, false);
890
891 return res;
892 }
893
894 /* Return the misalignment of DR_INFO accessed in VECTYPE with OFFSET
895 applied. */
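
/* Worked example (numbers made up): suppose the group leader has a
   recorded misalignment of 4 against a 16-byte target alignment, DR_INFO
   is a later group member whose DR_INIT is 8 bytes past the leader's, and
   OFFSET is 0.  The returned misalignment is then (4 + 8 + 0) % 16 == 12.
   If the recorded value is DR_MISALIGNMENT_UNKNOWN, or the target
   alignment was computed for a vector type with a weaker alignment
   requirement than VECTYPE, the result is DR_MISALIGNMENT_UNKNOWN.  */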
896
897 int
898 dr_misalignment (dr_vec_info *dr_info, tree vectype, poly_int64 offset)
899 {
900 HOST_WIDE_INT diff = 0;
901 /* Alignment is only analyzed for the first element of a DR group,
902 use that but adjust misalignment by the offset of the access. */
903 if (STMT_VINFO_GROUPED_ACCESS (dr_info->stmt))
904 {
905 dr_vec_info *first_dr
906 = STMT_VINFO_DR_INFO (DR_GROUP_FIRST_ELEMENT (dr_info->stmt));
907 /* vect_analyze_data_ref_accesses guarantees that DR_INIT are
908 INTEGER_CSTs and the first element in the group has the lowest
909 address. */
910 diff = (TREE_INT_CST_LOW (DR_INIT (dr_info->dr))
911 - TREE_INT_CST_LOW (DR_INIT (first_dr->dr)));
912 gcc_assert (diff >= 0);
913 dr_info = first_dr;
914 }
915
916 int misalign = dr_info->misalignment;
917 gcc_assert (misalign != DR_MISALIGNMENT_UNINITIALIZED);
918 if (misalign == DR_MISALIGNMENT_UNKNOWN)
919 return misalign;
920
921 /* If the access is only aligned for a vector type with smaller alignment
922 requirement the access has unknown misalignment. */
923 if (maybe_lt (dr_info->target_alignment * BITS_PER_UNIT,
924 targetm.vectorize.preferred_vector_alignment (vectype)))
925 return DR_MISALIGNMENT_UNKNOWN;
926
927 /* Apply the offset from the DR group start and the externally supplied
928 offset which can for example result from a negative stride access. */
929 poly_int64 misalignment = misalign + diff + offset;
930
931 /* vect_compute_data_ref_alignment will have ensured that target_alignment
932 is constant and otherwise set misalign to DR_MISALIGNMENT_UNKNOWN. */
933 unsigned HOST_WIDE_INT target_alignment_c
934 = dr_info->target_alignment.to_constant ();
935 if (!known_misalignment (misalignment, target_alignment_c, &misalign))
936 return DR_MISALIGNMENT_UNKNOWN;
937 return misalign;
938 }
939
940 /* Record the base alignment guarantee given by DRB, which occurs
941 in STMT_INFO. */
942
943 static void
944 vect_record_base_alignment (vec_info *vinfo, stmt_vec_info stmt_info,
945 innermost_loop_behavior *drb)
946 {
947 bool existed;
948 std::pair<stmt_vec_info, innermost_loop_behavior *> &entry
949 = vinfo->base_alignments.get_or_insert (drb->base_address, &existed);
950 if (!existed || entry.second->base_alignment < drb->base_alignment)
951 {
952 entry = std::make_pair (stmt_info, drb);
953 if (dump_enabled_p ())
954 dump_printf_loc (MSG_NOTE, vect_location,
955 "recording new base alignment for %T\n"
956 " alignment: %d\n"
957 " misalignment: %d\n"
958 " based on: %G",
959 drb->base_address,
960 drb->base_alignment,
961 drb->base_misalignment,
962 stmt_info->stmt);
963 }
964 }
965
966 /* If the region we're going to vectorize is reached, all unconditional
967 data references occur at least once. We can therefore pool the base
968 alignment guarantees from each unconditional reference. Do this by
969 going through all the data references in VINFO and checking whether
970 the containing statement makes the reference unconditionally. If so,
971 record the alignment of the base address in VINFO so that it can be
972 used for all other references with the same base. */
973
974 void
975 vect_record_base_alignments (vec_info *vinfo)
976 {
977 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
978 class loop *loop = loop_vinfo ? LOOP_VINFO_LOOP (loop_vinfo) : NULL;
979 for (data_reference *dr : vinfo->shared->datarefs)
980 {
981 dr_vec_info *dr_info = vinfo->lookup_dr (dr);
982 stmt_vec_info stmt_info = dr_info->stmt;
983 if (!DR_IS_CONDITIONAL_IN_STMT (dr)
984 && STMT_VINFO_VECTORIZABLE (stmt_info)
985 && !STMT_VINFO_GATHER_SCATTER_P (stmt_info))
986 {
987 vect_record_base_alignment (vinfo, stmt_info, &DR_INNERMOST (dr));
988
989 /* If DR is nested in the loop that is being vectorized, we can also
990 record the alignment of the base wrt the outer loop. */
991 if (loop && nested_in_vect_loop_p (loop, stmt_info))
992 vect_record_base_alignment
993 (vinfo, stmt_info, &STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info));
994 }
995 }
996 }
997
998 /* Function vect_compute_data_ref_alignment
999
1000 Compute the misalignment of the data reference DR_INFO when vectorizing
1001 with VECTYPE.
1002
1003 Output:
1004 1. initialized misalignment info for DR_INFO
1005
1006 FOR NOW: No analysis is actually performed. Misalignment is calculated
1007 only for trivial cases. TODO. */
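
/* Worked example (illustrative, numbers made up): for a reference a[i]
   where 'a' is an int array known to be 32-byte aligned, a preferred
   vector alignment of 32 bytes and an initial offset of 12 bytes, the
   recorded misalignment is 12.  If instead 'a' is a local decl whose
   alignment can be forced, the decl is promoted to 32-byte alignment and
   only the initial-offset residue remains.  If the step does not keep the
   misalignment stable across iterations, or the base alignment can
   neither be determined nor forced, the misalignment stays
   DR_MISALIGNMENT_UNKNOWN.  */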
1008
1009 static void
1010 vect_compute_data_ref_alignment (vec_info *vinfo, dr_vec_info *dr_info,
1011 tree vectype)
1012 {
1013 stmt_vec_info stmt_info = dr_info->stmt;
1014 vec_base_alignments *base_alignments = &vinfo->base_alignments;
1015 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
1016 class loop *loop = NULL;
1017 tree ref = DR_REF (dr_info->dr);
1018
1019 if (dump_enabled_p ())
1020 dump_printf_loc (MSG_NOTE, vect_location,
1021 "vect_compute_data_ref_alignment:\n");
1022
1023 if (loop_vinfo)
1024 loop = LOOP_VINFO_LOOP (loop_vinfo);
1025
1026 /* Initialize misalignment to unknown. */
1027 SET_DR_MISALIGNMENT (dr_info, DR_MISALIGNMENT_UNKNOWN);
1028
1029 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
1030 return;
1031
1032 innermost_loop_behavior *drb = vect_dr_behavior (vinfo, dr_info);
1033 bool step_preserves_misalignment_p;
1034
1035 poly_uint64 vector_alignment
1036 = exact_div (targetm.vectorize.preferred_vector_alignment (vectype),
1037 BITS_PER_UNIT);
1038 SET_DR_TARGET_ALIGNMENT (dr_info, vector_alignment);
1039
1040 /* If the main loop has peeled for alignment we have no way of knowing
1041 whether the data accesses in the epilogues are aligned. We can't at
1042 compile time answer the question whether we have entered the main loop or
1043 not. Fixes PR 92351. */
1044 if (loop_vinfo)
1045 {
1046 loop_vec_info orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
1047 if (orig_loop_vinfo
1048 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (orig_loop_vinfo) != 0)
1049 return;
1050 }
1051
1052 unsigned HOST_WIDE_INT vect_align_c;
1053 if (!vector_alignment.is_constant (&vect_align_c))
1054 return;
1055
1056 /* No step for BB vectorization. */
1057 if (!loop)
1058 {
1059 gcc_assert (integer_zerop (drb->step));
1060 step_preserves_misalignment_p = true;
1061 }
1062
1063 /* In case the dataref is in an inner-loop of the loop that is being
1064 vectorized (LOOP), we use the base and misalignment information
1065 relative to the outer-loop (LOOP). This is ok only if the misalignment
1066 stays the same throughout the execution of the inner-loop, which is why
1067 we have to check that the stride of the dataref in the inner-loop evenly
1068 divides by the vector alignment. */
1069 else if (nested_in_vect_loop_p (loop, stmt_info))
1070 {
1071 step_preserves_misalignment_p
1072 = (DR_STEP_ALIGNMENT (dr_info->dr) % vect_align_c) == 0;
1073
1074 if (dump_enabled_p ())
1075 {
1076 if (step_preserves_misalignment_p)
1077 dump_printf_loc (MSG_NOTE, vect_location,
1078 "inner step divides the vector alignment.\n");
1079 else
1080 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1081 "inner step doesn't divide the vector"
1082 " alignment.\n");
1083 }
1084 }
1085
1086 /* Similarly we can only use base and misalignment information relative to
1087 an innermost loop if the misalignment stays the same throughout the
1088 execution of the loop. As above, this is the case if the stride of
1089 the dataref evenly divides by the alignment. */
1090 else
1091 {
1092 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1093 step_preserves_misalignment_p
1094 = multiple_p (DR_STEP_ALIGNMENT (dr_info->dr) * vf, vect_align_c);
1095
1096 if (!step_preserves_misalignment_p && dump_enabled_p ())
1097 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1098 "step doesn't divide the vector alignment.\n");
1099 }
1100
1101 unsigned int base_alignment = drb->base_alignment;
1102 unsigned int base_misalignment = drb->base_misalignment;
1103
1104 /* Calculate the maximum of the pooled base address alignment and the
1105 alignment that we can compute for DR itself. */
1106 std::pair<stmt_vec_info, innermost_loop_behavior *> *entry
1107 = base_alignments->get (drb->base_address);
1108 if (entry
1109 && base_alignment < (*entry).second->base_alignment
1110 && (loop_vinfo
1111 || (dominated_by_p (CDI_DOMINATORS, gimple_bb (stmt_info->stmt),
1112 gimple_bb (entry->first->stmt))
1113 && (gimple_bb (stmt_info->stmt) != gimple_bb (entry->first->stmt)
1114 || (entry->first->dr_aux.group <= dr_info->group)))))
1115 {
1116 base_alignment = entry->second->base_alignment;
1117 base_misalignment = entry->second->base_misalignment;
1118 }
1119
1120 if (drb->offset_alignment < vect_align_c
1121 || !step_preserves_misalignment_p
1122 /* We need to know whether the step wrt the vectorized loop is
1123 negative when computing the starting misalignment below. */
1124 || TREE_CODE (drb->step) != INTEGER_CST)
1125 {
1126 if (dump_enabled_p ())
1127 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1128 "Unknown alignment for access: %T\n", ref);
1129 return;
1130 }
1131
1132 if (base_alignment < vect_align_c)
1133 {
1134 unsigned int max_alignment;
1135 tree base = get_base_for_alignment (drb->base_address, &max_alignment);
1136 if (max_alignment < vect_align_c
1137 || !vect_can_force_dr_alignment_p (base,
1138 vect_align_c * BITS_PER_UNIT))
1139 {
1140 if (dump_enabled_p ())
1141 dump_printf_loc (MSG_NOTE, vect_location,
1142 "can't force alignment of ref: %T\n", ref);
1143 return;
1144 }
1145
1146 /* Force the alignment of the decl.
1147 NOTE: This is the only change to the code we make during
1148 the analysis phase, before deciding to vectorize the loop. */
1149 if (dump_enabled_p ())
1150 dump_printf_loc (MSG_NOTE, vect_location,
1151 "force alignment of %T\n", ref);
1152
1153 dr_info->base_decl = base;
1154 dr_info->base_misaligned = true;
1155 base_misalignment = 0;
1156 }
1157 poly_int64 misalignment
1158 = base_misalignment + wi::to_poly_offset (drb->init).force_shwi ();
1159
1160 unsigned int const_misalignment;
1161 if (!known_misalignment (misalignment, vect_align_c, &const_misalignment))
1162 {
1163 if (dump_enabled_p ())
1164 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1165 "Non-constant misalignment for access: %T\n", ref);
1166 return;
1167 }
1168
1169 SET_DR_MISALIGNMENT (dr_info, const_misalignment);
1170
1171 if (dump_enabled_p ())
1172 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1173 "misalign = %d bytes of ref %T\n",
1174 const_misalignment, ref);
1175
1176 return;
1177 }
1178
1179 /* Return whether DR_INFO, which is related to DR_PEEL_INFO in
1180 that it only differs in DR_INIT, is aligned if DR_PEEL_INFO
1181 is made aligned via peeling. */
1182
1183 static bool
1184 vect_dr_aligned_if_related_peeled_dr_is (dr_vec_info *dr_info,
1185 dr_vec_info *dr_peel_info)
1186 {
1187 if (multiple_p (DR_TARGET_ALIGNMENT (dr_peel_info),
1188 DR_TARGET_ALIGNMENT (dr_info)))
1189 {
1190 poly_offset_int diff
1191 = (wi::to_poly_offset (DR_INIT (dr_peel_info->dr))
1192 - wi::to_poly_offset (DR_INIT (dr_info->dr)));
1193 if (known_eq (diff, 0)
1194 || multiple_p (diff, DR_TARGET_ALIGNMENT (dr_info)))
1195 return true;
1196 }
1197 return false;
1198 }
1199
1200 /* Return whether DR_INFO is aligned if DR_PEEL_INFO is made
1201 aligned via peeling. */
1202
1203 static bool
1204 vect_dr_aligned_if_peeled_dr_is (dr_vec_info *dr_info,
1205 dr_vec_info *dr_peel_info)
1206 {
1207 if (!operand_equal_p (DR_BASE_ADDRESS (dr_info->dr),
1208 DR_BASE_ADDRESS (dr_peel_info->dr), 0)
1209 || !operand_equal_p (DR_OFFSET (dr_info->dr),
1210 DR_OFFSET (dr_peel_info->dr), 0)
1211 || !operand_equal_p (DR_STEP (dr_info->dr),
1212 DR_STEP (dr_peel_info->dr), 0))
1213 return false;
1214
1215 return vect_dr_aligned_if_related_peeled_dr_is (dr_info, dr_peel_info);
1216 }
1217
1218 /* Compute the value for dr_info->misalign so that the access appears
1219 aligned. This is used by peeling to compensate for dr_misalignment
1220 applying the offset for negative step. */
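
/* Worked example (numbers made up): with a V4SI vector type, 4-byte
   elements and a 16-byte target alignment, dr_misalignment is elsewhere
   given an OFFSET of -(4 - 1) * 4 = -12 bytes for a negative-step access;
   recording a misalignment of (4 - 1) * 4 = 12 here therefore makes that
   call report 0, i.e. the access appears aligned.  For non-negative steps
   the function simply returns 0.  */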
1221
1222 int
1223 vect_dr_misalign_for_aligned_access (dr_vec_info *dr_info)
1224 {
1225 if (tree_int_cst_sgn (DR_STEP (dr_info->dr)) >= 0)
1226 return 0;
1227
1228 tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
1229 poly_int64 misalignment
1230 = ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
1231 * TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype))));
1232
1233 unsigned HOST_WIDE_INT target_alignment_c;
1234 int misalign;
1235 if (!dr_info->target_alignment.is_constant (&target_alignment_c)
1236 || !known_misalignment (misalignment, target_alignment_c, &misalign))
1237 return DR_MISALIGNMENT_UNKNOWN;
1238 return misalign;
1239 }
1240
1241 /* Function vect_update_misalignment_for_peel.
1242 Sets DR_INFO's misalignment
1243 - to 0 if it has the same alignment as DR_PEEL_INFO,
1244 - to the misalignment computed using NPEEL if DR_INFO's alignment is known,
1245 - to -1 (unknown) otherwise.
1246
1247 DR_INFO - the data reference whose misalignment is to be adjusted.
1248 DR_PEEL_INFO - the data reference whose misalignment is being made
1249 zero in the vector loop by the peel.
1250 NPEEL - the number of iterations in the peel loop if the misalignment
1251 of DR_PEEL_INFO is known at compile time. */
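
/* Worked example (numbers made up): if DR_INFO has misalignment 8, a step
   of 4 bytes, a 16-byte target alignment and NPEEL is 2, the new
   misalignment is (8 + 2 * 4) & (16 - 1) == 0.  If either reference's
   alignment is unknown the result is DR_MISALIGNMENT_UNKNOWN, and if
   DR_INFO is aligned exactly when DR_PEEL_INFO is, it is simply marked
   as aligned.  */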
1252
1253 static void
1254 vect_update_misalignment_for_peel (dr_vec_info *dr_info,
1255 dr_vec_info *dr_peel_info, int npeel)
1256 {
1257 /* If dr_info is aligned whenever dr_peel_info is, then mark it so. */
1258 if (vect_dr_aligned_if_peeled_dr_is (dr_info, dr_peel_info))
1259 {
1260 SET_DR_MISALIGNMENT (dr_info,
1261 vect_dr_misalign_for_aligned_access (dr_peel_info));
1262 return;
1263 }
1264
1265 unsigned HOST_WIDE_INT alignment;
1266 if (DR_TARGET_ALIGNMENT (dr_info).is_constant (&alignment)
1267 && known_alignment_for_access_p (dr_info,
1268 STMT_VINFO_VECTYPE (dr_info->stmt))
1269 && known_alignment_for_access_p (dr_peel_info,
1270 STMT_VINFO_VECTYPE (dr_peel_info->stmt)))
1271 {
1272 int misal = dr_info->misalignment;
1273 misal += npeel * TREE_INT_CST_LOW (DR_STEP (dr_info->dr));
1274 misal &= alignment - 1;
1275 set_dr_misalignment (dr_info, misal);
1276 return;
1277 }
1278
1279 if (dump_enabled_p ())
1280 dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment " \
1281 "to unknown (-1).\n");
1282 SET_DR_MISALIGNMENT (dr_info, DR_MISALIGNMENT_UNKNOWN);
1283 }
1284
1285 /* Return true if alignment is relevant for DR_INFO. */
1286
1287 static bool
1288 vect_relevant_for_alignment_p (dr_vec_info *dr_info)
1289 {
1290 stmt_vec_info stmt_info = dr_info->stmt;
1291
1292 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1293 return false;
1294
1295 /* For interleaving, only the alignment of the first access matters. */
1296 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1297 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info)
1298 return false;
1299
1300 /* Scatter-gather and invariant accesses continue to address individual
1301 scalars, so vector-level alignment is irrelevant. */
1302 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info)
1303 || integer_zerop (DR_STEP (dr_info->dr)))
1304 return false;
1305
1306 /* Strided accesses perform only component accesses, alignment is
1307 irrelevant for them. */
1308 if (STMT_VINFO_STRIDED_P (stmt_info)
1309 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1310 return false;
1311
1312 return true;
1313 }
1314
1315 /* Given a memory reference EXP, return whether its alignment is less
1316 than its size. */
1317
1318 static bool
1319 not_size_aligned (tree exp)
1320 {
1321 if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp))))
1322 return true;
1323
1324 return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp)))
1325 > get_object_alignment (exp));
1326 }
1327
1328 /* Function vector_alignment_reachable_p
1329
1330 Return true if vector alignment for DR_INFO is reachable by peeling
1331 a few loop iterations. Return false otherwise. */
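
/* Worked example (numbers made up): a V4SI access with a known
   misalignment of 8 bytes needs (16 - 8) / 4 = 2 peeled iterations to
   become 16-byte aligned, so alignment is reachable.  A misalignment of
   2 bytes for 4-byte elements is not (the element size does not divide
   it), and for grouped accesses the number of peeled elements must
   additionally be a multiple of the group size.  */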
1332
1333 static bool
1334 vector_alignment_reachable_p (dr_vec_info *dr_info)
1335 {
1336 stmt_vec_info stmt_info = dr_info->stmt;
1337 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1338
1339 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1340 {
1341 /* For interleaved accesses we peel only if the number of iterations in
1342 the prolog loop (the number of vector elements minus the misalignment,
1343 both counted in elements) is a multiple of the group size. */
1344 int elem_size, mis_in_elements;
1345
1346 /* FORNOW: handle only known alignment. */
1347 if (!known_alignment_for_access_p (dr_info, vectype))
1348 return false;
1349
1350 poly_uint64 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1351 poly_uint64 vector_size = GET_MODE_SIZE (TYPE_MODE (vectype));
1352 elem_size = vector_element_size (vector_size, nelements);
1353 mis_in_elements = dr_misalignment (dr_info, vectype) / elem_size;
1354
1355 if (!multiple_p (nelements - mis_in_elements, DR_GROUP_SIZE (stmt_info)))
1356 return false;
1357 }
1358
1359 /* If misalignment is known at the compile time then allow peeling
1360 only if natural alignment is reachable through peeling. */
1361 if (known_alignment_for_access_p (dr_info, vectype)
1362 && !aligned_access_p (dr_info, vectype))
1363 {
1364 HOST_WIDE_INT elmsize =
1365 int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1366 if (dump_enabled_p ())
1367 {
1368 dump_printf_loc (MSG_NOTE, vect_location,
1369 "data size = %wd. misalignment = %d.\n", elmsize,
1370 dr_misalignment (dr_info, vectype));
1371 }
1372 if (dr_misalignment (dr_info, vectype) % elmsize)
1373 {
1374 if (dump_enabled_p ())
1375 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1376 "data size does not divide the misalignment.\n");
1377 return false;
1378 }
1379 }
1380
1381 if (!known_alignment_for_access_p (dr_info, vectype))
1382 {
1383 tree type = TREE_TYPE (DR_REF (dr_info->dr));
1384 bool is_packed = not_size_aligned (DR_REF (dr_info->dr));
1385 if (dump_enabled_p ())
1386 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1387 "Unknown misalignment, %snaturally aligned\n",
1388 is_packed ? "not " : "");
1389 return targetm.vectorize.vector_alignment_reachable (type, is_packed);
1390 }
1391
1392 return true;
1393 }
1394
1395
1396 /* Calculate the cost of the memory access represented by DR_INFO. */
1397
1398 static void
1399 vect_get_data_access_cost (vec_info *vinfo, dr_vec_info *dr_info,
1400 dr_alignment_support alignment_support_scheme,
1401 int misalignment,
1402 unsigned int *inside_cost,
1403 unsigned int *outside_cost,
1404 stmt_vector_for_cost *body_cost_vec,
1405 stmt_vector_for_cost *prologue_cost_vec)
1406 {
1407 stmt_vec_info stmt_info = dr_info->stmt;
1408 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
1409 int ncopies;
1410
1411 if (PURE_SLP_STMT (stmt_info))
1412 ncopies = 1;
1413 else
1414 ncopies = vect_get_num_copies (loop_vinfo, STMT_VINFO_VECTYPE (stmt_info));
1415
1416 if (DR_IS_READ (dr_info->dr))
1417 vect_get_load_cost (vinfo, stmt_info, ncopies, alignment_support_scheme,
1418 misalignment, true, inside_cost,
1419 outside_cost, prologue_cost_vec, body_cost_vec, false);
1420 else
1421 vect_get_store_cost (vinfo, stmt_info, ncopies, alignment_support_scheme,
1422 misalignment, inside_cost, body_cost_vec);
1423
1424 if (dump_enabled_p ())
1425 dump_printf_loc (MSG_NOTE, vect_location,
1426 "vect_get_data_access_cost: inside_cost = %d, "
1427 "outside_cost = %d.\n", *inside_cost, *outside_cost);
1428 }
1429
1430
1431 typedef struct _vect_peel_info
1432 {
1433 dr_vec_info *dr_info;
1434 int npeel;
1435 unsigned int count;
1436 } *vect_peel_info;
1437
1438 typedef struct _vect_peel_extended_info
1439 {
1440 vec_info *vinfo;
1441 struct _vect_peel_info peel_info;
1442 unsigned int inside_cost;
1443 unsigned int outside_cost;
1444 } *vect_peel_extended_info;
1445
1446
1447 /* Peeling hashtable helpers. */
1448
1449 struct peel_info_hasher : free_ptr_hash <_vect_peel_info>
1450 {
1451 static inline hashval_t hash (const _vect_peel_info *);
1452 static inline bool equal (const _vect_peel_info *, const _vect_peel_info *);
1453 };
1454
1455 inline hashval_t
1456 peel_info_hasher::hash (const _vect_peel_info *peel_info)
1457 {
1458 return (hashval_t) peel_info->npeel;
1459 }
1460
1461 inline bool
1462 peel_info_hasher::equal (const _vect_peel_info *a, const _vect_peel_info *b)
1463 {
1464 return (a->npeel == b->npeel);
1465 }
1466
1467
1468 /* Insert DR_INFO into peeling hash table with NPEEL as key. */
1469
1470 static void
1471 vect_peeling_hash_insert (hash_table<peel_info_hasher> *peeling_htab,
1472 loop_vec_info loop_vinfo, dr_vec_info *dr_info,
1473 int npeel, bool supportable_if_not_aligned)
1474 {
1475 struct _vect_peel_info elem, *slot;
1476 _vect_peel_info **new_slot;
1477
1478 elem.npeel = npeel;
1479 slot = peeling_htab->find (&elem);
1480 if (slot)
1481 slot->count++;
1482 else
1483 {
1484 slot = XNEW (struct _vect_peel_info);
1485 slot->npeel = npeel;
1486 slot->dr_info = dr_info;
1487 slot->count = 1;
1488 new_slot = peeling_htab->find_slot (slot, INSERT);
1489 *new_slot = slot;
1490 }
1491
1492 /* If this DR is not supported with unknown misalignment then bias
1493 this slot when the cost model is disabled. */
1494 if (!supportable_if_not_aligned
1495 && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1496 slot->count += VECT_MAX_COST;
1497 }
1498
1499
1500 /* Traverse peeling hash table to find peeling option that aligns maximum
1501 number of data accesses. */
1502
1503 int
1504 vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
1505 _vect_peel_extended_info *max)
1506 {
1507 vect_peel_info elem = *slot;
1508
1509 if (elem->count > max->peel_info.count
1510 || (elem->count == max->peel_info.count
1511 && max->peel_info.npeel > elem->npeel))
1512 {
1513 max->peel_info.npeel = elem->npeel;
1514 max->peel_info.count = elem->count;
1515 max->peel_info.dr_info = elem->dr_info;
1516 }
1517
1518 return 1;
1519 }
1520
1521 /* Get the costs of peeling NPEEL iterations for LOOP_VINFO, checking
1522 data access costs for all data refs. If NPEEL is nonzero, DR0_INFO's
1523 misalignment is taken to be zero after peeling, and the misalignment of
1524 the other references is adjusted, or treated as unknown, accordingly. */
1525
1526 static void
1527 vect_get_peeling_costs_all_drs (loop_vec_info loop_vinfo,
1528 dr_vec_info *dr0_info,
1529 unsigned int *inside_cost,
1530 unsigned int *outside_cost,
1531 stmt_vector_for_cost *body_cost_vec,
1532 stmt_vector_for_cost *prologue_cost_vec,
1533 unsigned int npeel)
1534 {
1535 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1536
1537 bool dr0_alignment_known_p
1538 = (dr0_info
1539 && known_alignment_for_access_p (dr0_info,
1540 STMT_VINFO_VECTYPE (dr0_info->stmt)));
1541
1542 for (data_reference *dr : datarefs)
1543 {
1544 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1545 if (!vect_relevant_for_alignment_p (dr_info))
1546 continue;
1547
1548 tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
1549 dr_alignment_support alignment_support_scheme;
1550 int misalignment;
1551 unsigned HOST_WIDE_INT alignment;
1552
1553 bool negative = tree_int_cst_compare (DR_STEP (dr_info->dr),
1554 size_zero_node) < 0;
1555 poly_int64 off = 0;
1556 if (negative)
1557 off = ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
1558 * -TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype))));
1559
1560 if (npeel == 0)
1561 misalignment = dr_misalignment (dr_info, vectype, off);
1562 else if (dr_info == dr0_info
1563 || vect_dr_aligned_if_peeled_dr_is (dr_info, dr0_info))
1564 misalignment = 0;
1565 else if (!dr0_alignment_known_p
1566 || !known_alignment_for_access_p (dr_info, vectype)
1567 || !DR_TARGET_ALIGNMENT (dr_info).is_constant (&alignment))
1568 misalignment = DR_MISALIGNMENT_UNKNOWN;
1569 else
1570 {
1571 misalignment = dr_misalignment (dr_info, vectype, off);
1572 misalignment += npeel * TREE_INT_CST_LOW (DR_STEP (dr_info->dr));
1573 misalignment &= alignment - 1;
1574 }
1575 alignment_support_scheme
1576 = vect_supportable_dr_alignment (loop_vinfo, dr_info, vectype,
1577 misalignment);
1578
1579 vect_get_data_access_cost (loop_vinfo, dr_info,
1580 alignment_support_scheme, misalignment,
1581 inside_cost, outside_cost,
1582 body_cost_vec, prologue_cost_vec);
1583 }
1584 }
1585
1586 /* Traverse the peeling hash table and calculate the cost of each peeling
1587 option. Find the one with the lowest cost. */
1588
1589 int
1590 vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
1591 _vect_peel_extended_info *min)
1592 {
1593 vect_peel_info elem = *slot;
1594 int dummy;
1595 unsigned int inside_cost = 0, outside_cost = 0;
1596 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (min->vinfo);
1597 stmt_vector_for_cost prologue_cost_vec, body_cost_vec,
1598 epilogue_cost_vec;
1599
1600 prologue_cost_vec.create (2);
1601 body_cost_vec.create (2);
1602 epilogue_cost_vec.create (2);
1603
1604 vect_get_peeling_costs_all_drs (loop_vinfo, elem->dr_info, &inside_cost,
1605 &outside_cost, &body_cost_vec,
1606 &prologue_cost_vec, elem->npeel);
1607
1608 body_cost_vec.release ();
1609
1610 outside_cost += vect_get_known_peeling_cost
1611 (loop_vinfo, elem->npeel, &dummy,
1612 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1613 &prologue_cost_vec, &epilogue_cost_vec);
1614
1615 /* Prologue and epilogue costs are added to the target model later.
1616 These costs depend only on the scalar iteration cost, the
1617 number of peeling iterations finally chosen, and the number of
1618 misaligned statements. So discard the information found here. */
1619 prologue_cost_vec.release ();
1620 epilogue_cost_vec.release ();
1621
1622 if (inside_cost < min->inside_cost
1623 || (inside_cost == min->inside_cost
1624 && outside_cost < min->outside_cost))
1625 {
1626 min->inside_cost = inside_cost;
1627 min->outside_cost = outside_cost;
1628 min->peel_info.dr_info = elem->dr_info;
1629 min->peel_info.npeel = elem->npeel;
1630 min->peel_info.count = elem->count;
1631 }
1632
1633 return 1;
1634 }
1635
1636
1637 /* Choose best peeling option by traversing peeling hash table and either
1638 choosing an option with the lowest cost (if cost model is enabled) or the
1639 option that aligns as many accesses as possible. */
1640
1641 static struct _vect_peel_extended_info
1642 vect_peeling_hash_choose_best_peeling (hash_table<peel_info_hasher> *peeling_htab,
1643 loop_vec_info loop_vinfo)
1644 {
1645 struct _vect_peel_extended_info res;
1646
1647 res.peel_info.dr_info = NULL;
1648 res.vinfo = loop_vinfo;
1649
1650 if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1651 {
1652 res.inside_cost = INT_MAX;
1653 res.outside_cost = INT_MAX;
1654 peeling_htab->traverse <_vect_peel_extended_info *,
1655 vect_peeling_hash_get_lowest_cost> (&res);
1656 }
1657 else
1658 {
1659 res.peel_info.count = 0;
1660 peeling_htab->traverse <_vect_peel_extended_info *,
1661 vect_peeling_hash_get_most_frequent> (&res);
1662 res.inside_cost = 0;
1663 res.outside_cost = 0;
1664 }
1665
1666 return res;
1667 }
1668
1669 /* Return true if all data refs remain supportable after peeling DR0_INFO by NPEEL iterations. */
1670
1671 static bool
1672 vect_peeling_supportable (loop_vec_info loop_vinfo, dr_vec_info *dr0_info,
1673 unsigned npeel)
1674 {
1675 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1676 enum dr_alignment_support supportable_dr_alignment;
1677
1678 bool dr0_alignment_known_p
1679 = known_alignment_for_access_p (dr0_info,
1680 STMT_VINFO_VECTYPE (dr0_info->stmt));
1681
1682 /* Ensure that all data refs can be vectorized after the peel. */
1683 for (data_reference *dr : datarefs)
1684 {
1685 if (dr == dr0_info->dr)
1686 continue;
1687
1688 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1689 if (!vect_relevant_for_alignment_p (dr_info)
1690 || vect_dr_aligned_if_peeled_dr_is (dr_info, dr0_info))
1691 continue;
1692
1693 tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
1694 int misalignment;
1695 unsigned HOST_WIDE_INT alignment;
1696 if (!dr0_alignment_known_p
1697 || !known_alignment_for_access_p (dr_info, vectype)
1698 || !DR_TARGET_ALIGNMENT (dr_info).is_constant (&alignment))
1699 misalignment = DR_MISALIGNMENT_UNKNOWN;
1700 else
1701 {
1702 misalignment = dr_misalignment (dr_info, vectype);
1703 misalignment += npeel * TREE_INT_CST_LOW (DR_STEP (dr_info->dr));
1704 misalignment &= alignment - 1;
1705 }
1706 supportable_dr_alignment
1707 = vect_supportable_dr_alignment (loop_vinfo, dr_info, vectype,
1708 misalignment);
1709 if (supportable_dr_alignment == dr_unaligned_unsupported)
1710 return false;
1711 }
1712
1713 return true;
1714 }
1715
1716 /* Compare two data-references DRA and DRB to group them into chunks
1717 with related alignment. */
1718
1719 static int
1720 dr_align_group_sort_cmp (const void *dra_, const void *drb_)
1721 {
1722 data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_);
1723 data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_);
1724 int cmp;
1725
1726 /* Stabilize sort. */
1727 if (dra == drb)
1728 return 0;
1729
1730 /* Ordering of DRs according to base. */
1731 cmp = data_ref_compare_tree (DR_BASE_ADDRESS (dra),
1732 DR_BASE_ADDRESS (drb));
1733 if (cmp != 0)
1734 return cmp;
1735
1736 /* And according to DR_OFFSET. */
1737 cmp = data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
1738 if (cmp != 0)
1739 return cmp;
1740
1741 /* And by DR_STEP. */
1742 cmp = data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb));
1743 if (cmp != 0)
1744 return cmp;
1745
1746 /* Then sort by DR_INIT. In case of identical DRs sort by stmt UID. */
1747 cmp = data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb));
1748 if (cmp == 0)
1749 return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
1750 return cmp;
1751 }
1752
1753 /* Function vect_enhance_data_refs_alignment
1754
1755 This pass will use loop versioning and loop peeling in order to enhance
1756 the alignment of data references in the loop.
1757
1758 FOR NOW: we assume that whatever versioning/peeling takes place, only the
1759 original loop is to be vectorized. Any other loops that are created by
1760 the transformations performed in this pass are not supposed to be
1761 vectorized. This restriction will be relaxed.
1762
1763 This pass will require a cost model to guide it whether to apply peeling
1764 or versioning or a combination of the two. For example, the scheme that
1765 Intel uses when given a loop with several memory accesses is as follows:
1766 choose one memory access ('p') whose alignment you want to force by doing
1767 peeling. Then, either (1) generate a loop in which 'p' is aligned and all
1768 other accesses are not necessarily aligned, or (2) use loop versioning to
1769 generate one loop in which all accesses are aligned, and another loop in
1770 which only 'p' is necessarily aligned.
1771
1772 ("Automatic Intra-Register Vectorization for the Intel Architecture",
1773 Aart J.C. Bik, Milind Girkar, Paul M. Grey and Xinmin Tian, International
1774 Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
1775
1776 Devising a cost model is the most critical aspect of this work. It will
1777 guide us on which access to peel for, whether to use loop versioning, how
1778 many versions to create, etc. The cost model will probably consist of
1779 generic considerations as well as target specific considerations (on
1780 powerpc for example, misaligned stores are more painful than misaligned
1781 loads).
1782
1783 Here are the general steps involved in alignment enhancements:
1784
1785 -- original loop, before alignment analysis:
1786 for (i=0; i<N; i++){
1787 x = q[i]; # DR_MISALIGNMENT(q) = unknown
1788 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1789 }
1790
1791 -- After vect_compute_data_refs_alignment:
1792 for (i=0; i<N; i++){
1793 x = q[i]; # DR_MISALIGNMENT(q) = 3
1794 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1795 }
1796
1797 -- Possibility 1: we do loop versioning:
1798 if (p is aligned) {
1799 for (i=0; i<N; i++){ # loop 1A
1800 x = q[i]; # DR_MISALIGNMENT(q) = 3
1801 p[i] = y; # DR_MISALIGNMENT(p) = 0
1802 }
1803 }
1804 else {
1805 for (i=0; i<N; i++){ # loop 1B
1806 x = q[i]; # DR_MISALIGNMENT(q) = 3
1807 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1808 }
1809 }
1810
1811 -- Possibility 2: we do loop peeling:
1812 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1813 x = q[i];
1814 p[i] = y;
1815 }
1816 for (i = 3; i < N; i++){ # loop 2A
1817 x = q[i]; # DR_MISALIGNMENT(q) = 0
1818 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1819 }
1820
1821 -- Possibility 3: combination of loop peeling and versioning:
1822 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1823 x = q[i];
1824 p[i] = y;
1825 }
1826 if (p is aligned) {
1827 for (i = 3; i<N; i++){ # loop 3A
1828 x = q[i]; # DR_MISALIGNMENT(q) = 0
1829 p[i] = y; # DR_MISALIGNMENT(p) = 0
1830 }
1831 }
1832 else {
1833 for (i = 3; i<N; i++){ # loop 3B
1834 x = q[i]; # DR_MISALIGNMENT(q) = 0
1835 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1836 }
1837 }
1838
1839 These loops are later passed to loop_transform to be vectorized. The
1840 vectorizer will use the alignment information to guide the transformation
1841 (whether to generate regular loads/stores, or with special handling for
1842 misalignment). */
1843
1844 opt_result
1845 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
1846 {
1847 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1848 dr_vec_info *first_store = NULL;
1849 dr_vec_info *dr0_info = NULL;
1850 struct data_reference *dr;
1851 unsigned int i;
1852 bool do_peeling = false;
1853 bool do_versioning = false;
1854 unsigned int npeel = 0;
1855 bool one_misalignment_known = false;
1856 bool one_misalignment_unknown = false;
1857 bool one_dr_unsupportable = false;
1858 dr_vec_info *unsupportable_dr_info = NULL;
1859 unsigned int dr0_same_align_drs = 0, first_store_same_align_drs = 0;
1860 hash_table<peel_info_hasher> peeling_htab (1);
1861
1862 DUMP_VECT_SCOPE ("vect_enhance_data_refs_alignment");
1863
1864 /* Reset data so we can safely be called multiple times. */
1865 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
1866 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = 0;
1867
1868 if (LOOP_VINFO_DATAREFS (loop_vinfo).is_empty ())
1869 return opt_result::success ();
1870
1871 /* Sort the vector of datarefs so DRs that have the same or dependent
1872 alignment are next to each other. */
1873 auto_vec<data_reference_p> datarefs
1874 = LOOP_VINFO_DATAREFS (loop_vinfo).copy ();
1875 datarefs.qsort (dr_align_group_sort_cmp);
1876
1877 /* For each dataref, compute the number of other DRs that become aligned
1878 when we peel so that this dataref itself becomes aligned. */
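/* Illustration (assumed layout): for int a[], the accesses a[i] and a[i+4]
   share base, offset and step and their DR_INITs differ by 16 bytes, so with
   a 16-byte target alignment peeling to align either one also aligns the
   other, and each gets a count of 1 below.  */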
1879 auto_vec<unsigned> n_same_align_refs (datarefs.length ());
1880 n_same_align_refs.quick_grow_cleared (datarefs.length ());
1881 unsigned i0;
1882 for (i0 = 0; i0 < datarefs.length (); ++i0)
1883 if (DR_BASE_ADDRESS (datarefs[i0]))
1884 break;
1885 for (i = i0 + 1; i <= datarefs.length (); ++i)
1886 {
1887 if (i == datarefs.length ()
1888 || !operand_equal_p (DR_BASE_ADDRESS (datarefs[i0]),
1889 DR_BASE_ADDRESS (datarefs[i]), 0)
1890 || !operand_equal_p (DR_OFFSET (datarefs[i0]),
1891 DR_OFFSET (datarefs[i]), 0)
1892 || !operand_equal_p (DR_STEP (datarefs[i0]),
1893 DR_STEP (datarefs[i]), 0))
1894 {
1895 /* The subgroup [i0, i-1] now only differs in DR_INIT and
1896 possibly DR_TARGET_ALIGNMENT. Still the whole subgroup
1897 will get known misalignment if we align one of the refs
1898 with the largest DR_TARGET_ALIGNMENT. */
1899 for (unsigned j = i0; j < i; ++j)
1900 {
1901 dr_vec_info *dr_infoj = loop_vinfo->lookup_dr (datarefs[j]);
1902 for (unsigned k = i0; k < i; ++k)
1903 {
1904 if (k == j)
1905 continue;
1906 dr_vec_info *dr_infok = loop_vinfo->lookup_dr (datarefs[k]);
1907 if (vect_dr_aligned_if_related_peeled_dr_is (dr_infok,
1908 dr_infoj))
1909 n_same_align_refs[j]++;
1910 }
1911 }
1912 i0 = i;
1913 }
1914 }
1915
1916 /* While cost model enhancements are expected in the future, the high level
1917 view of the code at this time is as follows:
1918
1919 A) If there is a misaligned access then see if peeling to align
1920 this access can make all data references satisfy
1921 vect_supportable_dr_alignment. If so, update data structures
1922 as needed and return true.
1923
1924 B) If peeling wasn't possible and there is a data reference with an
1925 unknown misalignment that does not satisfy vect_supportable_dr_alignment
1926 then see if loop versioning checks can be used to make all data
1927 references satisfy vect_supportable_dr_alignment. If so, update
1928 data structures as needed and return true.
1929
1930 C) If neither peeling nor versioning were successful then return false if
1931 any data reference does not satisfy vect_supportable_dr_alignment.
1932
1933 D) Return true (all data references satisfy vect_supportable_dr_alignment).
1934
1935 Note, Possibility 3 above (which is peeling and versioning together) is not
1936 being done at this time. */
1937
1938 /* (1) Peeling to force alignment. */
1939
1940 /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
1941 Considerations:
1942 + How many accesses will become aligned due to the peeling
1943 - How many accesses will become unaligned due to the peeling,
1944 and the cost of misaligned accesses.
1945 - The cost of peeling (the extra runtime checks, the increase
1946 in code size). */
1947
1948 FOR_EACH_VEC_ELT (datarefs, i, dr)
1949 {
1950 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1951 if (!vect_relevant_for_alignment_p (dr_info))
1952 continue;
1953
1954 stmt_vec_info stmt_info = dr_info->stmt;
1955 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1956 do_peeling = vector_alignment_reachable_p (dr_info);
1957 if (do_peeling)
1958 {
1959 if (known_alignment_for_access_p (dr_info, vectype))
1960 {
1961 unsigned int npeel_tmp = 0;
1962 bool negative = tree_int_cst_compare (DR_STEP (dr),
1963 size_zero_node) < 0;
1964
1965 /* If known_alignment_for_access_p then we have set
1966 DR_MISALIGNMENT, which is only done if we know it at compile
1967 time, so it is safe to assume the target alignment is constant.
1968 */
1969 unsigned int target_align =
1970 DR_TARGET_ALIGNMENT (dr_info).to_constant ();
1971 unsigned HOST_WIDE_INT dr_size = vect_get_scalar_dr_size (dr_info);
1972 poly_int64 off = 0;
1973 if (negative)
1974 off = (TYPE_VECTOR_SUBPARTS (vectype) - 1) * -dr_size;
1975 unsigned int mis = dr_misalignment (dr_info, vectype, off);
1976 mis = negative ? mis : -mis;
1977 if (mis != 0)
1978 npeel_tmp = (mis & (target_align - 1)) / dr_size;
1979
1980 /* For multiple types, it is possible that the bigger type access
1981 will have more than one peeling option. E.g., a loop with two
1982 types: one of size (vector size / 4), and the other one of
1983 size (vector size / 8). The vectorization factor will be 8. If both
1984 accesses are misaligned by 3, the first one needs one scalar
1985 iteration to be aligned, and the second one needs 5. But the
1986 first one will be aligned also by peeling 5 scalar
1987 iterations, and in that case both accesses will be aligned.
1988 Hence, except for the immediate peeling amount, we also want
1989 to try to add full vector size, while we don't exceed
1990 vectorization factor.
1991 We do this automatically for cost model, since we calculate
1992 cost for every peeling option. */
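/* Worked example (assumed numbers): target_align = 16 bytes, dr_size = 4
   bytes and a misalignment of 12 bytes give npeel_tmp = 1 above, and the
   loop below additionally records the candidates 5, 9, ... as long as they
   do not exceed NSCALARS.  */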
1993 poly_uint64 nscalars = npeel_tmp;
1994 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1995 {
1996 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1997 nscalars = (STMT_SLP_TYPE (stmt_info)
1998 ? vf * DR_GROUP_SIZE (stmt_info) : vf);
1999 }
2000
2001 /* Save info about DR in the hash table. Also include peeling
2002 amounts according to the explanation above. Indicate
2003 the alignment status when the ref is not aligned.
2004 ??? Rather than using unknown alignment here we should
2005 prune all entries from the peeling hashtable which cause
2006 DRs to be not supported. */
2007 bool supportable_if_not_aligned
2008 = vect_supportable_dr_alignment
2009 (loop_vinfo, dr_info, vectype, DR_MISALIGNMENT_UNKNOWN);
2010 while (known_le (npeel_tmp, nscalars))
2011 {
2012 vect_peeling_hash_insert (&peeling_htab, loop_vinfo,
2013 dr_info, npeel_tmp,
2014 supportable_if_not_aligned);
2015 npeel_tmp += MAX (1, target_align / dr_size);
2016 }
2017
2018 one_misalignment_known = true;
2019 }
2020 else
2021 {
2022 /* If we don't know any misalignment values, we prefer
2023 peeling for the data-ref that has the maximum number of data-refs
2024 with the same alignment, unless the target prefers to align
2025 stores over loads. */
2026 unsigned same_align_drs = n_same_align_refs[i];
2027 if (!dr0_info
2028 || dr0_same_align_drs < same_align_drs)
2029 {
2030 dr0_same_align_drs = same_align_drs;
2031 dr0_info = dr_info;
2032 }
2033 /* For data-refs with the same number of related
2034 accesses prefer the one where the misalign
2035 computation will be invariant in the outermost loop. */
2036 else if (dr0_same_align_drs == same_align_drs)
2037 {
2038 class loop *ivloop0, *ivloop;
2039 ivloop0 = outermost_invariant_loop_for_expr
2040 (loop, DR_BASE_ADDRESS (dr0_info->dr));
2041 ivloop = outermost_invariant_loop_for_expr
2042 (loop, DR_BASE_ADDRESS (dr));
2043 if ((ivloop && !ivloop0)
2044 || (ivloop && ivloop0
2045 && flow_loop_nested_p (ivloop, ivloop0)))
2046 dr0_info = dr_info;
2047 }
2048
2049 one_misalignment_unknown = true;
2050
2051 /* Check for data refs with unsupportable alignment that
2052 can be peeled. */
2053 enum dr_alignment_support supportable_dr_alignment
2054 = vect_supportable_dr_alignment (loop_vinfo, dr_info, vectype,
2055 DR_MISALIGNMENT_UNKNOWN);
2056 if (supportable_dr_alignment == dr_unaligned_unsupported)
2057 {
2058 one_dr_unsupportable = true;
2059 unsupportable_dr_info = dr_info;
2060 }
2061
2062 if (!first_store && DR_IS_WRITE (dr))
2063 {
2064 first_store = dr_info;
2065 first_store_same_align_drs = same_align_drs;
2066 }
2067 }
2068 }
2069 else
2070 {
2071 if (!aligned_access_p (dr_info, vectype))
2072 {
2073 if (dump_enabled_p ())
2074 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2075 "vector alignment may not be reachable\n");
2076 break;
2077 }
2078 }
2079 }
2080
2081 /* Check if we can possibly peel the loop. */
2082 if (!vect_can_advance_ivs_p (loop_vinfo)
2083 || !slpeel_can_duplicate_loop_p (loop, single_exit (loop))
2084 || loop->inner)
2085 do_peeling = false;
2086
2087 struct _vect_peel_extended_info peel_for_known_alignment;
2088 struct _vect_peel_extended_info peel_for_unknown_alignment;
2089 struct _vect_peel_extended_info best_peel;
2090
2091 peel_for_unknown_alignment.inside_cost = INT_MAX;
2092 peel_for_unknown_alignment.outside_cost = INT_MAX;
2093 peel_for_unknown_alignment.peel_info.count = 0;
2094
2095 if (do_peeling
2096 && one_misalignment_unknown)
2097 {
2098 /* Check if the target prefers to align stores over loads, i.e., if
2099 misaligned stores are more expensive than misaligned loads (taking
2100 drs with same alignment into account). */
2101 unsigned int load_inside_cost = 0;
2102 unsigned int load_outside_cost = 0;
2103 unsigned int store_inside_cost = 0;
2104 unsigned int store_outside_cost = 0;
2105 unsigned int estimated_npeels = vect_vf_for_cost (loop_vinfo) / 2;
2106
2107 stmt_vector_for_cost dummy;
2108 dummy.create (2);
2109 vect_get_peeling_costs_all_drs (loop_vinfo, dr0_info,
2110 &load_inside_cost,
2111 &load_outside_cost,
2112 &dummy, &dummy, estimated_npeels);
2113 dummy.release ();
2114
2115 if (first_store)
2116 {
2117 dummy.create (2);
2118 vect_get_peeling_costs_all_drs (loop_vinfo, first_store,
2119 &store_inside_cost,
2120 &store_outside_cost,
2121 &dummy, &dummy,
2122 estimated_npeels);
2123 dummy.release ();
2124 }
2125 else
2126 {
2127 store_inside_cost = INT_MAX;
2128 store_outside_cost = INT_MAX;
2129 }
2130
2131 if (load_inside_cost > store_inside_cost
2132 || (load_inside_cost == store_inside_cost
2133 && load_outside_cost > store_outside_cost))
2134 {
2135 dr0_info = first_store;
2136 dr0_same_align_drs = first_store_same_align_drs;
2137 peel_for_unknown_alignment.inside_cost = store_inside_cost;
2138 peel_for_unknown_alignment.outside_cost = store_outside_cost;
2139 }
2140 else
2141 {
2142 peel_for_unknown_alignment.inside_cost = load_inside_cost;
2143 peel_for_unknown_alignment.outside_cost = load_outside_cost;
2144 }
2145
2146 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
2147 prologue_cost_vec.create (2);
2148 epilogue_cost_vec.create (2);
2149
2150 int dummy2;
2151 peel_for_unknown_alignment.outside_cost += vect_get_known_peeling_cost
2152 (loop_vinfo, estimated_npeels, &dummy2,
2153 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
2154 &prologue_cost_vec, &epilogue_cost_vec);
2155
2156 prologue_cost_vec.release ();
2157 epilogue_cost_vec.release ();
2158
2159 peel_for_unknown_alignment.peel_info.count = dr0_same_align_drs + 1;
2160 }
2161
2162 peel_for_unknown_alignment.peel_info.npeel = 0;
2163 peel_for_unknown_alignment.peel_info.dr_info = dr0_info;
2164
2165 best_peel = peel_for_unknown_alignment;
2166
2167 peel_for_known_alignment.inside_cost = INT_MAX;
2168 peel_for_known_alignment.outside_cost = INT_MAX;
2169 peel_for_known_alignment.peel_info.count = 0;
2170 peel_for_known_alignment.peel_info.dr_info = NULL;
2171
2172 if (do_peeling && one_misalignment_known)
2173 {
2174 /* Peeling is possible, but there is no data access that is not supported
2175 unless aligned. So we try to choose the best possible peeling from
2176 the hash table. */
2177 peel_for_known_alignment = vect_peeling_hash_choose_best_peeling
2178 (&peeling_htab, loop_vinfo);
2179 }
2180
2181 /* Compare costs of peeling for known and unknown alignment. */
2182 if (peel_for_known_alignment.peel_info.dr_info != NULL
2183 && peel_for_unknown_alignment.inside_cost
2184 >= peel_for_known_alignment.inside_cost)
2185 {
2186 best_peel = peel_for_known_alignment;
2187
2188 /* If the best peeling for known alignment has NPEEL == 0, perform no
2189 peeling at all except if there is an unsupportable dr that we can
2190 align. */
2191 if (best_peel.peel_info.npeel == 0 && !one_dr_unsupportable)
2192 do_peeling = false;
2193 }
2194
2195 /* If there is an unsupportable data ref, prefer this over all choices so far
2196 since we'd have to discard a chosen peeling except when it accidentally
2197 aligned the unsupportable data ref. */
2198 if (one_dr_unsupportable)
2199 dr0_info = unsupportable_dr_info;
2200 else if (do_peeling)
2201 {
2202 /* Calculate the penalty for no peeling, i.e. leaving everything as-is.
2203 TODO: Use nopeel_outside_cost or get rid of it? */
2204 unsigned nopeel_inside_cost = 0;
2205 unsigned nopeel_outside_cost = 0;
2206
2207 stmt_vector_for_cost dummy;
2208 dummy.create (2);
2209 vect_get_peeling_costs_all_drs (loop_vinfo, NULL, &nopeel_inside_cost,
2210 &nopeel_outside_cost, &dummy, &dummy, 0);
2211 dummy.release ();
2212
2213 /* Add epilogue costs. As we do not peel for alignment here, no prologue
2214 costs will be recorded. */
2215 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
2216 prologue_cost_vec.create (2);
2217 epilogue_cost_vec.create (2);
2218
2219 int dummy2;
2220 nopeel_outside_cost += vect_get_known_peeling_cost
2221 (loop_vinfo, 0, &dummy2,
2222 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
2223 &prologue_cost_vec, &epilogue_cost_vec);
2224
2225 prologue_cost_vec.release ();
2226 epilogue_cost_vec.release ();
2227
2228 npeel = best_peel.peel_info.npeel;
2229 dr0_info = best_peel.peel_info.dr_info;
2230
2231 /* If not peeling at all is no more expensive than the best peeling we
2232 have so far, don't perform any peeling. */
2233 if (nopeel_inside_cost <= best_peel.inside_cost)
2234 do_peeling = false;
2235 }
2236
2237 if (do_peeling)
2238 {
2239 stmt_vec_info stmt_info = dr0_info->stmt;
2240 if (known_alignment_for_access_p (dr0_info,
2241 STMT_VINFO_VECTYPE (stmt_info)))
2242 {
2243 bool negative = tree_int_cst_compare (DR_STEP (dr0_info->dr),
2244 size_zero_node) < 0;
2245 if (!npeel)
2246 {
2247 /* Since it's known at compile time, compute the number of
2248 iterations in the peeled loop (the peeling factor) for use in
2249 updating DR_MISALIGNMENT values. The peeling factor is the
2250 vectorization factor minus the misalignment as an element
2251 count. */
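/* Worked example (assumed numbers): an element size of 4 bytes, a 16-byte
   target alignment and a known misalignment of 8 bytes yield
       npeel = ((-8) & 15) / 4 = 2
   i.e. two scalar iterations are peeled so that DR0 becomes aligned.  */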
2252 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
2253 poly_int64 off = 0;
2254 if (negative)
2255 off = ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
2256 * -TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype))));
2257 unsigned int mis
2258 = dr_misalignment (dr0_info, vectype, off);
2259 mis = negative ? mis : -mis;
2260 /* If known_alignment_for_access_p then we have set
2261 DR_MISALIGNMENT, which is only done if we know it at compile
2262 time, so it is safe to assume the target alignment is constant.
2263 */
2264 unsigned int target_align =
2265 DR_TARGET_ALIGNMENT (dr0_info).to_constant ();
2266 npeel = ((mis & (target_align - 1))
2267 / vect_get_scalar_dr_size (dr0_info));
2268 }
2269
2270 /* For interleaved data access every iteration accesses all the
2271 members of the group, therefore we divide the number of iterations
2272 by the group size. */
2273 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2274 npeel /= DR_GROUP_SIZE (stmt_info);
2275
2276 if (dump_enabled_p ())
2277 dump_printf_loc (MSG_NOTE, vect_location,
2278 "Try peeling by %d\n", npeel);
2279 }
2280
2281 /* Ensure that all datarefs can be vectorized after the peel. */
2282 if (!vect_peeling_supportable (loop_vinfo, dr0_info, npeel))
2283 do_peeling = false;
2284
2285 /* Check if all datarefs are supportable and log. */
2286 if (do_peeling
2287 && npeel == 0
2288 && known_alignment_for_access_p (dr0_info,
2289 STMT_VINFO_VECTYPE (stmt_info)))
2290 return opt_result::success ();
2291
2292 /* Cost model #1 - honor --param vect-max-peeling-for-alignment. */
2293 if (do_peeling)
2294 {
2295 unsigned max_allowed_peel
2296 = param_vect_max_peeling_for_alignment;
2297 if (loop_cost_model (loop) <= VECT_COST_MODEL_CHEAP)
2298 max_allowed_peel = 0;
2299 if (max_allowed_peel != (unsigned)-1)
2300 {
2301 unsigned max_peel = npeel;
2302 if (max_peel == 0)
2303 {
2304 poly_uint64 target_align = DR_TARGET_ALIGNMENT (dr0_info);
2305 unsigned HOST_WIDE_INT target_align_c;
2306 if (target_align.is_constant (&target_align_c))
2307 max_peel =
2308 target_align_c / vect_get_scalar_dr_size (dr0_info) - 1;
2309 else
2310 {
2311 do_peeling = false;
2312 if (dump_enabled_p ())
2313 dump_printf_loc (MSG_NOTE, vect_location,
2314 "Disable peeling, max peels set and vector"
2315 " alignment unknown\n");
2316 }
2317 }
2318 if (max_peel > max_allowed_peel)
2319 {
2320 do_peeling = false;
2321 if (dump_enabled_p ())
2322 dump_printf_loc (MSG_NOTE, vect_location,
2323 "Disable peeling, max peels reached: %d\n", max_peel);
2324 }
2325 }
2326 }
2327
2328 /* Cost model #2 - if peeling may result in a remaining loop not
2329 iterating enough to be vectorized then do not peel. Since this
2330 is a cost heuristic rather than a correctness decision, use the
2331 most likely runtime value for variable vectorization factors. */
2332 if (do_peeling
2333 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2334 {
2335 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
2336 unsigned int max_peel = npeel == 0 ? assumed_vf - 1 : npeel;
2337 if ((unsigned HOST_WIDE_INT) LOOP_VINFO_INT_NITERS (loop_vinfo)
2338 < assumed_vf + max_peel)
2339 do_peeling = false;
2340 }
2341
2342 if (do_peeling)
2343 {
2344 /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
2345 If the misalignment of DR_i is identical to that of dr0 then set
2346 DR_MISALIGNMENT (DR_i) to zero. If the misalignment of DR_i and
2347 dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
2348 by the peeling factor times the element size of DR_i (MOD the
2349 vectorization factor times the size). Otherwise, the
2350 misalignment of DR_i must be set to unknown. */
2351 FOR_EACH_VEC_ELT (datarefs, i, dr)
2352 if (dr != dr0_info->dr)
2353 {
2354 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
2355 if (!vect_relevant_for_alignment_p (dr_info))
2356 continue;
2357
2358 vect_update_misalignment_for_peel (dr_info, dr0_info, npeel);
2359 }
2360
2361 LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0_info;
2362 if (npeel)
2363 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
2364 else
2365 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = -1;
2366 SET_DR_MISALIGNMENT (dr0_info,
2367 vect_dr_misalign_for_aligned_access (dr0_info));
2368 if (dump_enabled_p ())
2369 {
2370 dump_printf_loc (MSG_NOTE, vect_location,
2371 "Alignment of access forced using peeling.\n");
2372 dump_printf_loc (MSG_NOTE, vect_location,
2373 "Peeling for alignment will be applied.\n");
2374 }
2375
2376 /* The inside-loop cost will be accounted for in vectorizable_load
2377 and vectorizable_store correctly with adjusted alignments.
2378 Drop the body_cost_vec on the floor here. */
2379 return opt_result::success ();
2380 }
2381 }
2382
2383 /* (2) Versioning to force alignment. */
2384
2385 /* Try versioning if:
2386 1) optimize loop for speed and the cost-model is not cheap
2387 2) there is at least one unsupported misaligned data ref with an unknown
2388 misalignment, and
2389 3) all misaligned data refs with a known misalignment are supported, and
2390 4) the number of runtime alignment checks is within reason. */
2391
2392 do_versioning
2393 = (optimize_loop_nest_for_speed_p (loop)
2394 && !loop->inner /* FORNOW */
2395 && loop_cost_model (loop) > VECT_COST_MODEL_CHEAP);
2396
2397 if (do_versioning)
2398 {
2399 FOR_EACH_VEC_ELT (datarefs, i, dr)
2400 {
2401 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
2402 if (!vect_relevant_for_alignment_p (dr_info))
2403 continue;
2404
2405 stmt_vec_info stmt_info = dr_info->stmt;
2406 if (STMT_VINFO_STRIDED_P (stmt_info))
2407 {
2408 do_versioning = false;
2409 break;
2410 }
2411
2412 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
2413 bool negative = tree_int_cst_compare (DR_STEP (dr),
2414 size_zero_node) < 0;
2415 poly_int64 off = 0;
2416 if (negative)
2417 off = ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
2418 * -TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype))));
2419 int misalignment;
2420 if ((misalignment = dr_misalignment (dr_info, vectype, off)) == 0)
2421 continue;
2422
2423 enum dr_alignment_support supportable_dr_alignment
2424 = vect_supportable_dr_alignment (loop_vinfo, dr_info, vectype,
2425 misalignment);
2426 if (supportable_dr_alignment == dr_unaligned_unsupported)
2427 {
2428 if (misalignment != DR_MISALIGNMENT_UNKNOWN
2429 || (LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
2430 >= (unsigned) param_vect_max_version_for_alignment_checks))
2431 {
2432 do_versioning = false;
2433 break;
2434 }
2435
2436 /* At present we don't support versioning for alignment
2437 with variable VF, since there's no guarantee that the
2438 VF is a power of two. We could relax this if we added
2439 a way of enforcing a power-of-two size. */
2440 unsigned HOST_WIDE_INT size;
2441 if (!GET_MODE_SIZE (TYPE_MODE (vectype)).is_constant (&size))
2442 {
2443 do_versioning = false;
2444 break;
2445 }
2446
2447 /* Forcing alignment in the first iteration is no good if
2448 we don't keep it across iterations. For now, just disable
2449 versioning in this case.
2450 ?? We could actually unroll the loop to achieve the required
2451 overall step alignment, and forcing the alignment could be
2452 done by doing some iterations of the non-vectorized loop. */
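/* Illustration (assumed values): a vectorization factor of 4 and a
   DR_STEP_ALIGNMENT of 4 bytes give a combined step alignment of 16 bytes,
   which is a multiple of a 16-byte DR_TARGET_ALIGNMENT, so the alignment
   established by the runtime check is kept across vector iterations and
   versioning remains possible.  */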
2453 if (!multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
2454 * DR_STEP_ALIGNMENT (dr),
2455 DR_TARGET_ALIGNMENT (dr_info)))
2456 {
2457 do_versioning = false;
2458 break;
2459 }
2460
2461 /* The rightmost bits of an aligned address must be zeros.
2462 Construct the mask needed for this test. For example,
2463 GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
2464 mask must be 15 = 0xf. */
2465 int mask = size - 1;
2466
2467 /* FORNOW: use the same mask to test all potentially unaligned
2468 references in the loop. */
2469 if (LOOP_VINFO_PTR_MASK (loop_vinfo)
2470 && LOOP_VINFO_PTR_MASK (loop_vinfo) != mask)
2471 {
2472 do_versioning = false;
2473 break;
2474 }
2475
2476 LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
2477 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (stmt_info);
2478 }
2479 }
2480
2481 /* Versioning requires at least one misaligned data reference. */
2482 if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2483 do_versioning = false;
2484 else if (!do_versioning)
2485 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
2486 }
2487
2488 if (do_versioning)
2489 {
2490 const vec<stmt_vec_info> &may_misalign_stmts
2491 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
2492 stmt_vec_info stmt_info;
2493
2494 /* It can now be assumed that the data references in the statements
2495 in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
2496 of the loop being vectorized. */
2497 FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt_info)
2498 {
2499 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
2500 SET_DR_MISALIGNMENT (dr_info,
2501 vect_dr_misalign_for_aligned_access (dr_info));
2502 if (dump_enabled_p ())
2503 dump_printf_loc (MSG_NOTE, vect_location,
2504 "Alignment of access forced using versioning.\n");
2505 }
2506
2507 if (dump_enabled_p ())
2508 dump_printf_loc (MSG_NOTE, vect_location,
2509 "Versioning for alignment will be applied.\n");
2510
2511 /* Peeling and versioning can't be done together at this time. */
2512 gcc_assert (! (do_peeling && do_versioning));
2513
2514 return opt_result::success ();
2515 }
2516
2517 /* This point is reached if neither peeling nor versioning is being done. */
2518 gcc_assert (! (do_peeling || do_versioning));
2519
2520 return opt_result::success ();
2521 }
2522
2523
2524 /* Function vect_analyze_data_refs_alignment
2525
2526 Analyze the alignment of the data-references in the loop.
2527 Return FALSE if a data reference is found that cannot be vectorized. */
2528
2529 opt_result
2530 vect_analyze_data_refs_alignment (loop_vec_info loop_vinfo)
2531 {
2532 DUMP_VECT_SCOPE ("vect_analyze_data_refs_alignment");
2533
2534 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
2535 struct data_reference *dr;
2536 unsigned int i;
2537
2538 vect_record_base_alignments (loop_vinfo);
2539 FOR_EACH_VEC_ELT (datarefs, i, dr)
2540 {
2541 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
2542 if (STMT_VINFO_VECTORIZABLE (dr_info->stmt))
2543 {
2544 if (STMT_VINFO_GROUPED_ACCESS (dr_info->stmt)
2545 && DR_GROUP_FIRST_ELEMENT (dr_info->stmt) != dr_info->stmt)
2546 continue;
2547 vect_compute_data_ref_alignment (loop_vinfo, dr_info,
2548 STMT_VINFO_VECTYPE (dr_info->stmt));
2549 }
2550 }
2551
2552 return opt_result::success ();
2553 }
2554
2555
2556 /* Analyze alignment of DRs of stmts in NODE. */
2557
2558 static bool
2559 vect_slp_analyze_node_alignment (vec_info *vinfo, slp_tree node)
2560 {
2561 /* Alignment is maintained in the first element of the group. */
2562 stmt_vec_info first_stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
2563 first_stmt_info = DR_GROUP_FIRST_ELEMENT (first_stmt_info);
2564 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (first_stmt_info);
2565 tree vectype = SLP_TREE_VECTYPE (node);
2566 poly_uint64 vector_alignment
2567 = exact_div (targetm.vectorize.preferred_vector_alignment (vectype),
2568 BITS_PER_UNIT);
2569 if (dr_info->misalignment == DR_MISALIGNMENT_UNINITIALIZED)
2570 vect_compute_data_ref_alignment (vinfo, dr_info, SLP_TREE_VECTYPE (node));
2571 /* Re-analyze alignment when we're facing a vectorization with a bigger
2572 alignment requirement. */
2573 else if (known_lt (dr_info->target_alignment, vector_alignment))
2574 {
2575 poly_uint64 old_target_alignment = dr_info->target_alignment;
2576 int old_misalignment = dr_info->misalignment;
2577 vect_compute_data_ref_alignment (vinfo, dr_info, SLP_TREE_VECTYPE (node));
2578 /* But keep knowledge about a smaller alignment. */
2579 if (old_misalignment != DR_MISALIGNMENT_UNKNOWN
2580 && dr_info->misalignment == DR_MISALIGNMENT_UNKNOWN)
2581 {
2582 dr_info->target_alignment = old_target_alignment;
2583 dr_info->misalignment = old_misalignment;
2584 }
2585 }
2586 /* If we ever face unordered target alignments, the one analyzed first wins
2587 and the other will be treated as unknown in dr_misalignment. */
2588 return true;
2589 }
2590
2591 /* Function vect_slp_analyze_instance_alignment
2592
2593 Analyze the alignment of the data-references in the SLP instance.
2594 Return FALSE if a data reference is found that cannot be vectorized. */
2595
2596 bool
2597 vect_slp_analyze_instance_alignment (vec_info *vinfo,
2598 slp_instance instance)
2599 {
2600 DUMP_VECT_SCOPE ("vect_slp_analyze_instance_alignment");
2601
2602 slp_tree node;
2603 unsigned i;
2604 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, node)
2605 if (! vect_slp_analyze_node_alignment (vinfo, node))
2606 return false;
2607
2608 if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_store
2609 && ! vect_slp_analyze_node_alignment
2610 (vinfo, SLP_INSTANCE_TREE (instance)))
2611 return false;
2612
2613 return true;
2614 }
2615
2616
2617 /* Analyze groups of accesses: check that DR_INFO belongs to a group of
2618 accesses of legal size, step, etc. Detect gaps, single element
2619 interleaving, and other special cases. Set grouped access info.
2620 Collect groups of strided stores for further use in SLP analysis.
2621 Worker for vect_analyze_group_access. */
2622
2623 static bool
2624 vect_analyze_group_access_1 (vec_info *vinfo, dr_vec_info *dr_info)
2625 {
2626 data_reference *dr = dr_info->dr;
2627 tree step = DR_STEP (dr);
2628 tree scalar_type = TREE_TYPE (DR_REF (dr));
2629 HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
2630 stmt_vec_info stmt_info = dr_info->stmt;
2631 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
2632 bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo);
2633 HOST_WIDE_INT dr_step = -1;
2634 HOST_WIDE_INT groupsize, last_accessed_element = 1;
2635 bool slp_impossible = false;
2636
2637 /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2638 size of the interleaving group (including gaps). */
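/* For example (illustrative only): interleaved accesses a[3*i] and a[3*i+1]
   of a 4-byte type have DR_STEP 12, so GROUPSIZE is 3 and the unused
   a[3*i+2] slot shows up as a gap.  */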
2639 if (tree_fits_shwi_p (step))
2640 {
2641 dr_step = tree_to_shwi (step);
2642 /* Check that STEP is a multiple of type size. Otherwise there is
2643 a non-element-sized gap at the end of the group which we
2644 cannot represent in DR_GROUP_GAP or DR_GROUP_SIZE.
2645 ??? As we can handle non-constant step fine here we should
2646 simply remove uses of DR_GROUP_GAP between the last and first
2647 element and instead rely on DR_STEP. DR_GROUP_SIZE then would
2648 simply not include that gap. */
2649 if ((dr_step % type_size) != 0)
2650 {
2651 if (dump_enabled_p ())
2652 dump_printf_loc (MSG_NOTE, vect_location,
2653 "Step %T is not a multiple of the element size"
2654 " for %T\n",
2655 step, DR_REF (dr));
2656 return false;
2657 }
2658 groupsize = absu_hwi (dr_step) / type_size;
2659 }
2660 else
2661 groupsize = 0;
2662
2663 /* A non-consecutive access is possible only if it is part of an interleaving group. */
2664 if (!DR_GROUP_FIRST_ELEMENT (stmt_info))
2665 {
2666 /* Check if this DR is part of an interleaving group of which it is the
2667 single element accessed in the loop. */
2668
2669 /* Gaps are supported only for loads. STEP must be a multiple of the type
2670 size. */
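/* E.g. (sketch):
       for (i = 0; i < N; i++)
         x += a[2*i];        # step of 2 elements: GROUPSIZE 2, GAP 1
   is handled as single element interleaving with a gap of one element.  */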
2671 if (DR_IS_READ (dr)
2672 && (dr_step % type_size) == 0
2673 && groupsize > 0
2674 /* This could be UINT_MAX but as we are generating code in a very
2675 inefficient way we have to cap earlier.
2676 See PR91403 for example. */
2677 && groupsize <= 4096)
2678 {
2679 DR_GROUP_FIRST_ELEMENT (stmt_info) = stmt_info;
2680 DR_GROUP_SIZE (stmt_info) = groupsize;
2681 DR_GROUP_GAP (stmt_info) = groupsize - 1;
2682 if (dump_enabled_p ())
2683 dump_printf_loc (MSG_NOTE, vect_location,
2684 "Detected single element interleaving %T"
2685 " step %T\n",
2686 DR_REF (dr), step);
2687
2688 return true;
2689 }
2690
2691 if (dump_enabled_p ())
2692 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2693 "not consecutive access %G", stmt_info->stmt);
2694
2695 if (bb_vinfo)
2696 {
2697 /* Mark the statement as unvectorizable. */
2698 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
2699 return true;
2700 }
2701
2702 if (dump_enabled_p ())
2703 dump_printf_loc (MSG_NOTE, vect_location, "using strided accesses\n");
2704 STMT_VINFO_STRIDED_P (stmt_info) = true;
2705 return true;
2706 }
2707
2708 if (DR_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info)
2709 {
2710 /* First stmt in the interleaving chain. Check the chain. */
2711 stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (stmt_info);
2712 struct data_reference *data_ref = dr;
2713 unsigned int count = 1;
2714 tree prev_init = DR_INIT (data_ref);
2715 HOST_WIDE_INT diff, gaps = 0;
2716
2717 /* By construction, all group members have INTEGER_CST DR_INITs. */
2718 while (next)
2719 {
2720 /* We never have the same DR multiple times. */
2721 gcc_assert (tree_int_cst_compare (DR_INIT (data_ref),
2722 DR_INIT (STMT_VINFO_DATA_REF (next))) != 0);
2723
2724 data_ref = STMT_VINFO_DATA_REF (next);
2725
2726 /* All group members have the same STEP by construction. */
2727 gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0));
2728
2729 /* Check that the distance between two accesses is equal to the type
2730 size. Otherwise, we have gaps. */
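/* Sketch with an assumed layout: for 4-byte elements with DR_INITs 0, 4 and
   12 the diffs computed below are 1 and 2, so one element-sized gap is
   recorded between the second and third member.  */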
2731 diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
2732 - TREE_INT_CST_LOW (prev_init)) / type_size;
2733 if (diff < 1 || diff > UINT_MAX)
2734 {
2735 /* For artificial testcases with array accesses with large
2736 constant indices we can run into overflow issues which
2737 can end up fooling the groupsize constraint below so
2738 check the individual gaps (which are represented as
2739 unsigned int) as well. */
2740 if (dump_enabled_p ())
2741 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2742 "interleaved access with gap larger "
2743 "than representable\n");
2744 return false;
2745 }
2746 if (diff != 1)
2747 {
2748 /* FORNOW: SLP of accesses with gaps is not supported. */
2749 slp_impossible = true;
2750 if (DR_IS_WRITE (data_ref))
2751 {
2752 if (dump_enabled_p ())
2753 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2754 "interleaved store with gaps\n");
2755 return false;
2756 }
2757
2758 gaps += diff - 1;
2759 }
2760
2761 last_accessed_element += diff;
2762
2763 /* Store the gap from the previous member of the group. If there is no
2764 gap in the access, DR_GROUP_GAP is always 1. */
2765 DR_GROUP_GAP (next) = diff;
2766
2767 prev_init = DR_INIT (data_ref);
2768 next = DR_GROUP_NEXT_ELEMENT (next);
2769 /* Count the number of data-refs in the chain. */
2770 count++;
2771 }
2772
2773 if (groupsize == 0)
2774 groupsize = count + gaps;
2775
2776 /* This could be UINT_MAX but as we are generating code in a very
2777 inefficient way we have to cap earlier. See PR78699 for example. */
2778 if (groupsize > 4096)
2779 {
2780 if (dump_enabled_p ())
2781 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2782 "group is too large\n");
2783 return false;
2784 }
2785
2786 /* Check that the size of the interleaving is equal to count for stores,
2787 i.e., that there are no gaps. */
2788 if (groupsize != count
2789 && !DR_IS_READ (dr))
2790 {
2791 groupsize = count;
2792 STMT_VINFO_STRIDED_P (stmt_info) = true;
2793 }
2794
2795 /* If there is a gap after the last load in the group it is the
2796 difference between the groupsize and the last accessed
2797 element.
2798 When there is no gap, this difference should be 0. */
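/* E.g. (illustrative): loads with DR_INITs 0 and 4 of a 4-byte type and a
   DR_STEP of 12 give GROUPSIZE 3 and LAST_ACCESSED_ELEMENT 2, so a trailing
   gap of one element is recorded on the first statement.  */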
2799 DR_GROUP_GAP (stmt_info) = groupsize - last_accessed_element;
2800
2801 DR_GROUP_SIZE (stmt_info) = groupsize;
2802 if (dump_enabled_p ())
2803 {
2804 dump_printf_loc (MSG_NOTE, vect_location,
2805 "Detected interleaving ");
2806 if (DR_IS_READ (dr))
2807 dump_printf (MSG_NOTE, "load ");
2808 else if (STMT_VINFO_STRIDED_P (stmt_info))
2809 dump_printf (MSG_NOTE, "strided store ");
2810 else
2811 dump_printf (MSG_NOTE, "store ");
2812 dump_printf (MSG_NOTE, "of size %u\n",
2813 (unsigned)groupsize);
2814 dump_printf_loc (MSG_NOTE, vect_location, "\t%G", stmt_info->stmt);
2815 next = DR_GROUP_NEXT_ELEMENT (stmt_info);
2816 while (next)
2817 {
2818 if (DR_GROUP_GAP (next) != 1)
2819 dump_printf_loc (MSG_NOTE, vect_location,
2820 "\t<gap of %d elements>\n",
2821 DR_GROUP_GAP (next) - 1);
2822 dump_printf_loc (MSG_NOTE, vect_location, "\t%G", next->stmt);
2823 next = DR_GROUP_NEXT_ELEMENT (next);
2824 }
2825 if (DR_GROUP_GAP (stmt_info) != 0)
2826 dump_printf_loc (MSG_NOTE, vect_location,
2827 "\t<gap of %d elements>\n",
2828 DR_GROUP_GAP (stmt_info));
2829 }
2830
2831 /* SLP: create an SLP data structure for every interleaving group of
2832 stores for further analysis in vect_analyze_slp. */
2833 if (DR_IS_WRITE (dr) && !slp_impossible)
2834 {
2835 if (loop_vinfo)
2836 LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt_info);
2837 if (bb_vinfo)
2838 BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt_info);
2839 }
2840 }
2841
2842 return true;
2843 }
2844
2845 /* Analyze groups of accesses: check that DR_INFO belongs to a group of
2846 accesses of legal size, step, etc. Detect gaps, single element
2847 interleaving, and other special cases. Set grouped access info.
2848 Collect groups of strided stores for further use in SLP analysis. */
2849
2850 static bool
2851 vect_analyze_group_access (vec_info *vinfo, dr_vec_info *dr_info)
2852 {
2853 if (!vect_analyze_group_access_1 (vinfo, dr_info))
2854 {
2855 /* Dissolve the group if present. */
2856 stmt_vec_info stmt_info = DR_GROUP_FIRST_ELEMENT (dr_info->stmt);
2857 while (stmt_info)
2858 {
2859 stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (stmt_info);
2860 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2861 DR_GROUP_NEXT_ELEMENT (stmt_info) = NULL;
2862 stmt_info = next;
2863 }
2864 return false;
2865 }
2866 return true;
2867 }
2868
2869 /* Analyze the access pattern of the data-reference DR_INFO.
2870 In case of non-consecutive accesses call vect_analyze_group_access() to
2871 analyze groups of accesses. */
2872
2873 static bool
2874 vect_analyze_data_ref_access (vec_info *vinfo, dr_vec_info *dr_info)
2875 {
2876 data_reference *dr = dr_info->dr;
2877 tree step = DR_STEP (dr);
2878 tree scalar_type = TREE_TYPE (DR_REF (dr));
2879 stmt_vec_info stmt_info = dr_info->stmt;
2880 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
2881 class loop *loop = NULL;
2882
2883 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
2884 return true;
2885
2886 if (loop_vinfo)
2887 loop = LOOP_VINFO_LOOP (loop_vinfo);
2888
2889 if (loop_vinfo && !step)
2890 {
2891 if (dump_enabled_p ())
2892 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2893 "bad data-ref access in loop\n");
2894 return false;
2895 }
2896
2897 /* Allow loads with zero step in inner-loop vectorization. */
2898 if (loop_vinfo && integer_zerop (step))
2899 {
2900 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2901 if (!nested_in_vect_loop_p (loop, stmt_info))
2902 return DR_IS_READ (dr);
2903 /* Allow references with zero step for outer loops marked
2904 with pragma omp simd only - it guarantees absence of
2905 loop-carried dependencies between inner loop iterations. */
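/* Shape of code this is meant for (illustrative only):
       #pragma omp simd
       for (i = 0; i < N; i++)    # outer loop, the one being vectorized
         for (j = 0; j < M; j++)
           ... = a[i];            # zero step with respect to the j loop
   The safelen guarantee from the pragma provides the required independence.  */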
2906 if (loop->safelen < 2)
2907 {
2908 if (dump_enabled_p ())
2909 dump_printf_loc (MSG_NOTE, vect_location,
2910 "zero step in inner loop of nest\n");
2911 return false;
2912 }
2913 }
2914
2915 if (loop && nested_in_vect_loop_p (loop, stmt_info))
2916 {
2917 /* Interleaved accesses are not yet supported within outer-loop
2918 vectorization for references in the inner-loop. */
2919 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2920
2921 /* For the rest of the analysis we use the outer-loop step. */
2922 step = STMT_VINFO_DR_STEP (stmt_info);
2923 if (integer_zerop (step))
2924 {
2925 if (dump_enabled_p ())
2926 dump_printf_loc (MSG_NOTE, vect_location,
2927 "zero step in outer loop.\n");
2928 return DR_IS_READ (dr);
2929 }
2930 }
2931
2932 /* Consecutive? */
2933 if (TREE_CODE (step) == INTEGER_CST)
2934 {
2935 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
2936 if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))
2937 || (dr_step < 0
2938 && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
2939 {
2940 /* Mark that it is not interleaving. */
2941 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2942 return true;
2943 }
2944 }
2945
2946 if (loop && nested_in_vect_loop_p (loop, stmt_info))
2947 {
2948 if (dump_enabled_p ())
2949 dump_printf_loc (MSG_NOTE, vect_location,
2950 "grouped access in outer loop.\n");
2951 return false;
2952 }
2953
2954
2955 /* Assume this is a DR handled by the non-constant strided load case. */
2956 if (TREE_CODE (step) != INTEGER_CST)
2957 return (STMT_VINFO_STRIDED_P (stmt_info)
2958 && (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
2959 || vect_analyze_group_access (vinfo, dr_info)));
2960
2961 /* Non-consecutive access - check if it is part of an interleaving group. */
2962 return vect_analyze_group_access (vinfo, dr_info);
2963 }
2964
2965 /* Compare two data-references DRA and DRB to group them into chunks
2966 suitable for grouping. */
2967
2968 static int
2969 dr_group_sort_cmp (const void *dra_, const void *drb_)
2970 {
2971 dr_vec_info *dra_info = *(dr_vec_info **)const_cast<void *>(dra_);
2972 dr_vec_info *drb_info = *(dr_vec_info **)const_cast<void *>(drb_);
2973 data_reference_p dra = dra_info->dr;
2974 data_reference_p drb = drb_info->dr;
2975 int cmp;
2976
2977 /* Stabilize sort. */
2978 if (dra == drb)
2979 return 0;
2980
2981 /* DRs with different group IDs never belong to the same group. */
2982 if (dra_info->group != drb_info->group)
2983 return dra_info->group < drb_info->group ? -1 : 1;
2984
2985 /* Ordering of DRs according to base. */
2986 cmp = data_ref_compare_tree (DR_BASE_ADDRESS (dra),
2987 DR_BASE_ADDRESS (drb));
2988 if (cmp != 0)
2989 return cmp;
2990
2991 /* And according to DR_OFFSET. */
2992 cmp = data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
2993 if (cmp != 0)
2994 return cmp;
2995
2996 /* Put reads before writes. */
2997 if (DR_IS_READ (dra) != DR_IS_READ (drb))
2998 return DR_IS_READ (dra) ? -1 : 1;
2999
3000 /* Then sort by access size. */
3001 cmp = data_ref_compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
3002 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
3003 if (cmp != 0)
3004 return cmp;
3005
3006 /* And by DR_STEP. */
3007 cmp = data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb));
3008 if (cmp != 0)
3009 return cmp;
3010
3011 /* Then sort by DR_INIT. In case of identical DRs sort by stmt UID. */
3012 cmp = data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb));
3013 if (cmp == 0)
3014 return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
3015 return cmp;
3016 }
3017
3018 /* If OP is the result of a conversion, return the unconverted value,
3019 otherwise return null. */
3020
3021 static tree
3022 strip_conversion (tree op)
3023 {
3024 if (TREE_CODE (op) != SSA_NAME)
3025 return NULL_TREE;
3026 gimple *stmt = SSA_NAME_DEF_STMT (op);
3027 if (!is_gimple_assign (stmt)
3028 || !CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (stmt)))
3029 return NULL_TREE;
3030 return gimple_assign_rhs1 (stmt);
3031 }
3032
3033 /* Return true if vectorizable_* routines can handle statements STMT1_INFO
3034 and STMT2_INFO being in a single group. When ALLOW_SLP_P, masked loads can
3035 be grouped in SLP mode. */
3036
3037 static bool
3038 can_group_stmts_p (stmt_vec_info stmt1_info, stmt_vec_info stmt2_info,
3039 bool allow_slp_p)
3040 {
3041 if (gimple_assign_single_p (stmt1_info->stmt))
3042 return gimple_assign_single_p (stmt2_info->stmt);
3043
3044 gcall *call1 = dyn_cast <gcall *> (stmt1_info->stmt);
3045 if (call1 && gimple_call_internal_p (call1))
3046 {
3047 /* Check for two masked loads or two masked stores. */
3048 gcall *call2 = dyn_cast <gcall *> (stmt2_info->stmt);
3049 if (!call2 || !gimple_call_internal_p (call2))
3050 return false;
3051 internal_fn ifn = gimple_call_internal_fn (call1);
3052 if (ifn != IFN_MASK_LOAD && ifn != IFN_MASK_STORE)
3053 return false;
3054 if (ifn != gimple_call_internal_fn (call2))
3055 return false;
3056
3057 /* Check that the masks are the same. Cope with casts of masks,
3058 like those created by build_mask_conversion. */
3059 tree mask1 = gimple_call_arg (call1, 2);
3060 tree mask2 = gimple_call_arg (call2, 2);
3061 if (!operand_equal_p (mask1, mask2, 0)
3062 && (ifn == IFN_MASK_STORE || !allow_slp_p))
3063 {
3064 mask1 = strip_conversion (mask1);
3065 if (!mask1)
3066 return false;
3067 mask2 = strip_conversion (mask2);
3068 if (!mask2)
3069 return false;
3070 if (!operand_equal_p (mask1, mask2, 0))
3071 return false;
3072 }
3073 return true;
3074 }
3075
3076 return false;
3077 }
3078
3079 /* Function vect_analyze_data_ref_accesses.
3080
3081 Analyze the access pattern of all the data references in the loop.
3082
3083 FORNOW: the only access pattern that is considered vectorizable is a
3084 simple step 1 (consecutive) access.
3085
3086 FORNOW: handle only arrays and pointer accesses. */
3087
3088 opt_result
3089 vect_analyze_data_ref_accesses (vec_info *vinfo,
3090 vec<int> *dataref_groups)
3091 {
3092 unsigned int i;
3093 vec<data_reference_p> datarefs = vinfo->shared->datarefs;
3094
3095 DUMP_VECT_SCOPE ("vect_analyze_data_ref_accesses");
3096
3097 if (datarefs.is_empty ())
3098 return opt_result::success ();
3099
3100 /* Sort the array of datarefs to make building the interleaving chains
3101 linear. Don't modify the original vector's order; it is needed for
3102 determining what dependencies are reversed. */
3103 vec<dr_vec_info *> datarefs_copy;
3104 datarefs_copy.create (datarefs.length ());
3105 for (unsigned i = 0; i < datarefs.length (); i++)
3106 {
3107 dr_vec_info *dr_info = vinfo->lookup_dr (datarefs[i]);
3108 /* If the caller computed DR grouping use that, otherwise group by
3109 basic blocks. */
3110 if (dataref_groups)
3111 dr_info->group = (*dataref_groups)[i];
3112 else
3113 dr_info->group = gimple_bb (DR_STMT (datarefs[i]))->index;
3114 datarefs_copy.quick_push (dr_info);
3115 }
3116 datarefs_copy.qsort (dr_group_sort_cmp);
3117 hash_set<stmt_vec_info> to_fixup;
3118
3119 /* Build the interleaving chains. */
3120 for (i = 0; i < datarefs_copy.length () - 1;)
3121 {
3122 dr_vec_info *dr_info_a = datarefs_copy[i];
3123 data_reference_p dra = dr_info_a->dr;
3124 int dra_group_id = dr_info_a->group;
3125 stmt_vec_info stmtinfo_a = dr_info_a->stmt;
3126 stmt_vec_info lastinfo = NULL;
3127 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
3128 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a))
3129 {
3130 ++i;
3131 continue;
3132 }
3133 for (i = i + 1; i < datarefs_copy.length (); ++i)
3134 {
3135 dr_vec_info *dr_info_b = datarefs_copy[i];
3136 data_reference_p drb = dr_info_b->dr;
3137 int drb_group_id = dr_info_b->group;
3138 stmt_vec_info stmtinfo_b = dr_info_b->stmt;
3139 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_b)
3140 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
3141 break;
3142
3143 /* ??? Imperfect sorting (non-compatible types, non-modulo
3144 accesses, same accesses) can cause a group to be artificially
3145 split here as we don't just skip over those. If it really
3146 matters we can push those to a worklist and re-iterate
3147 over them. Then we can just skip ahead to the next DR here. */
3148
3149 /* DRs in a different DR group should not be put into the same
3150 interleaving group. */
3151 if (dra_group_id != drb_group_id)
3152 break;
3153
3154 /* Check that the data-refs have same first location (except init)
3155 and they are both either store or load (not load and store,
3156 not masked loads or stores). */
3157 if (DR_IS_READ (dra) != DR_IS_READ (drb)
3158 || data_ref_compare_tree (DR_BASE_ADDRESS (dra),
3159 DR_BASE_ADDRESS (drb)) != 0
3160 || data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb)) != 0
3161 || !can_group_stmts_p (stmtinfo_a, stmtinfo_b, true))
3162 break;
3163
3164 /* Check that the data-refs have the same constant size. */
3165 tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
3166 tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
3167 if (!tree_fits_uhwi_p (sza)
3168 || !tree_fits_uhwi_p (szb)
3169 || !tree_int_cst_equal (sza, szb))
3170 break;
3171
3172 /* Check that the data-refs have the same step. */
3173 if (data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb)) != 0)
3174 break;
3175
3176 /* Check the types are compatible.
3177 ??? We don't distinguish this during sorting. */
3178 if (!types_compatible_p (TREE_TYPE (DR_REF (dra)),
3179 TREE_TYPE (DR_REF (drb))))
3180 break;
3181
3182 /* Check that the DR_INITs are compile-time constants. */
3183 if (!tree_fits_shwi_p (DR_INIT (dra))
3184 || !tree_fits_shwi_p (DR_INIT (drb)))
3185 break;
3186
3187 /* Different .GOMP_SIMD_LANE calls still give the same lane,
3188 just hold extra information. */
3189 if (STMT_VINFO_SIMD_LANE_ACCESS_P (stmtinfo_a)
3190 && STMT_VINFO_SIMD_LANE_ACCESS_P (stmtinfo_b)
3191 && data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb)) == 0)
3192 break;
3193
3194 /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb). */
3195 HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
3196 HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
3197 HOST_WIDE_INT init_prev
3198 = TREE_INT_CST_LOW (DR_INIT (datarefs_copy[i-1]->dr));
3199 gcc_assert (init_a <= init_b
3200 && init_a <= init_prev
3201 && init_prev <= init_b);
3202
3203 /* Do not place the same access in the interleaving chain twice. */
3204 if (init_b == init_prev)
3205 {
3206 gcc_assert (gimple_uid (DR_STMT (datarefs_copy[i-1]->dr))
3207 < gimple_uid (DR_STMT (drb)));
3208 /* Simply link in duplicates and fix up the chain below. */
3209 }
3210 else
3211 {
3212 /* If init_b == init_a + the size of the type * k, we have an
3213 interleaving, and DRA is accessed before DRB. */
3214 unsigned HOST_WIDE_INT type_size_a = tree_to_uhwi (sza);
3215 if (type_size_a == 0
3216 || (((unsigned HOST_WIDE_INT)init_b - init_a)
3217 % type_size_a != 0))
3218 break;
3219
3220 /* If we have a store, the accesses must be adjacent. This splits
3221 groups into chunks we support (we don't support vectorization
3222 of stores with gaps). */
3223 if (!DR_IS_READ (dra)
3224 && (((unsigned HOST_WIDE_INT)init_b - init_prev)
3225 != type_size_a))
3226 break;
3227
3228 /* If the step (when constant and nonzero) is smaller than the
3229 difference between the data-refs' inits, this splits groups into
3230 suitable sizes. */
3231 if (tree_fits_shwi_p (DR_STEP (dra)))
3232 {
3233 unsigned HOST_WIDE_INT step
3234 = absu_hwi (tree_to_shwi (DR_STEP (dra)));
3235 if (step != 0
3236 && step <= ((unsigned HOST_WIDE_INT)init_b - init_a))
3237 break;
3238 }
3239 }
3240
3241 if (dump_enabled_p ())
3242 dump_printf_loc (MSG_NOTE, vect_location,
3243 DR_IS_READ (dra)
3244 ? "Detected interleaving load %T and %T\n"
3245 : "Detected interleaving store %T and %T\n",
3246 DR_REF (dra), DR_REF (drb));
3247
3248 /* Link the found element into the group list. */
3249 if (!DR_GROUP_FIRST_ELEMENT (stmtinfo_a))
3250 {
3251 DR_GROUP_FIRST_ELEMENT (stmtinfo_a) = stmtinfo_a;
3252 lastinfo = stmtinfo_a;
3253 }
3254 DR_GROUP_FIRST_ELEMENT (stmtinfo_b) = stmtinfo_a;
3255 DR_GROUP_NEXT_ELEMENT (lastinfo) = stmtinfo_b;
3256 lastinfo = stmtinfo_b;
3257
3258 STMT_VINFO_SLP_VECT_ONLY (stmtinfo_a)
3259 = !can_group_stmts_p (stmtinfo_a, stmtinfo_b, false);
3260
3261 if (dump_enabled_p () && STMT_VINFO_SLP_VECT_ONLY (stmtinfo_a))
3262 dump_printf_loc (MSG_NOTE, vect_location,
3263 "Load suitable for SLP vectorization only.\n");
3264
3265 if (init_b == init_prev
3266 && !to_fixup.add (DR_GROUP_FIRST_ELEMENT (stmtinfo_a))
3267 && dump_enabled_p ())
3268 dump_printf_loc (MSG_NOTE, vect_location,
3269 "Queuing group with duplicate access for fixup\n");
3270 }
3271 }
3272
3273 /* Fix up groups with duplicate entries by splitting them. */
3274 while (1)
3275 {
3276 hash_set<stmt_vec_info>::iterator it = to_fixup.begin ();
3277 if (!(it != to_fixup.end ()))
3278 break;
3279 stmt_vec_info grp = *it;
3280 to_fixup.remove (grp);
3281
3282 /* Find the earliest duplicate group member. */
3283 unsigned first_duplicate = -1u;
3284 stmt_vec_info next, g = grp;
3285 while ((next = DR_GROUP_NEXT_ELEMENT (g)))
3286 {
3287 if (tree_int_cst_equal (DR_INIT (STMT_VINFO_DR_INFO (next)->dr),
3288 DR_INIT (STMT_VINFO_DR_INFO (g)->dr))
3289 && gimple_uid (STMT_VINFO_STMT (next)) < first_duplicate)
3290 first_duplicate = gimple_uid (STMT_VINFO_STMT (next));
3291 g = next;
3292 }
3293 if (first_duplicate == -1U)
3294 continue;
3295
3296 /* Then move all stmts after the first duplicate to a new group.
3297 Note this is a heuristic but one with the property that *it
3298 is fixed up completely. */
3299 g = grp;
3300 stmt_vec_info newgroup = NULL, ng = grp;
3301 while ((next = DR_GROUP_NEXT_ELEMENT (g)))
3302 {
3303 if (gimple_uid (STMT_VINFO_STMT (next)) >= first_duplicate)
3304 {
3305 DR_GROUP_NEXT_ELEMENT (g) = DR_GROUP_NEXT_ELEMENT (next);
3306 if (!newgroup)
3307 newgroup = next;
3308 else
3309 DR_GROUP_NEXT_ELEMENT (ng) = next;
3310 ng = next;
3311 DR_GROUP_FIRST_ELEMENT (ng) = newgroup;
3312 }
3313 else
3314 g = DR_GROUP_NEXT_ELEMENT (g);
3315 }
3316 DR_GROUP_NEXT_ELEMENT (ng) = NULL;
3317
3318 /* Fix up the new group, which may still contain duplicates. */
3319 to_fixup.add (newgroup);
3320 }
3321
3322 dr_vec_info *dr_info;
3323 FOR_EACH_VEC_ELT (datarefs_copy, i, dr_info)
3324 {
3325 if (STMT_VINFO_VECTORIZABLE (dr_info->stmt)
3326 && !vect_analyze_data_ref_access (vinfo, dr_info))
3327 {
3328 if (dump_enabled_p ())
3329 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3330 "not vectorized: complicated access pattern.\n");
3331
3332 if (is_a <bb_vec_info> (vinfo))
3333 {
3334 /* Mark the statement as not vectorizable. */
3335 STMT_VINFO_VECTORIZABLE (dr_info->stmt) = false;
3336 continue;
3337 }
3338 else
3339 {
3340 datarefs_copy.release ();
3341 return opt_result::failure_at (dr_info->stmt->stmt,
3342 "not vectorized:"
3343 " complicated access pattern.\n");
3344 }
3345 }
3346 }
3347
3348 datarefs_copy.release ();
3349 return opt_result::success ();
3350 }
3351
3352 /* Function vect_vfa_segment_size.
3353
3354 Input:
3355 DR_INFO: The data reference.
3356 LENGTH_FACTOR: segment length to consider.
3357
3358 Return a value suitable for the dr_with_seg_len::seg_len field.
3359 This is the "distance travelled" by the pointer from the first
3360 iteration in the segment to the last. Note that it does not include
3361 the size of the access; in effect it only describes the first byte. */
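/* For example, with DR_STEP equal to 4 bytes and LENGTH_FACTOR equal to
   8 iterations, the value computed below is 4 * (8 - 1) == 28 bytes:
   the distance from the first starting address in the segment to the
   last, excluding the size of the final access itself.  */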
3362
3363 static tree
3364 vect_vfa_segment_size (dr_vec_info *dr_info, tree length_factor)
3365 {
3366 length_factor = size_binop (MINUS_EXPR,
3367 fold_convert (sizetype, length_factor),
3368 size_one_node);
3369 return size_binop (MULT_EXPR, fold_convert (sizetype, DR_STEP (dr_info->dr)),
3370 length_factor);
3371 }
3372
3373 /* Return a value that, when added to abs (vect_vfa_segment_size (DR_INFO)),
3374 gives the worst-case number of bytes covered by the segment. */
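/* For example (illustrative numbers): a 4-byte scalar access that is the
   first element of a group with DR_GROUP_SIZE 4 and DR_GROUP_GAP 1 gets
   an access size of 4 * (4 - 1) == 12 bytes; the
   dr_explicit_realign_optimized case below additionally adds the
   difference between the vector size and the scalar size, since a full
   vector's worth might be accessed.  */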
3375
3376 static unsigned HOST_WIDE_INT
3377 vect_vfa_access_size (vec_info *vinfo, dr_vec_info *dr_info)
3378 {
3379 stmt_vec_info stmt_vinfo = dr_info->stmt;
3380 tree ref_type = TREE_TYPE (DR_REF (dr_info->dr));
3381 unsigned HOST_WIDE_INT ref_size = tree_to_uhwi (TYPE_SIZE_UNIT (ref_type));
3382 unsigned HOST_WIDE_INT access_size = ref_size;
3383 if (DR_GROUP_FIRST_ELEMENT (stmt_vinfo))
3384 {
3385 gcc_assert (DR_GROUP_FIRST_ELEMENT (stmt_vinfo) == stmt_vinfo);
3386 access_size *= DR_GROUP_SIZE (stmt_vinfo) - DR_GROUP_GAP (stmt_vinfo);
3387 }
3388 tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
3389 int misalignment;
3390 if (STMT_VINFO_VEC_STMTS (stmt_vinfo).exists ()
3391 && ((misalignment = dr_misalignment (dr_info, vectype)), true)
3392 && (vect_supportable_dr_alignment (vinfo, dr_info, vectype, misalignment)
3393 == dr_explicit_realign_optimized))
3394 {
3395 /* We might access a full vector's worth. */
3396 access_size += tree_to_uhwi (TYPE_SIZE_UNIT (vectype)) - ref_size;
3397 }
3398 return access_size;
3399 }
3400
3401 /* Get the minimum alignment for all the scalar accesses that DR_INFO
3402 describes. */
3403
3404 static unsigned int
3405 vect_vfa_align (dr_vec_info *dr_info)
3406 {
3407 return dr_alignment (dr_info->dr);
3408 }
3409
3410 /* Function vect_compile_time_alias.
3411
3412 Given data references A and B with equal base and offset, see whether
3413 the alias relation can be decided at compilation time. Return 1 if
3414 it can and the references alias, 0 if it can and the references do
3415 not alias, and -1 if we cannot decide at compile time. SEGMENT_LENGTH_A,
3416 SEGMENT_LENGTH_B, ACCESS_SIZE_A and ACCESS_SIZE_B are the equivalent
3417 of dr_with_seg_len::{seg_len,access_size} for A and B. */
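/* For example (illustrative numbers): with DR_INIT values 0 and 8,
   positive steps, segment lengths of 16 bytes and access sizes of
   4 bytes, the covered ranges are [0, 20) and [8, 28), which are known
   to overlap, so the function returns 1; if the second DR_INIT were 64
   instead, the ranges could not overlap and the function would
   return 0.  */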
3418
3419 static int
3420 vect_compile_time_alias (dr_vec_info *a, dr_vec_info *b,
3421 tree segment_length_a, tree segment_length_b,
3422 unsigned HOST_WIDE_INT access_size_a,
3423 unsigned HOST_WIDE_INT access_size_b)
3424 {
3425 poly_offset_int offset_a = wi::to_poly_offset (DR_INIT (a->dr));
3426 poly_offset_int offset_b = wi::to_poly_offset (DR_INIT (b->dr));
3427 poly_uint64 const_length_a;
3428 poly_uint64 const_length_b;
3429
3430 /* For negative step, we need to adjust address range by TYPE_SIZE_UNIT
3431 bytes, e.g., int a[3] -> a[1] range is [a+4, a+16) instead of
3432 [a, a+12) */
3433 if (tree_int_cst_compare (DR_STEP (a->dr), size_zero_node) < 0)
3434 {
3435 const_length_a = (-wi::to_poly_wide (segment_length_a)).force_uhwi ();
3436 offset_a -= const_length_a;
3437 }
3438 else
3439 const_length_a = tree_to_poly_uint64 (segment_length_a);
3440 if (tree_int_cst_compare (DR_STEP (b->dr), size_zero_node) < 0)
3441 {
3442 const_length_b = (-wi::to_poly_wide (segment_length_b)).force_uhwi ();
3443 offset_b -= const_length_b;
3444 }
3445 else
3446 const_length_b = tree_to_poly_uint64 (segment_length_b);
3447
3448 const_length_a += access_size_a;
3449 const_length_b += access_size_b;
3450
3451 if (ranges_known_overlap_p (offset_a, const_length_a,
3452 offset_b, const_length_b))
3453 return 1;
3454
3455 if (!ranges_maybe_overlap_p (offset_a, const_length_a,
3456 offset_b, const_length_b))
3457 return 0;
3458
3459 return -1;
3460 }
3461
3462 /* Return true if the minimum nonzero dependence distance for loop LOOP_DEPTH
3463 in DDR is >= VF. */
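/* For example (illustrative numbers, assuming the dependence is not
   reversed): a single distance vector entry of 8 at LOOP_DEPTH means the
   accesses are 8 scalar iterations apart, so the function returns true
   for VF == 4 but false for VF == 16, where one vector iteration could
   reach the conflicting access.  */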
3464
3465 static bool
3466 dependence_distance_ge_vf (data_dependence_relation *ddr,
3467 unsigned int loop_depth, poly_uint64 vf)
3468 {
3469 if (DDR_ARE_DEPENDENT (ddr) != NULL_TREE
3470 || DDR_NUM_DIST_VECTS (ddr) == 0)
3471 return false;
3472
3473 /* If the dependence is exact, we should have limited the VF instead. */
3474 gcc_checking_assert (DDR_COULD_BE_INDEPENDENT_P (ddr));
3475
3476 unsigned int i;
3477 lambda_vector dist_v;
3478 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
3479 {
3480 HOST_WIDE_INT dist = dist_v[loop_depth];
3481 if (dist != 0
3482 && !(dist > 0 && DDR_REVERSED_P (ddr))
3483 && maybe_lt ((unsigned HOST_WIDE_INT) abs_hwi (dist), vf))
3484 return false;
3485 }
3486
3487 if (dump_enabled_p ())
3488 dump_printf_loc (MSG_NOTE, vect_location,
3489 "dependence distance between %T and %T is >= VF\n",
3490 DR_REF (DDR_A (ddr)), DR_REF (DDR_B (ddr)));
3491
3492 return true;
3493 }
3494
3495 /* Dump LOWER_BOUND using flags DUMP_KIND. Dumps are known to be enabled. */
3496
3497 static void
3498 dump_lower_bound (dump_flags_t dump_kind, const vec_lower_bound &lower_bound)
3499 {
3500 dump_printf (dump_kind, "%s (%T) >= ",
3501 lower_bound.unsigned_p ? "unsigned" : "abs",
3502 lower_bound.expr);
3503 dump_dec (dump_kind, lower_bound.min_value);
3504 }
3505
3506 /* Record that the vectorized loop requires the vec_lower_bound described
3507 by EXPR, UNSIGNED_P and MIN_VALUE. */
3508
3509 static void
3510 vect_check_lower_bound (loop_vec_info loop_vinfo, tree expr, bool unsigned_p,
3511 poly_uint64 min_value)
3512 {
3513 vec<vec_lower_bound> &lower_bounds
3514 = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo);
3515 for (unsigned int i = 0; i < lower_bounds.length (); ++i)
3516 if (operand_equal_p (lower_bounds[i].expr, expr, 0))
3517 {
3518 unsigned_p &= lower_bounds[i].unsigned_p;
3519 min_value = upper_bound (lower_bounds[i].min_value, min_value);
3520 if (lower_bounds[i].unsigned_p != unsigned_p
3521 || maybe_lt (lower_bounds[i].min_value, min_value))
3522 {
3523 lower_bounds[i].unsigned_p = unsigned_p;
3524 lower_bounds[i].min_value = min_value;
3525 if (dump_enabled_p ())
3526 {
3527 dump_printf_loc (MSG_NOTE, vect_location,
3528 "updating run-time check to ");
3529 dump_lower_bound (MSG_NOTE, lower_bounds[i]);
3530 dump_printf (MSG_NOTE, "\n");
3531 }
3532 }
3533 return;
3534 }
3535
3536 vec_lower_bound lower_bound (expr, unsigned_p, min_value);
3537 if (dump_enabled_p ())
3538 {
3539 dump_printf_loc (MSG_NOTE, vect_location, "need a run-time check that ");
3540 dump_lower_bound (MSG_NOTE, lower_bound);
3541 dump_printf (MSG_NOTE, "\n");
3542 }
3543 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).safe_push (lower_bound);
3544 }
3545
3546 /* Return true if it's unlikely that the step of the vectorized form of DR_INFO
3547 will span fewer than GAP bytes. */
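/* For example (illustrative numbers): with an estimated vectorization
   factor of 4, a 4-byte scalar access and a group of size 2, the
   vectorized step is expected to span about 4 * 2 * 4 == 32 bytes, so a
   GAP of 16 counts as small while a GAP of 100 does not.  */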
3548
3549 static bool
3550 vect_small_gap_p (loop_vec_info loop_vinfo, dr_vec_info *dr_info,
3551 poly_int64 gap)
3552 {
3553 stmt_vec_info stmt_info = dr_info->stmt;
3554 HOST_WIDE_INT count
3555 = estimated_poly_value (LOOP_VINFO_VECT_FACTOR (loop_vinfo));
3556 if (DR_GROUP_FIRST_ELEMENT (stmt_info))
3557 count *= DR_GROUP_SIZE (DR_GROUP_FIRST_ELEMENT (stmt_info));
3558 return (estimated_poly_value (gap)
3559 <= count * vect_get_scalar_dr_size (dr_info));
3560 }
3561
3562 /* Return true if we know that there is no alias between DR_INFO_A and
3563 DR_INFO_B when abs (DR_STEP (DR_INFO_A->dr)) >= N for some N.
3564 When returning true, set *LOWER_BOUND_OUT to this N. */
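/* For example (illustrative numbers): for two 4-byte accesses with the
   same base, offset and step and with DR_INITs 0 and 8, the pair spans
   bytes [0, 12), so *LOWER_BOUND_OUT is set to 12 and the accesses
   cannot alias across iterations whenever abs (DR_STEP) >= 12.  */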
3565
3566 static bool
3567 vectorizable_with_step_bound_p (dr_vec_info *dr_info_a, dr_vec_info *dr_info_b,
3568 poly_uint64 *lower_bound_out)
3569 {
3570 /* Check that there is a constant gap of known sign between DR_A
3571 and DR_B. */
3572 data_reference *dr_a = dr_info_a->dr;
3573 data_reference *dr_b = dr_info_b->dr;
3574 poly_int64 init_a, init_b;
3575 if (!operand_equal_p (DR_BASE_ADDRESS (dr_a), DR_BASE_ADDRESS (dr_b), 0)
3576 || !operand_equal_p (DR_OFFSET (dr_a), DR_OFFSET (dr_b), 0)
3577 || !operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0)
3578 || !poly_int_tree_p (DR_INIT (dr_a), &init_a)
3579 || !poly_int_tree_p (DR_INIT (dr_b), &init_b)
3580 || !ordered_p (init_a, init_b))
3581 return false;
3582
3583 /* Sort DR_A and DR_B by the address they access. */
3584 if (maybe_lt (init_b, init_a))
3585 {
3586 std::swap (init_a, init_b);
3587 std::swap (dr_info_a, dr_info_b);
3588 std::swap (dr_a, dr_b);
3589 }
3590
3591 /* If the two accesses could be dependent within a scalar iteration,
3592 make sure that we'd retain their order. */
3593 if (maybe_gt (init_a + vect_get_scalar_dr_size (dr_info_a), init_b)
3594 && !vect_preserves_scalar_order_p (dr_info_a, dr_info_b))
3595 return false;
3596
3597 /* There is no alias if abs (DR_STEP) is greater than or equal to
3598 the bytes spanned by the combination of the two accesses. */
3599 *lower_bound_out = init_b + vect_get_scalar_dr_size (dr_info_b) - init_a;
3600 return true;
3601 }
3602
3603 /* Function vect_prune_runtime_alias_test_list.
3604
3605 Prune a list of ddrs to be tested at run-time by versioning for alias.
3606 Merge several alias checks into one if possible.
3607 Return failure if the resulting list of ddrs is longer than allowed by
3608 PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return success. */
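/* Each pair that survives the pruning below is later turned into a
   run-time test that the two segments do not overlap, conceptually of
   the form (sketch only)

       addr_a + seg_len_a + access_size_a <= addr_b
       || addr_b + seg_len_b + access_size_b <= addr_a

   although the code that emits the checks may use cheaper equivalent
   forms.  */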
3609
3610 opt_result
3611 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
3612 {
3613 typedef pair_hash <tree_operand_hash, tree_operand_hash> tree_pair_hash;
3614 hash_set <tree_pair_hash> compared_objects;
3615
3616 const vec<ddr_p> &may_alias_ddrs = LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
3617 vec<dr_with_seg_len_pair_t> &comp_alias_ddrs
3618 = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
3619 const vec<vec_object_pair> &check_unequal_addrs
3620 = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo);
3621 poly_uint64 vect_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3622 tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
3623
3624 ddr_p ddr;
3625 unsigned int i;
3626 tree length_factor;
3627
3628 DUMP_VECT_SCOPE ("vect_prune_runtime_alias_test_list");
3629
3630 /* Step values are irrelevant for aliasing if the number of vector
3631 iterations is equal to the number of scalar iterations (which can
3632 happen for fully-SLP loops). */
3633 bool vf_one_p = known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U);
3634
3635 if (!vf_one_p)
3636 {
3637 /* Convert the checks for nonzero steps into bound tests. */
3638 tree value;
3639 FOR_EACH_VEC_ELT (LOOP_VINFO_CHECK_NONZERO (loop_vinfo), i, value)
3640 vect_check_lower_bound (loop_vinfo, value, true, 1);
3641 }
3642
3643 if (may_alias_ddrs.is_empty ())
3644 return opt_result::success ();
3645
3646 comp_alias_ddrs.create (may_alias_ddrs.length ());
3647
3648 unsigned int loop_depth
3649 = index_in_loop_nest (LOOP_VINFO_LOOP (loop_vinfo)->num,
3650 LOOP_VINFO_LOOP_NEST (loop_vinfo));
3651
3652 /* First, we collect all data ref pairs for aliasing checks. */
3653 FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr)
3654 {
3655 poly_uint64 lower_bound;
3656 tree segment_length_a, segment_length_b;
3657 unsigned HOST_WIDE_INT access_size_a, access_size_b;
3658 unsigned int align_a, align_b;
3659
3660 /* Ignore the alias if the VF we chose ended up being no greater
3661 than the dependence distance. */
3662 if (dependence_distance_ge_vf (ddr, loop_depth, vect_factor))
3663 continue;
3664
3665 if (DDR_OBJECT_A (ddr))
3666 {
3667 vec_object_pair new_pair (DDR_OBJECT_A (ddr), DDR_OBJECT_B (ddr));
3668 if (!compared_objects.add (new_pair))
3669 {
3670 if (dump_enabled_p ())
3671 dump_printf_loc (MSG_NOTE, vect_location,
3672 "checking that %T and %T"
3673 " have different addresses\n",
3674 new_pair.first, new_pair.second);
3675 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).safe_push (new_pair);
3676 }
3677 continue;
3678 }
3679
3680 dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (DDR_A (ddr));
3681 stmt_vec_info stmt_info_a = dr_info_a->stmt;
3682
3683 dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (DDR_B (ddr));
3684 stmt_vec_info stmt_info_b = dr_info_b->stmt;
3685
3686 bool preserves_scalar_order_p
3687 = vect_preserves_scalar_order_p (dr_info_a, dr_info_b);
3688 bool ignore_step_p
3689 = (vf_one_p
3690 && (preserves_scalar_order_p
3691 || operand_equal_p (DR_STEP (dr_info_a->dr),
3692 DR_STEP (dr_info_b->dr))));
3693
3694 /* Skip the pair if inter-iteration dependencies are irrelevant
3695 and intra-iteration dependencies are guaranteed to be honored. */
3696 if (ignore_step_p
3697 && (preserves_scalar_order_p
3698 || vectorizable_with_step_bound_p (dr_info_a, dr_info_b,
3699 &lower_bound)))
3700 {
3701 if (dump_enabled_p ())
3702 dump_printf_loc (MSG_NOTE, vect_location,
3703 "no need for alias check between "
3704 "%T and %T when VF is 1\n",
3705 DR_REF (dr_info_a->dr), DR_REF (dr_info_b->dr));
3706 continue;
3707 }
3708
3709 /* See whether we can handle the alias using a bounds check on
3710 the step, and whether that's likely to be the best approach.
3711 (It might not be, for example, if the minimum step is much larger
3712 than the number of bytes handled by one vector iteration.) */
3713 if (!ignore_step_p
3714 && TREE_CODE (DR_STEP (dr_info_a->dr)) != INTEGER_CST
3715 && vectorizable_with_step_bound_p (dr_info_a, dr_info_b,
3716 &lower_bound)
3717 && (vect_small_gap_p (loop_vinfo, dr_info_a, lower_bound)
3718 || vect_small_gap_p (loop_vinfo, dr_info_b, lower_bound)))
3719 {
3720 bool unsigned_p = dr_known_forward_stride_p (dr_info_a->dr);
3721 if (dump_enabled_p ())
3722 {
3723 dump_printf_loc (MSG_NOTE, vect_location, "no alias between "
3724 "%T and %T when the step %T is outside ",
3725 DR_REF (dr_info_a->dr),
3726 DR_REF (dr_info_b->dr),
3727 DR_STEP (dr_info_a->dr));
3728 if (unsigned_p)
3729 dump_printf (MSG_NOTE, "[0");
3730 else
3731 {
3732 dump_printf (MSG_NOTE, "(");
3733 dump_dec (MSG_NOTE, poly_int64 (-lower_bound));
3734 }
3735 dump_printf (MSG_NOTE, ", ");
3736 dump_dec (MSG_NOTE, lower_bound);
3737 dump_printf (MSG_NOTE, ")\n");
3738 }
3739 vect_check_lower_bound (loop_vinfo, DR_STEP (dr_info_a->dr),
3740 unsigned_p, lower_bound);
3741 continue;
3742 }
3743
3744 stmt_vec_info dr_group_first_a = DR_GROUP_FIRST_ELEMENT (stmt_info_a);
3745 if (dr_group_first_a)
3746 {
3747 stmt_info_a = dr_group_first_a;
3748 dr_info_a = STMT_VINFO_DR_INFO (stmt_info_a);
3749 }
3750
3751 stmt_vec_info dr_group_first_b = DR_GROUP_FIRST_ELEMENT (stmt_info_b);
3752 if (dr_group_first_b)
3753 {
3754 stmt_info_b = dr_group_first_b;
3755 dr_info_b = STMT_VINFO_DR_INFO (stmt_info_b);
3756 }
3757
3758 if (ignore_step_p)
3759 {
3760 segment_length_a = size_zero_node;
3761 segment_length_b = size_zero_node;
3762 }
3763 else
3764 {
3765 if (!operand_equal_p (DR_STEP (dr_info_a->dr),
3766 DR_STEP (dr_info_b->dr), 0))
3767 length_factor = scalar_loop_iters;
3768 else
3769 length_factor = size_int (vect_factor);
3770 segment_length_a = vect_vfa_segment_size (dr_info_a, length_factor);
3771 segment_length_b = vect_vfa_segment_size (dr_info_b, length_factor);
3772 }
3773 access_size_a = vect_vfa_access_size (loop_vinfo, dr_info_a);
3774 access_size_b = vect_vfa_access_size (loop_vinfo, dr_info_b);
3775 align_a = vect_vfa_align (dr_info_a);
3776 align_b = vect_vfa_align (dr_info_b);
3777
3778 /* See whether the alias is known at compilation time. */
3779 if (operand_equal_p (DR_BASE_ADDRESS (dr_info_a->dr),
3780 DR_BASE_ADDRESS (dr_info_b->dr), 0)
3781 && operand_equal_p (DR_OFFSET (dr_info_a->dr),
3782 DR_OFFSET (dr_info_b->dr), 0)
3783 && TREE_CODE (DR_STEP (dr_info_a->dr)) == INTEGER_CST
3784 && TREE_CODE (DR_STEP (dr_info_b->dr)) == INTEGER_CST
3785 && poly_int_tree_p (segment_length_a)
3786 && poly_int_tree_p (segment_length_b))
3787 {
3788 int res = vect_compile_time_alias (dr_info_a, dr_info_b,
3789 segment_length_a,
3790 segment_length_b,
3791 access_size_a,
3792 access_size_b);
3793 if (res >= 0 && dump_enabled_p ())
3794 {
3795 dump_printf_loc (MSG_NOTE, vect_location,
3796 "can tell at compile time that %T and %T",
3797 DR_REF (dr_info_a->dr), DR_REF (dr_info_b->dr));
3798 if (res == 0)
3799 dump_printf (MSG_NOTE, " do not alias\n");
3800 else
3801 dump_printf (MSG_NOTE, " alias\n");
3802 }
3803
3804 if (res == 0)
3805 continue;
3806
3807 if (res == 1)
3808 return opt_result::failure_at (stmt_info_b->stmt,
3809 "not vectorized:"
3810 " compilation time alias: %G%G",
3811 stmt_info_a->stmt,
3812 stmt_info_b->stmt);
3813 }
3814
3815 dr_with_seg_len dr_a (dr_info_a->dr, segment_length_a,
3816 access_size_a, align_a);
3817 dr_with_seg_len dr_b (dr_info_b->dr, segment_length_b,
3818 access_size_b, align_b);
3819 /* Canonicalize the order to be the one that's needed for accurate
3820 RAW, WAR and WAW flags, in cases where the data references are
3821 well-ordered. The order doesn't really matter otherwise,
3822 but we might as well be consistent. */
3823 if (get_later_stmt (stmt_info_a, stmt_info_b) == stmt_info_a)
3824 std::swap (dr_a, dr_b);
3825
3826 dr_with_seg_len_pair_t dr_with_seg_len_pair
3827 (dr_a, dr_b, (preserves_scalar_order_p
3828 ? dr_with_seg_len_pair_t::WELL_ORDERED
3829 : dr_with_seg_len_pair_t::REORDERED));
3830
3831 comp_alias_ddrs.safe_push (dr_with_seg_len_pair);
3832 }
3833
3834 prune_runtime_alias_test_list (&comp_alias_ddrs, vect_factor);
3835
3836 unsigned int count = (comp_alias_ddrs.length ()
3837 + check_unequal_addrs.length ());
3838
3839 if (count
3840 && (loop_cost_model (LOOP_VINFO_LOOP (loop_vinfo))
3841 == VECT_COST_MODEL_VERY_CHEAP))
3842 return opt_result::failure_at
3843 (vect_location, "would need a runtime alias check\n");
3844
3845 if (dump_enabled_p ())
3846 dump_printf_loc (MSG_NOTE, vect_location,
3847 "improved number of alias checks from %d to %d\n",
3848 may_alias_ddrs.length (), count);
3849 unsigned limit = param_vect_max_version_for_alias_checks;
3850 if (loop_cost_model (LOOP_VINFO_LOOP (loop_vinfo)) == VECT_COST_MODEL_CHEAP)
3851 limit = param_vect_max_version_for_alias_checks * 6 / 10;
3852 if (count > limit)
3853 return opt_result::failure_at
3854 (vect_location,
3855 "number of versioning for alias run-time tests exceeds %d "
3856 "(--param vect-max-version-for-alias-checks)\n", limit);
3857
3858 return opt_result::success ();
3859 }
3860
3861 /* Check whether we can use an internal function for a gather load
3862 or scatter store. READ_P is true for loads and false for stores.
3863 MASKED_P is true if the load or store is conditional. MEMORY_TYPE is
3864 the type of the memory elements being loaded or stored. OFFSET_TYPE
3865 is the type of the offset that is being applied to the invariant
3866 base address. SCALE is the amount by which the offset should
3867 be multiplied *after* it has been converted to address width.
3868
3869 Return true if the function is supported, storing the function id in
3870 *IFN_OUT and the vector type for the offset in *OFFSET_VECTYPE_OUT. */
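/* For example (illustrative): if the caller asks for a gather of 32-bit
   elements with a 16-bit OFFSET_TYPE and the target only supports
   gathers with 32-bit or 64-bit offsets, the loop below retries with the
   offset type widened to 32 and then 64 bits, giving up once the offset
   is at least as wide as both the pointer and the element.  */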
3871
3872 bool
3873 vect_gather_scatter_fn_p (vec_info *vinfo, bool read_p, bool masked_p,
3874 tree vectype, tree memory_type, tree offset_type,
3875 int scale, internal_fn *ifn_out,
3876 tree *offset_vectype_out)
3877 {
3878 unsigned int memory_bits = tree_to_uhwi (TYPE_SIZE (memory_type));
3879 unsigned int element_bits = vector_element_bits (vectype);
3880 if (element_bits != memory_bits)
3881 /* For now the vector elements must be the same width as the
3882 memory elements. */
3883 return false;
3884
3885 /* Work out which function we need. */
3886 internal_fn ifn, alt_ifn;
3887 if (read_p)
3888 {
3889 ifn = masked_p ? IFN_MASK_GATHER_LOAD : IFN_GATHER_LOAD;
3890 alt_ifn = IFN_MASK_GATHER_LOAD;
3891 }
3892 else
3893 {
3894 ifn = masked_p ? IFN_MASK_SCATTER_STORE : IFN_SCATTER_STORE;
3895 alt_ifn = IFN_MASK_SCATTER_STORE;
3896 }
3897
3898 for (;;)
3899 {
3900 tree offset_vectype = get_vectype_for_scalar_type (vinfo, offset_type);
3901 if (!offset_vectype)
3902 return false;
3903
3904 /* Test whether the target supports this combination. */
3905 if (internal_gather_scatter_fn_supported_p (ifn, vectype, memory_type,
3906 offset_vectype, scale))
3907 {
3908 *ifn_out = ifn;
3909 *offset_vectype_out = offset_vectype;
3910 return true;
3911 }
3912 else if (!masked_p
3913 && internal_gather_scatter_fn_supported_p (alt_ifn, vectype,
3914 memory_type,
3915 offset_vectype,
3916 scale))
3917 {
3918 *ifn_out = alt_ifn;
3919 *offset_vectype_out = offset_vectype;
3920 return true;
3921 }
3922
3923 if (TYPE_PRECISION (offset_type) >= POINTER_SIZE
3924 && TYPE_PRECISION (offset_type) >= element_bits)
3925 return false;
3926
3927 offset_type = build_nonstandard_integer_type
3928 (TYPE_PRECISION (offset_type) * 2, TYPE_UNSIGNED (offset_type));
3929 }
3930 }
3931
3932 /* STMT_INFO is a call to an internal gather load or scatter store function.
3933 Describe the operation in INFO. */
3934
3935 static void
3936 vect_describe_gather_scatter_call (stmt_vec_info stmt_info,
3937 gather_scatter_info *info)
3938 {
3939 gcall *call = as_a <gcall *> (stmt_info->stmt);
3940 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3941 data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3942
3943 info->ifn = gimple_call_internal_fn (call);
3944 info->decl = NULL_TREE;
3945 info->base = gimple_call_arg (call, 0);
3946 info->offset = gimple_call_arg (call, 1);
3947 info->offset_dt = vect_unknown_def_type;
3948 info->offset_vectype = NULL_TREE;
3949 info->scale = TREE_INT_CST_LOW (gimple_call_arg (call, 2));
3950 info->element_type = TREE_TYPE (vectype);
3951 info->memory_type = TREE_TYPE (DR_REF (dr));
3952 }
3953
3954 /* Return true if a non-affine read or write in STMT_INFO is suitable for a
3955 gather load or scatter store. Describe the operation in *INFO if so. */
3956
3957 bool
3958 vect_check_gather_scatter (stmt_vec_info stmt_info, loop_vec_info loop_vinfo,
3959 gather_scatter_info *info)
3960 {
3961 HOST_WIDE_INT scale = 1;
3962 poly_int64 pbitpos, pbitsize;
3963 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3964 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3965 tree offtype = NULL_TREE;
3966 tree decl = NULL_TREE, base, off;
3967 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3968 tree memory_type = TREE_TYPE (DR_REF (dr));
3969 machine_mode pmode;
3970 int punsignedp, reversep, pvolatilep = 0;
3971 internal_fn ifn;
3972 tree offset_vectype;
3973 bool masked_p = false;
3974
3975 /* See whether this is already a call to a gather/scatter internal function.
3976 If not, see whether it's a masked load or store. */
3977 gcall *call = dyn_cast <gcall *> (stmt_info->stmt);
3978 if (call && gimple_call_internal_p (call))
3979 {
3980 ifn = gimple_call_internal_fn (call);
3981 if (internal_gather_scatter_fn_p (ifn))
3982 {
3983 vect_describe_gather_scatter_call (stmt_info, info);
3984 return true;
3985 }
3986 masked_p = (ifn == IFN_MASK_LOAD || ifn == IFN_MASK_STORE);
3987 }
3988
3989 /* True if we should aim to use internal functions rather than
3990 built-in functions. */
3991 bool use_ifn_p = (DR_IS_READ (dr)
3992 ? supports_vec_gather_load_p (TYPE_MODE (vectype))
3993 : supports_vec_scatter_store_p (TYPE_MODE (vectype)));
3994
3995 base = DR_REF (dr);
3996 /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
3997 see if we can use the def stmt of the address. */
3998 if (masked_p
3999 && TREE_CODE (base) == MEM_REF
4000 && TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME
4001 && integer_zerop (TREE_OPERAND (base, 1))
4002 && !expr_invariant_in_loop_p (loop, TREE_OPERAND (base, 0)))
4003 {
4004 gimple *def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (base, 0));
4005 if (is_gimple_assign (def_stmt)
4006 && gimple_assign_rhs_code (def_stmt) == ADDR_EXPR)
4007 base = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
4008 }
4009
4010 /* The gather and scatter builtins need an address of the form
4011 loop_invariant + vector * {1, 2, 4, 8}
4012 or
4013 loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
4014 Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
4015 of loop invariants/SSA_NAMEs defined in the loop, with casts,
4016 multiplications and additions in it. To get a vector, we need
4017 a single SSA_NAME that will be defined in the loop and will
4018 contain everything that is not loop invariant and that can be
4019 vectorized. The following code attempts to find such a preexisting
4020 SSA_NAME OFF and put the loop invariants into a tree BASE
4021 that can be gimplified before the loop. */
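/* As an illustrative sketch (made-up names): for a data reference whose
   offset expression is (sizetype) i_7 * 4 + 16, where i_7 is defined in
   the loop, the code below peels the invariant 16 into BASE, records
   SCALE == 4 from the multiplication and leaves OFF == i_7, which is the
   loop_invariant + vector * scale shape described above.  */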
4022 base = get_inner_reference (base, &pbitsize, &pbitpos, &off, &pmode,
4023 &punsignedp, &reversep, &pvolatilep);
4024 if (reversep)
4025 return false;
4026
4027 poly_int64 pbytepos = exact_div (pbitpos, BITS_PER_UNIT);
4028
4029 if (TREE_CODE (base) == MEM_REF)
4030 {
4031 if (!integer_zerop (TREE_OPERAND (base, 1)))
4032 {
4033 if (off == NULL_TREE)
4034 off = wide_int_to_tree (sizetype, mem_ref_offset (base));
4035 else
4036 off = size_binop (PLUS_EXPR, off,
4037 fold_convert (sizetype, TREE_OPERAND (base, 1)));
4038 }
4039 base = TREE_OPERAND (base, 0);
4040 }
4041 else
4042 base = build_fold_addr_expr (base);
4043
4044 if (off == NULL_TREE)
4045 off = size_zero_node;
4046
4047 /* If base is not loop invariant: if off is 0, we start with just
4048 the constant offset in the loop invariant BASE and continue with base
4049 as OFF, otherwise give up.
4050 We could handle the latter case by gimplifying the addition of base + off
4051 into some SSA_NAME and using that as off, but for now punt. */
4052 if (!expr_invariant_in_loop_p (loop, base))
4053 {
4054 if (!integer_zerop (off))
4055 return false;
4056 off = base;
4057 base = size_int (pbytepos);
4058 }
4059 /* Otherwise put base + constant offset into the loop invariant BASE
4060 and continue with OFF. */
4061 else
4062 {
4063 base = fold_convert (sizetype, base);
4064 base = size_binop (PLUS_EXPR, base, size_int (pbytepos));
4065 }
4066
4067 /* OFF at this point may be either a SSA_NAME or some tree expression
4068 from get_inner_reference. Try to peel off loop invariants from it
4069 into BASE as long as possible. */
4070 STRIP_NOPS (off);
4071 while (offtype == NULL_TREE)
4072 {
4073 enum tree_code code;
4074 tree op0, op1, add = NULL_TREE;
4075
4076 if (TREE_CODE (off) == SSA_NAME)
4077 {
4078 gimple *def_stmt = SSA_NAME_DEF_STMT (off);
4079
4080 if (expr_invariant_in_loop_p (loop, off))
4081 return false;
4082
4083 if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
4084 break;
4085
4086 op0 = gimple_assign_rhs1 (def_stmt);
4087 code = gimple_assign_rhs_code (def_stmt);
4088 op1 = gimple_assign_rhs2 (def_stmt);
4089 }
4090 else
4091 {
4092 if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
4093 return false;
4094 code = TREE_CODE (off);
4095 extract_ops_from_tree (off, &code, &op0, &op1);
4096 }
4097 switch (code)
4098 {
4099 case POINTER_PLUS_EXPR:
4100 case PLUS_EXPR:
4101 if (expr_invariant_in_loop_p (loop, op0))
4102 {
4103 add = op0;
4104 off = op1;
4105 do_add:
4106 add = fold_convert (sizetype, add);
4107 if (scale != 1)
4108 add = size_binop (MULT_EXPR, add, size_int (scale));
4109 base = size_binop (PLUS_EXPR, base, add);
4110 continue;
4111 }
4112 if (expr_invariant_in_loop_p (loop, op1))
4113 {
4114 add = op1;
4115 off = op0;
4116 goto do_add;
4117 }
4118 break;
4119 case MINUS_EXPR:
4120 if (expr_invariant_in_loop_p (loop, op1))
4121 {
4122 add = fold_convert (sizetype, op1);
4123 add = size_binop (MINUS_EXPR, size_zero_node, add);
4124 off = op0;
4125 goto do_add;
4126 }
4127 break;
4128 case MULT_EXPR:
4129 if (scale == 1 && tree_fits_shwi_p (op1))
4130 {
4131 int new_scale = tree_to_shwi (op1);
4132 /* Only treat this as a scaling operation if the target
4133 supports it for at least some offset type. */
4134 if (use_ifn_p
4135 && !vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr),
4136 masked_p, vectype, memory_type,
4137 signed_char_type_node,
4138 new_scale, &ifn,
4139 &offset_vectype)
4140 && !vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr),
4141 masked_p, vectype, memory_type,
4142 unsigned_char_type_node,
4143 new_scale, &ifn,
4144 &offset_vectype))
4145 break;
4146 scale = new_scale;
4147 off = op0;
4148 continue;
4149 }
4150 break;
4151 case SSA_NAME:
4152 off = op0;
4153 continue;
4154 CASE_CONVERT:
4155 if (!POINTER_TYPE_P (TREE_TYPE (op0))
4156 && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
4157 break;
4158
4159 /* Don't include the conversion if the target is happy with
4160 the current offset type. */
4161 if (use_ifn_p
4162 && !POINTER_TYPE_P (TREE_TYPE (off))
4163 && vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr),
4164 masked_p, vectype, memory_type,
4165 TREE_TYPE (off), scale, &ifn,
4166 &offset_vectype))
4167 break;
4168
4169 if (TYPE_PRECISION (TREE_TYPE (op0))
4170 == TYPE_PRECISION (TREE_TYPE (off)))
4171 {
4172 off = op0;
4173 continue;
4174 }
4175
4176 /* Include the conversion if it is widening and either we're using
4177 the IFN path, the target can handle the offset type we are
4178 converting from, or the current offset size is not already the
4179 same as the data vector element size. */
4180 if ((TYPE_PRECISION (TREE_TYPE (op0))
4181 < TYPE_PRECISION (TREE_TYPE (off)))
4182 && (use_ifn_p
4183 || (DR_IS_READ (dr)
4184 ? (targetm.vectorize.builtin_gather
4185 && targetm.vectorize.builtin_gather (vectype,
4186 TREE_TYPE (op0),
4187 scale))
4188 : (targetm.vectorize.builtin_scatter
4189 && targetm.vectorize.builtin_scatter (vectype,
4190 TREE_TYPE (op0),
4191 scale)))
4192 || !operand_equal_p (TYPE_SIZE (TREE_TYPE (off)),
4193 TYPE_SIZE (TREE_TYPE (vectype)), 0)))
4194 {
4195 off = op0;
4196 offtype = TREE_TYPE (off);
4197 STRIP_NOPS (off);
4198 continue;
4199 }
4200 break;
4201 default:
4202 break;
4203 }
4204 break;
4205 }
4206
4207 /* If at the end OFF still isn't a SSA_NAME or isn't
4208 defined in the loop, punt. */
4209 if (TREE_CODE (off) != SSA_NAME
4210 || expr_invariant_in_loop_p (loop, off))
4211 return false;
4212
4213 if (offtype == NULL_TREE)
4214 offtype = TREE_TYPE (off);
4215
4216 if (use_ifn_p)
4217 {
4218 if (!vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr), masked_p,
4219 vectype, memory_type, offtype, scale,
4220 &ifn, &offset_vectype))
4221 ifn = IFN_LAST;
4222 decl = NULL_TREE;
4223 }
4224 else
4225 {
4226 if (DR_IS_READ (dr))
4227 {
4228 if (targetm.vectorize.builtin_gather)
4229 decl = targetm.vectorize.builtin_gather (vectype, offtype, scale);
4230 }
4231 else
4232 {
4233 if (targetm.vectorize.builtin_scatter)
4234 decl = targetm.vectorize.builtin_scatter (vectype, offtype, scale);
4235 }
4236 ifn = IFN_LAST;
4237 /* The offset vector type will be read from DECL when needed. */
4238 offset_vectype = NULL_TREE;
4239 }
4240
4241 info->ifn = ifn;
4242 info->decl = decl;
4243 info->base = base;
4244 info->offset = off;
4245 info->offset_dt = vect_unknown_def_type;
4246 info->offset_vectype = offset_vectype;
4247 info->scale = scale;
4248 info->element_type = TREE_TYPE (vectype);
4249 info->memory_type = memory_type;
4250 return true;
4251 }
4252
4253 /* Find the data references in STMT, analyze them with respect to LOOP and
4254 append them to DATAREFS. Return an opt_result failure if the datarefs
4255 in this stmt cannot be handled. */
4256
4257 opt_result
4258 vect_find_stmt_data_reference (loop_p loop, gimple *stmt,
4259 vec<data_reference_p> *datarefs,
4260 vec<int> *dataref_groups, int group_id)
4261 {
4262 /* We can ignore clobbers for dataref analysis - they are removed during
4263 loop vectorization and BB vectorization checks dependences with a
4264 stmt walk. */
4265 if (gimple_clobber_p (stmt))
4266 return opt_result::success ();
4267
4268 if (gimple_has_volatile_ops (stmt))
4269 return opt_result::failure_at (stmt, "not vectorized: volatile type: %G",
4270 stmt);
4271
4272 if (stmt_can_throw_internal (cfun, stmt))
4273 return opt_result::failure_at (stmt,
4274 "not vectorized:"
4275 " statement can throw an exception: %G",
4276 stmt);
4277
4278 auto_vec<data_reference_p, 2> refs;
4279 opt_result res = find_data_references_in_stmt (loop, stmt, &refs);
4280 if (!res)
4281 return res;
4282
4283 if (refs.is_empty ())
4284 return opt_result::success ();
4285
4286 if (refs.length () > 1)
4287 {
4288 while (!refs.is_empty ())
4289 free_data_ref (refs.pop ());
4290 return opt_result::failure_at (stmt,
4291 "not vectorized: more than one "
4292 "data ref in stmt: %G", stmt);
4293 }
4294
4295 data_reference_p dr = refs.pop ();
4296 if (gcall *call = dyn_cast <gcall *> (stmt))
4297 if (!gimple_call_internal_p (call)
4298 || (gimple_call_internal_fn (call) != IFN_MASK_LOAD
4299 && gimple_call_internal_fn (call) != IFN_MASK_STORE))
4300 {
4301 free_data_ref (dr);
4302 return opt_result::failure_at (stmt,
4303 "not vectorized: dr in a call %G", stmt);
4304 }
4305
4306 if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
4307 && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
4308 {
4309 free_data_ref (dr);
4310 return opt_result::failure_at (stmt,
4311 "not vectorized:"
4312 " statement is bitfield access %G", stmt);
4313 }
4314
4315 if (DR_BASE_ADDRESS (dr)
4316 && TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST)
4317 {
4318 free_data_ref (dr);
4319 return opt_result::failure_at (stmt,
4320 "not vectorized:"
4321 " base addr of dr is a constant\n");
4322 }
4323
4324 /* Check whether this may be a SIMD lane access and adjust the
4325 DR to make it easier for us to handle it. */
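/* For example (sketch of the GIMPLE involved): a store to D.1234[_5]
   where _5 = .GOMP_SIMD_LANE (simduid.0_3, ...) has no constant DR_STEP
   when analyzed directly; the code below rewrites the new data reference
   to use the array base, a zero offset and a step equal to the element
   size, and marks it as a SIMD lane access via newdr->aux.  */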
4326 if (loop
4327 && loop->simduid
4328 && (!DR_BASE_ADDRESS (dr)
4329 || !DR_OFFSET (dr)
4330 || !DR_INIT (dr)
4331 || !DR_STEP (dr)))
4332 {
4333 struct data_reference *newdr
4334 = create_data_ref (NULL, loop_containing_stmt (stmt), DR_REF (dr), stmt,
4335 DR_IS_READ (dr), DR_IS_CONDITIONAL_IN_STMT (dr));
4336 if (DR_BASE_ADDRESS (newdr)
4337 && DR_OFFSET (newdr)
4338 && DR_INIT (newdr)
4339 && DR_STEP (newdr)
4340 && TREE_CODE (DR_INIT (newdr)) == INTEGER_CST
4341 && integer_zerop (DR_STEP (newdr)))
4342 {
4343 tree base_address = DR_BASE_ADDRESS (newdr);
4344 tree off = DR_OFFSET (newdr);
4345 tree step = ssize_int (1);
4346 if (integer_zerop (off)
4347 && TREE_CODE (base_address) == POINTER_PLUS_EXPR)
4348 {
4349 off = TREE_OPERAND (base_address, 1);
4350 base_address = TREE_OPERAND (base_address, 0);
4351 }
4352 STRIP_NOPS (off);
4353 if (TREE_CODE (off) == MULT_EXPR
4354 && tree_fits_uhwi_p (TREE_OPERAND (off, 1)))
4355 {
4356 step = TREE_OPERAND (off, 1);
4357 off = TREE_OPERAND (off, 0);
4358 STRIP_NOPS (off);
4359 }
4360 if (CONVERT_EXPR_P (off)
4361 && (TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off, 0)))
4362 < TYPE_PRECISION (TREE_TYPE (off))))
4363 off = TREE_OPERAND (off, 0);
4364 if (TREE_CODE (off) == SSA_NAME)
4365 {
4366 gimple *def = SSA_NAME_DEF_STMT (off);
4367 /* Look through widening conversion. */
4368 if (is_gimple_assign (def)
4369 && CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (def)))
4370 {
4371 tree rhs1 = gimple_assign_rhs1 (def);
4372 if (TREE_CODE (rhs1) == SSA_NAME
4373 && INTEGRAL_TYPE_P (TREE_TYPE (rhs1))
4374 && (TYPE_PRECISION (TREE_TYPE (off))
4375 > TYPE_PRECISION (TREE_TYPE (rhs1))))
4376 def = SSA_NAME_DEF_STMT (rhs1);
4377 }
4378 if (is_gimple_call (def)
4379 && gimple_call_internal_p (def)
4380 && (gimple_call_internal_fn (def) == IFN_GOMP_SIMD_LANE))
4381 {
4382 tree arg = gimple_call_arg (def, 0);
4383 tree reft = TREE_TYPE (DR_REF (newdr));
4384 gcc_assert (TREE_CODE (arg) == SSA_NAME);
4385 arg = SSA_NAME_VAR (arg);
4386 if (arg == loop->simduid
4387 /* For now. */
4388 && tree_int_cst_equal (TYPE_SIZE_UNIT (reft), step))
4389 {
4390 DR_BASE_ADDRESS (newdr) = base_address;
4391 DR_OFFSET (newdr) = ssize_int (0);
4392 DR_STEP (newdr) = step;
4393 DR_OFFSET_ALIGNMENT (newdr) = BIGGEST_ALIGNMENT;
4394 DR_STEP_ALIGNMENT (newdr) = highest_pow2_factor (step);
4395 /* Mark as simd-lane access. */
4396 tree arg2 = gimple_call_arg (def, 1);
4397 newdr->aux = (void *) (-1 - tree_to_uhwi (arg2));
4398 free_data_ref (dr);
4399 datarefs->safe_push (newdr);
4400 if (dataref_groups)
4401 dataref_groups->safe_push (group_id);
4402 return opt_result::success ();
4403 }
4404 }
4405 }
4406 }
4407 free_data_ref (newdr);
4408 }
4409
4410 datarefs->safe_push (dr);
4411 if (dataref_groups)
4412 dataref_groups->safe_push (group_id);
4413 return opt_result::success ();
4414 }
4415
4416 /* Function vect_analyze_data_refs.
4417
4418 Find all the data references in the loop or basic block.
4419
4420 The general structure of the analysis of data refs in the vectorizer is as
4421 follows:
4422 1- vect_analyze_data_refs(loop/bb): call
4423 compute_data_dependences_for_loop/bb to find and analyze all data-refs
4424 in the loop/bb and their dependences.
4425 2- vect_analyze_dependences(): apply dependence testing using ddrs.
4426 3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
4427 4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
4428
4429 */
4430
4431 opt_result
4432 vect_analyze_data_refs (vec_info *vinfo, poly_uint64 *min_vf, bool *fatal)
4433 {
4434 class loop *loop = NULL;
4435 unsigned int i;
4436 struct data_reference *dr;
4437 tree scalar_type;
4438
4439 DUMP_VECT_SCOPE ("vect_analyze_data_refs");
4440
4441 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
4442 loop = LOOP_VINFO_LOOP (loop_vinfo);
4443
4444 /* Go through the data-refs, check that the analysis succeeded. Update
4445 the pointer from the stmt_vec_info struct to the DR and set the vectype. */
4446
4447 vec<data_reference_p> datarefs = vinfo->shared->datarefs;
4448 FOR_EACH_VEC_ELT (datarefs, i, dr)
4449 {
4450 enum { SG_NONE, GATHER, SCATTER } gatherscatter = SG_NONE;
4451 poly_uint64 vf;
4452
4453 gcc_assert (DR_REF (dr));
4454 stmt_vec_info stmt_info = vinfo->lookup_stmt (DR_STMT (dr));
4455 gcc_assert (!stmt_info->dr_aux.dr);
4456 stmt_info->dr_aux.dr = dr;
4457 stmt_info->dr_aux.stmt = stmt_info;
4458
4459 /* Check that analysis of the data-ref succeeded. */
4460 if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
4461 || !DR_STEP (dr))
4462 {
4463 bool maybe_gather
4464 = DR_IS_READ (dr)
4465 && !TREE_THIS_VOLATILE (DR_REF (dr));
4466 bool maybe_scatter
4467 = DR_IS_WRITE (dr)
4468 && !TREE_THIS_VOLATILE (DR_REF (dr))
4469 && (targetm.vectorize.builtin_scatter != NULL
4470 || supports_vec_scatter_store_p ());
4471
4472 /* If the target supports vector gather loads or scatter stores,
4473 see if they can be used. */
4474 if (is_a <loop_vec_info> (vinfo)
4475 && !nested_in_vect_loop_p (loop, stmt_info))
4476 {
4477 if (maybe_gather || maybe_scatter)
4478 {
4479 if (maybe_gather)
4480 gatherscatter = GATHER;
4481 else
4482 gatherscatter = SCATTER;
4483 }
4484 }
4485
4486 if (gatherscatter == SG_NONE)
4487 {
4488 if (dump_enabled_p ())
4489 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4490 "not vectorized: data ref analysis "
4491 "failed %G", stmt_info->stmt);
4492 if (is_a <bb_vec_info> (vinfo))
4493 {
4494 /* In BB vectorization the ref can still participate
4495 in dependence analysis, we just can't vectorize it. */
4496 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4497 continue;
4498 }
4499 return opt_result::failure_at (stmt_info->stmt,
4500 "not vectorized:"
4501 " data ref analysis failed: %G",
4502 stmt_info->stmt);
4503 }
4504 }
4505
4506 /* See if this was detected as SIMD lane access. */
4507 if (dr->aux == (void *)-1
4508 || dr->aux == (void *)-2
4509 || dr->aux == (void *)-3
4510 || dr->aux == (void *)-4)
4511 {
4512 if (nested_in_vect_loop_p (loop, stmt_info))
4513 return opt_result::failure_at (stmt_info->stmt,
4514 "not vectorized:"
4515 " data ref analysis failed: %G",
4516 stmt_info->stmt);
4517 STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info)
4518 = -(uintptr_t) dr->aux;
4519 }
4520
4521 tree base = get_base_address (DR_REF (dr));
4522 if (base && VAR_P (base) && DECL_NONALIASED (base))
4523 {
4524 if (dump_enabled_p ())
4525 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4526 "not vectorized: base object not addressable "
4527 "for stmt: %G", stmt_info->stmt);
4528 if (is_a <bb_vec_info> (vinfo))
4529 {
4530 /* In BB vectorization the ref can still participate
4531 in dependence analysis, we just can't vectorize it. */
4532 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4533 continue;
4534 }
4535 return opt_result::failure_at (stmt_info->stmt,
4536 "not vectorized: base object not"
4537 " addressable for stmt: %G",
4538 stmt_info->stmt);
4539 }
4540
4541 if (is_a <loop_vec_info> (vinfo)
4542 && DR_STEP (dr)
4543 && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
4544 {
4545 if (nested_in_vect_loop_p (loop, stmt_info))
4546 return opt_result::failure_at (stmt_info->stmt,
4547 "not vectorized: "
4548 "not suitable for strided load %G",
4549 stmt_info->stmt);
4550 STMT_VINFO_STRIDED_P (stmt_info) = true;
4551 }
4552
4553 /* Update DR field in stmt_vec_info struct. */
4554
4555 /* If the dataref is in an inner-loop of the loop that is considered
4556 for vectorization, we also want to analyze the access relative to
4557 the outer-loop (DR contains information only relative to the
4558 inner-most enclosing loop). We do that by building a reference to the
4559 first location accessed by the inner-loop, and analyze it relative to
4560 the outer-loop. */
4561 if (loop && nested_in_vect_loop_p (loop, stmt_info))
4562 {
4563 /* Build a reference to the first location accessed by the
4564 inner loop: *(BASE + INIT + OFFSET). By construction,
4565 this address must be invariant in the inner loop, so we
4566 can consider it as being used in the outer loop. */
4567 tree base = unshare_expr (DR_BASE_ADDRESS (dr));
4568 tree offset = unshare_expr (DR_OFFSET (dr));
4569 tree init = unshare_expr (DR_INIT (dr));
4570 tree init_offset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset),
4571 init, offset);
4572 tree init_addr = fold_build_pointer_plus (base, init_offset);
4573 tree init_ref = build_fold_indirect_ref (init_addr);
4574
4575 if (dump_enabled_p ())
4576 dump_printf_loc (MSG_NOTE, vect_location,
4577 "analyze in outer loop: %T\n", init_ref);
4578
4579 opt_result res
4580 = dr_analyze_innermost (&STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info),
4581 init_ref, loop, stmt_info->stmt);
4582 if (!res)
4583 /* dr_analyze_innermost already explained the failure. */
4584 return res;
4585
4586 if (dump_enabled_p ())
4587 dump_printf_loc (MSG_NOTE, vect_location,
4588 "\touter base_address: %T\n"
4589 "\touter offset from base address: %T\n"
4590 "\touter constant offset from base address: %T\n"
4591 "\touter step: %T\n"
4592 "\touter base alignment: %d\n\n"
4593 "\touter base misalignment: %d\n"
4594 "\touter offset alignment: %d\n"
4595 "\touter step alignment: %d\n",
4596 STMT_VINFO_DR_BASE_ADDRESS (stmt_info),
4597 STMT_VINFO_DR_OFFSET (stmt_info),
4598 STMT_VINFO_DR_INIT (stmt_info),
4599 STMT_VINFO_DR_STEP (stmt_info),
4600 STMT_VINFO_DR_BASE_ALIGNMENT (stmt_info),
4601 STMT_VINFO_DR_BASE_MISALIGNMENT (stmt_info),
4602 STMT_VINFO_DR_OFFSET_ALIGNMENT (stmt_info),
4603 STMT_VINFO_DR_STEP_ALIGNMENT (stmt_info));
4604 }
4605
4606 /* Set vectype for STMT. */
4607 scalar_type = TREE_TYPE (DR_REF (dr));
4608 tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type);
4609 if (!vectype)
4610 {
4611 if (dump_enabled_p ())
4612 {
4613 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4614 "not vectorized: no vectype for stmt: %G",
4615 stmt_info->stmt);
4616 dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: ");
4617 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS,
4618 scalar_type);
4619 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
4620 }
4621
4622 if (is_a <bb_vec_info> (vinfo))
4623 {
4624 /* No vector type is fine, the ref can still participate
4625 in dependence analysis, we just can't vectorize it. */
4626 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4627 continue;
4628 }
4629 if (fatal)
4630 *fatal = false;
4631 return opt_result::failure_at (stmt_info->stmt,
4632 "not vectorized:"
4633 " no vectype for stmt: %G"
4634 " scalar_type: %T\n",
4635 stmt_info->stmt, scalar_type);
4636 }
4637 else
4638 {
4639 if (dump_enabled_p ())
4640 dump_printf_loc (MSG_NOTE, vect_location,
4641 "got vectype for stmt: %G%T\n",
4642 stmt_info->stmt, vectype);
4643 }
4644
4645 /* Adjust the minimal vectorization factor according to the
4646 vector type. */
4647 vf = TYPE_VECTOR_SUBPARTS (vectype);
4648 *min_vf = upper_bound (*min_vf, vf);
4649
4650 /* Leave the BB vectorizer to pick the vector type later, based on
4651 the final dataref group size and SLP node size. */
4652 if (is_a <loop_vec_info> (vinfo))
4653 STMT_VINFO_VECTYPE (stmt_info) = vectype;
4654
4655 if (gatherscatter != SG_NONE)
4656 {
4657 gather_scatter_info gs_info;
4658 if (!vect_check_gather_scatter (stmt_info,
4659 as_a <loop_vec_info> (vinfo),
4660 &gs_info)
4661 || !get_vectype_for_scalar_type (vinfo,
4662 TREE_TYPE (gs_info.offset)))
4663 {
4664 if (fatal)
4665 *fatal = false;
4666 return opt_result::failure_at
4667 (stmt_info->stmt,
4668 (gatherscatter == GATHER)
4669 ? "not vectorized: not suitable for gather load %G"
4670 : "not vectorized: not suitable for scatter store %G",
4671 stmt_info->stmt);
4672 }
4673 STMT_VINFO_GATHER_SCATTER_P (stmt_info) = gatherscatter;
4674 }
4675 }
4676
4677 /* We used to stop processing and prune the list here. Verify we no
4678 longer need to. */
4679 gcc_assert (i == datarefs.length ());
4680
4681 return opt_result::success ();
4682 }
4683
4684
4685 /* Function vect_get_new_vect_var.
4686
4687 Returns a new temporary variable. The current naming scheme prepends
4688 the prefix "vect", "stmp", "mask" or "vectp" (depending on the value
4689 of VAR_KIND) to NAME, separated by an underscore, if NAME is
4690 provided, and uses the bare prefix otherwise. */
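/* For example (hypothetical call): vect_get_new_vect_var (ptr_type,
   vect_pointer_var, "x") creates a temporary register named along the
   lines of "vectp_x", while passing NULL for NAME yields just the
   "vectp" prefix.  */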
4691
4692 tree
4693 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
4694 {
4695 const char *prefix;
4696 tree new_vect_var;
4697
4698 switch (var_kind)
4699 {
4700 case vect_simple_var:
4701 prefix = "vect";
4702 break;
4703 case vect_scalar_var:
4704 prefix = "stmp";
4705 break;
4706 case vect_mask_var:
4707 prefix = "mask";
4708 break;
4709 case vect_pointer_var:
4710 prefix = "vectp";
4711 break;
4712 default:
4713 gcc_unreachable ();
4714 }
4715
4716 if (name)
4717 {
4718 char* tmp = concat (prefix, "_", name, NULL);
4719 new_vect_var = create_tmp_reg (type, tmp);
4720 free (tmp);
4721 }
4722 else
4723 new_vect_var = create_tmp_reg (type, prefix);
4724
4725 return new_vect_var;
4726 }
4727
4728 /* Like vect_get_new_vect_var but return an SSA name. */
4729
4730 tree
4731 vect_get_new_ssa_name (tree type, enum vect_var_kind var_kind, const char *name)
4732 {
4733 const char *prefix;
4734 tree new_vect_var;
4735
4736 switch (var_kind)
4737 {
4738 case vect_simple_var:
4739 prefix = "vect";
4740 break;
4741 case vect_scalar_var:
4742 prefix = "stmp";
4743 break;
4744 case vect_pointer_var:
4745 prefix = "vectp";
4746 break;
4747 default:
4748 gcc_unreachable ();
4749 }
4750
4751 if (name)
4752 {
4753 char* tmp = concat (prefix, "_", name, NULL);
4754 new_vect_var = make_temp_ssa_name (type, NULL, tmp);
4755 free (tmp);
4756 }
4757 else
4758 new_vect_var = make_temp_ssa_name (type, NULL, prefix);
4759
4760 return new_vect_var;
4761 }
4762
4763 /* Duplicate points-to info on NAME from DR_INFO. */
4764
4765 static void
4766 vect_duplicate_ssa_name_ptr_info (tree name, dr_vec_info *dr_info)
4767 {
4768 duplicate_ssa_name_ptr_info (name, DR_PTR_INFO (dr_info->dr));
4769 /* DR_PTR_INFO is for a base SSA name, not including constant or
4770 variable offsets in the ref so its alignment info does not apply. */
4771 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name));
4772 }
4773
4774 /* Function vect_create_addr_base_for_vector_ref.
4775
4776 Create an expression that computes the address of the first memory location
4777 that will be accessed for a data reference.
4778
4779 Input:
4780 STMT_INFO: The statement containing the data reference.
4781 NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
4782 OFFSET: Optional. If supplied, it is added to the initial address.
4783 LOOP: Specify the loop-nest relative to which the address should be computed.
4784 For example, when the dataref is in an inner-loop nested in an
4785 outer-loop that is now being vectorized, LOOP can be either the
4786 outer-loop, or the inner-loop. The first memory location accessed
4787 by the following dataref ('in' points to short):
4788
4789 for (i=0; i<N; i++)
4790 for (j=0; j<M; j++)
4791 s += in[i+j]
4792
4793 is as follows:
4794 if LOOP=i_loop: &in (relative to i_loop)
4795 if LOOP=j_loop: &in+i*2B (relative to j_loop)
4796
4797 Output:
4798 1. Return an SSA_NAME whose value is the address of the memory location of
4799 the first vector of the data reference.
4800 2. If new_stmt_list is not NULL_TREE after return then the caller must insert
4801 these statement(s) which define the returned SSA_NAME.
4802
4803 FORNOW: We are only handling array accesses with step 1. */
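/* For example (illustrative): for a dataref a[i] in the loop being
   vectorized, with no extra OFFSET, the returned SSA_NAME evaluates to
   &a[0]; if an OFFSET of 16 bytes is supplied, it evaluates to
   &a[0] + 16.  */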
4804
4805 tree
4806 vect_create_addr_base_for_vector_ref (vec_info *vinfo, stmt_vec_info stmt_info,
4807 gimple_seq *new_stmt_list,
4808 tree offset)
4809 {
4810 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
4811 struct data_reference *dr = dr_info->dr;
4812 const char *base_name;
4813 tree addr_base;
4814 tree dest;
4815 gimple_seq seq = NULL;
4816 tree vect_ptr_type;
4817 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
4818 innermost_loop_behavior *drb = vect_dr_behavior (vinfo, dr_info);
4819
4820 tree data_ref_base = unshare_expr (drb->base_address);
4821 tree base_offset = unshare_expr (get_dr_vinfo_offset (vinfo, dr_info, true));
4822 tree init = unshare_expr (drb->init);
4823
4824 if (loop_vinfo)
4825 base_name = get_name (data_ref_base);
4826 else
4827 {
4828 base_offset = ssize_int (0);
4829 init = ssize_int (0);
4830 base_name = get_name (DR_REF (dr));
4831 }
4832
4833 /* Create base_offset */
4834 base_offset = size_binop (PLUS_EXPR,
4835 fold_convert (sizetype, base_offset),
4836 fold_convert (sizetype, init));
4837
4838 if (offset)
4839 {
4840 offset = fold_convert (sizetype, offset);
4841 base_offset = fold_build2 (PLUS_EXPR, sizetype,
4842 base_offset, offset);
4843 }
4844
4845 /* base + base_offset */
4846 if (loop_vinfo)
4847 addr_base = fold_build_pointer_plus (data_ref_base, base_offset);
4848 else
4849 addr_base = build1 (ADDR_EXPR,
4850 build_pointer_type (TREE_TYPE (DR_REF (dr))),
4851 /* Strip zero offset components since we don't need
4852 them and they can confuse late diagnostics if
4853 we CSE them wrongly. See PR106904 for example. */
4854 unshare_expr (strip_zero_offset_components
4855 (DR_REF (dr))));
4856
4857 vect_ptr_type = build_pointer_type (TREE_TYPE (DR_REF (dr)));
4858 dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name);
4859 addr_base = force_gimple_operand (addr_base, &seq, true, dest);
4860 gimple_seq_add_seq (new_stmt_list, seq);
4861
4862 if (DR_PTR_INFO (dr)
4863 && TREE_CODE (addr_base) == SSA_NAME
4864 /* We should only duplicate pointer info to newly created SSA names. */
4865 && SSA_NAME_VAR (addr_base) == dest)
4866 {
4867 gcc_assert (!SSA_NAME_PTR_INFO (addr_base));
4868 vect_duplicate_ssa_name_ptr_info (addr_base, dr_info);
4869 }
4870
4871 if (dump_enabled_p ())
4872 dump_printf_loc (MSG_NOTE, vect_location, "created %T\n", addr_base);
4873
4874 return addr_base;
4875 }
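
/* An illustrative, standalone C sketch (not part of GCC) of the address
   arithmetic performed by vect_create_addr_base_for_vector_ref above.  The
   names below are hypothetical; the real routine works on trees and emits
   the computation as gimple via force_gimple_operand.

     #include <stddef.h>

     // Model: first address = base_address + (base_offset + init) [+ OFFSET].
     static void *
     addr_base_model (char *base_address, size_t base_offset,
                      size_t init, size_t offset)
     {
       // OFFSET defaults to 0 when the optional argument is not supplied.
       size_t byte_offset = base_offset + init + offset;
       return base_address + byte_offset;
     }

   For the `in[i+j]' example above, base_address is `&in', init is 0 and,
   relative to the j-loop, base_offset is i * 2 bytes.  */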
4876
4877
4878 /* Function vect_create_data_ref_ptr.
4879
4880 Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
4881 location accessed in the loop by STMT_INFO, along with the def-use update
4882 chain to appropriately advance the pointer through the loop iterations.
4883 Also set aliasing information for the pointer. This pointer is used by
4884 the callers to this function to create a memory reference expression for
4885 vector load/store access.
4886
4887 Input:
4888 1. STMT_INFO: a stmt that references memory. Expected to be of the form
4889 GIMPLE_ASSIGN <name, data-ref> or
4890 GIMPLE_ASSIGN <data-ref, name>.
4891 2. AGGR_TYPE: the type of the reference, which should be either a vector
4892 or an array.
4893 3. AT_LOOP: the loop where the vector memref is to be created.
4894 4. OFFSET (optional): a byte offset to be added to the initial address
4895 accessed by the data-ref in STMT_INFO.
4896 5. BSI: location where the new stmts are to be placed if there is no loop.
4897 6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
4898 pointing to the initial address.
4899 7. IV_STEP (optional, defaults to NULL): the amount that should be added
4900 to the IV during each iteration of the loop. NULL says to move
4901 by one copy of AGGR_TYPE up or down, depending on the step of the
4902 data reference.
4903
4904 Output:
4905 1. Declare a new ptr to vector_type, and have it point to the base of the
4906 data reference (initial address accessed by the data reference).
4907 For example, for vector of type V8HI, the following code is generated:
4908
4909 v8hi *ap;
4910 ap = (v8hi *)initial_address;
4911
4912 if OFFSET is not supplied:
4913 initial_address = &a[init];
4914 if OFFSET is supplied:
4915 initial_address = &a[init] + OFFSET;
4918
4919 Return the initial_address in INITIAL_ADDRESS.
4920
4921 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also
4922 update the pointer in each iteration of the loop.
4923
4924 Return the increment stmt that updates the pointer in PTR_INCR.
4925
4926 3. Return the pointer. */
4927
4928 tree
4929 vect_create_data_ref_ptr (vec_info *vinfo, stmt_vec_info stmt_info,
4930 tree aggr_type, class loop *at_loop, tree offset,
4931 tree *initial_address, gimple_stmt_iterator *gsi,
4932 gimple **ptr_incr, bool only_init,
4933 tree iv_step)
4934 {
4935 const char *base_name;
4936 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
4937 class loop *loop = NULL;
4938 bool nested_in_vect_loop = false;
4939 class loop *containing_loop = NULL;
4940 tree aggr_ptr_type;
4941 tree aggr_ptr;
4942 tree new_temp;
4943 gimple_seq new_stmt_list = NULL;
4944 edge pe = NULL;
4945 basic_block new_bb;
4946 tree aggr_ptr_init;
4947 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
4948 struct data_reference *dr = dr_info->dr;
4949 tree aptr;
4950 gimple_stmt_iterator incr_gsi;
4951 bool insert_after;
4952 tree indx_before_incr, indx_after_incr;
4953 gimple *incr;
4954 bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo);
4955
4956 gcc_assert (iv_step != NULL_TREE
4957 || TREE_CODE (aggr_type) == ARRAY_TYPE
4958 || TREE_CODE (aggr_type) == VECTOR_TYPE);
4959
4960 if (loop_vinfo)
4961 {
4962 loop = LOOP_VINFO_LOOP (loop_vinfo);
4963 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt_info);
4964 containing_loop = (gimple_bb (stmt_info->stmt))->loop_father;
4965 pe = loop_preheader_edge (loop);
4966 }
4967 else
4968 {
4969 gcc_assert (bb_vinfo);
4970 only_init = true;
4971 *ptr_incr = NULL;
4972 }
4973
4974 /* Create an expression for the first address accessed by this load
4975 in LOOP. */
4976 base_name = get_name (DR_BASE_ADDRESS (dr));
4977
4978 if (dump_enabled_p ())
4979 {
4980 tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
4981 dump_printf_loc (MSG_NOTE, vect_location,
4982 "create %s-pointer variable to type: %T",
4983 get_tree_code_name (TREE_CODE (aggr_type)),
4984 aggr_type);
4985 if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
4986 dump_printf (MSG_NOTE, " vectorizing an array ref: ");
4987 else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)
4988 dump_printf (MSG_NOTE, " vectorizing a vector ref: ");
4989 else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
4990 dump_printf (MSG_NOTE, " vectorizing a record based array ref: ");
4991 else
4992 dump_printf (MSG_NOTE, " vectorizing a pointer ref: ");
4993 dump_printf (MSG_NOTE, "%T\n", DR_BASE_OBJECT (dr));
4994 }
4995
4996 /* (1) Create the new aggregate-pointer variable.
4997 Vector and array types inherit the alias set of their component
4998 type by default so we need to use a ref-all pointer if the data
4999 reference does not conflict with the created aggregated data
5000 reference because it is not addressable. */
5001 bool need_ref_all = false;
5002 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
5003 get_alias_set (DR_REF (dr))))
5004 need_ref_all = true;
5005 /* Likewise for any of the data references in the stmt group. */
5006 else if (DR_GROUP_SIZE (stmt_info) > 1)
5007 {
5008 stmt_vec_info sinfo = DR_GROUP_FIRST_ELEMENT (stmt_info);
5009 do
5010 {
5011 struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo);
5012 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
5013 get_alias_set (DR_REF (sdr))))
5014 {
5015 need_ref_all = true;
5016 break;
5017 }
5018 sinfo = DR_GROUP_NEXT_ELEMENT (sinfo);
5019 }
5020 while (sinfo);
5021 }
5022 aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode,
5023 need_ref_all);
5024 aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name);
5025
5026
5027 /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
5028 vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
5029 def-use update cycles for the pointer: one relative to the outer-loop
5030 (LOOP), which is what steps (2) and (3) below do. The other is relative
5031 to the inner-loop (which is the inner-most loop containing the dataref),
5032 and this is done by step (4) below.
5033 
5034 When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
5035 inner-most loop, and so steps (2),(3) work the same, and step (4) is
5036 redundant. Steps (2),(3) create the following:
5037
5038 vp0 = &base_addr;
5039 LOOP: vp1 = phi(vp0,vp2)
5040 ...
5041 ...
5042 vp2 = vp1 + step
5043 goto LOOP
5044
5045 If there is an inner-loop nested in LOOP, then step (4) will also be
5046 applied, and an additional update in the inner-loop will be created:
5047
5048 vp0 = &base_addr;
5049 LOOP: vp1 = phi(vp0,vp2)
5050 ...
5051 inner: vp3 = phi(vp1,vp4)
5052 vp4 = vp3 + inner_step
5053 if () goto inner
5054 ...
5055 vp2 = vp1 + step
5056 if () goto LOOP */
5057
5058 /* (2) Calculate the initial address of the aggregate-pointer, and set
5059 the aggregate-pointer to point to it before the loop. */
5060
5061 /* Create: &(base[init_val] + offset) in the loop preheader. */
5062
5063 new_temp = vect_create_addr_base_for_vector_ref (vinfo,
5064 stmt_info, &new_stmt_list,
5065 offset);
5066 if (new_stmt_list)
5067 {
5068 if (pe)
5069 {
5070 new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
5071 gcc_assert (!new_bb);
5072 }
5073 else
5074 gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);
5075 }
5076
5077 *initial_address = new_temp;
5078 aggr_ptr_init = new_temp;
5079
5080 /* (3) Handle the updating of the aggregate-pointer inside the loop.
5081 This is needed when ONLY_INIT is false, and also when AT_LOOP is the
5082 inner-loop nested in LOOP (during outer-loop vectorization). */
5083
5084 /* No update in loop is required. */
5085 if (only_init && (!loop_vinfo || at_loop == loop))
5086 aptr = aggr_ptr_init;
5087 else
5088 {
5089 /* Accesses to invariant addresses should be handled specially
5090 by the caller. */
5091 tree step = vect_dr_behavior (vinfo, dr_info)->step;
5092 gcc_assert (!integer_zerop (step));
5093
5094 if (iv_step == NULL_TREE)
5095 {
5096 /* The step of the aggregate pointer is the type size,
5097 negated for downward accesses. */
5098 iv_step = TYPE_SIZE_UNIT (aggr_type);
5099 if (tree_int_cst_sgn (step) == -1)
5100 iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
5101 }
5102
5103 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
5104
5105 create_iv (aggr_ptr_init,
5106 fold_convert (aggr_ptr_type, iv_step),
5107 aggr_ptr, loop, &incr_gsi, insert_after,
5108 &indx_before_incr, &indx_after_incr);
5109 incr = gsi_stmt (incr_gsi);
5110
5111 /* Copy the points-to information if it exists. */
5112 if (DR_PTR_INFO (dr))
5113 {
5114 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr_info);
5115 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr_info);
5116 }
5117 if (ptr_incr)
5118 *ptr_incr = incr;
5119
5120 aptr = indx_before_incr;
5121 }
5122
5123 if (!nested_in_vect_loop || only_init)
5124 return aptr;
5125
5126
5127 /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
5128 nested in LOOP, if it exists. */
5129
5130 gcc_assert (nested_in_vect_loop);
5131 if (!only_init)
5132 {
5133 standard_iv_increment_position (containing_loop, &incr_gsi,
5134 &insert_after);
5135 create_iv (aptr, fold_convert (aggr_ptr_type, DR_STEP (dr)), aggr_ptr,
5136 containing_loop, &incr_gsi, insert_after, &indx_before_incr,
5137 &indx_after_incr);
5138 incr = gsi_stmt (incr_gsi);
5139
5140 /* Copy the points-to information if it exists. */
5141 if (DR_PTR_INFO (dr))
5142 {
5143 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr_info);
5144 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr_info);
5145 }
5146 if (ptr_incr)
5147 *ptr_incr = incr;
5148
5149 return indx_before_incr;
5150 }
5151 else
5152 gcc_unreachable ();
5153 }
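
/* An illustrative, standalone sketch (not part of GCC) of the scalar shape
   of the pointer and induction variable that vect_create_data_ref_ptr above
   arranges to be generated, assuming a V8HI access with a positive unit step
   and no nested inner loop.  Names are hypothetical.

     typedef short v8hi __attribute__ ((vector_size (16)));

     void
     data_ref_ptr_model (short *initial_address, int niters)
     {
       v8hi *ap = (v8hi *) initial_address;   // step (2): set in the preheader
       for (int i = 0; i < niters; i++)
         {
           // ... vector load/store through *ap ...
           ap = ap + 1;   // step (3): create_iv bumps the pointer by
                          // TYPE_SIZE_UNIT (aggr_type) each iteration
         }
     }

   With ONLY_INIT the caller just gets the pointer as initialized in the
   preheader; otherwise it gets the loop-carried value produced by create_iv
   (and, for outer-loop vectorization, step (4) adds a second update in the
   inner loop).  */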
5154
5155
5156 /* Function bump_vector_ptr
5157
5158 Increment a pointer (to a vector type) by vector-size. If requested,
5159 i.e. if PTR-INCR is given, then also connect the new increment stmt
5160 to the existing def-use update-chain of the pointer, by modifying
5161 the PTR_INCR as illustrated below:
5162
5163 The pointer def-use update-chain before this function:
5164 DATAREF_PTR = phi (p_0, p_2)
5165 ....
5166 PTR_INCR: p_2 = DATAREF_PTR + step
5167
5168 The pointer def-use update-chain after this function:
5169 DATAREF_PTR = phi (p_0, p_2)
5170 ....
5171 NEW_DATAREF_PTR = DATAREF_PTR + BUMP
5172 ....
5173 PTR_INCR: p_2 = NEW_DATAREF_PTR + step
5174
5175 Input:
5176 DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
5177 in the loop.
5178 PTR_INCR - optional. The stmt that updates the pointer in each iteration of
5179 the loop. The increment amount across iterations is expected
5180 to be vector_size.
5181 BSI - location where the new update stmt is to be placed.
5182 STMT_INFO - the original scalar memory-access stmt that is being vectorized.
5183 BUMP - optional. The offset by which to bump the pointer. If not given,
5184 the offset is assumed to be vector_size.
5185
5186 Output: Return NEW_DATAREF_PTR as illustrated above.
5187
5188 */
5189
5190 tree
5191 bump_vector_ptr (vec_info *vinfo,
5192 tree dataref_ptr, gimple *ptr_incr, gimple_stmt_iterator *gsi,
5193 stmt_vec_info stmt_info, tree bump)
5194 {
5195 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
5196 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5197 tree update = TYPE_SIZE_UNIT (vectype);
5198 gimple *incr_stmt;
5199 ssa_op_iter iter;
5200 use_operand_p use_p;
5201 tree new_dataref_ptr;
5202
5203 if (bump)
5204 update = bump;
5205
5206 if (TREE_CODE (dataref_ptr) == SSA_NAME)
5207 new_dataref_ptr = copy_ssa_name (dataref_ptr);
5208 else
5209 new_dataref_ptr = make_ssa_name (TREE_TYPE (dataref_ptr));
5210 incr_stmt = gimple_build_assign (new_dataref_ptr, POINTER_PLUS_EXPR,
5211 dataref_ptr, update);
5212 vect_finish_stmt_generation (vinfo, stmt_info, incr_stmt, gsi);
5213 /* Fold the increment, avoiding excessive use-def chains that would
5214 otherwise cause compile-time issues for passes until the next forwprop
5215 pass, which would do this folding as well. */
5216 gimple_stmt_iterator fold_gsi = gsi_for_stmt (incr_stmt);
5217 if (fold_stmt (&fold_gsi, follow_all_ssa_edges))
5218 {
5219 incr_stmt = gsi_stmt (fold_gsi);
5220 update_stmt (incr_stmt);
5221 }
5222
5223 /* Copy the points-to information if it exists. */
5224 if (DR_PTR_INFO (dr))
5225 {
5226 duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
5227 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
5228 }
5229
5230 if (!ptr_incr)
5231 return new_dataref_ptr;
5232
5233 /* Update the vector-pointer's cross-iteration increment. */
5234 FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
5235 {
5236 tree use = USE_FROM_PTR (use_p);
5237
5238 if (use == dataref_ptr)
5239 SET_USE (use_p, new_dataref_ptr);
5240 else
5241 gcc_assert (operand_equal_p (use, update, 0));
5242 }
5243
5244 return new_dataref_ptr;
5245 }
5246
5247
5248 /* Copy memory reference info such as base/clique from the SRC reference
5249 to the DEST MEM_REF. */
5250
5251 void
5252 vect_copy_ref_info (tree dest, tree src)
5253 {
5254 if (TREE_CODE (dest) != MEM_REF)
5255 return;
5256
5257 tree src_base = src;
5258 while (handled_component_p (src_base))
5259 src_base = TREE_OPERAND (src_base, 0);
5260 if (TREE_CODE (src_base) != MEM_REF
5261 && TREE_CODE (src_base) != TARGET_MEM_REF)
5262 return;
5263
5264 MR_DEPENDENCE_CLIQUE (dest) = MR_DEPENDENCE_CLIQUE (src_base);
5265 MR_DEPENDENCE_BASE (dest) = MR_DEPENDENCE_BASE (src_base);
5266 }
5267
5268
5269 /* Function vect_create_destination_var.
5270
5271 Create a new temporary of type VECTYPE. */
5272
5273 tree
5274 vect_create_destination_var (tree scalar_dest, tree vectype)
5275 {
5276 tree vec_dest;
5277 const char *name;
5278 char *new_name;
5279 tree type;
5280 enum vect_var_kind kind;
5281
5282 kind = vectype
5283 ? VECTOR_BOOLEAN_TYPE_P (vectype)
5284 ? vect_mask_var
5285 : vect_simple_var
5286 : vect_scalar_var;
5287 type = vectype ? vectype : TREE_TYPE (scalar_dest);
5288
5289 gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
5290
5291 name = get_name (scalar_dest);
5292 if (name)
5293 new_name = xasprintf ("%s_%u", name, SSA_NAME_VERSION (scalar_dest));
5294 else
5295 new_name = xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest));
5296 vec_dest = vect_get_new_vect_var (type, kind, new_name);
5297 free (new_name);
5298
5299 return vec_dest;
5300 }
5301
5302 /* Function vect_grouped_store_supported.
5303
5304 Returns TRUE if interleave high and interleave low permutations
5305 are supported, and FALSE otherwise. */
5306
5307 bool
5308 vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
5309 {
5310 machine_mode mode = TYPE_MODE (vectype);
5311
5312 /* vect_permute_store_chain requires the group size to be equal to 3 or
5313 be a power of two. */
5314 if (count != 3 && exact_log2 (count) == -1)
5315 {
5316 if (dump_enabled_p ())
5317 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5318 "the size of the group of accesses"
5319 " is not a power of 2 or not eqaul to 3\n");
5320 return false;
5321 }
5322
5323 /* Check that the permutation is supported. */
5324 if (VECTOR_MODE_P (mode))
5325 {
5326 unsigned int i;
5327 if (count == 3)
5328 {
5329 unsigned int j0 = 0, j1 = 0, j2 = 0;
5330 unsigned int i, j;
5331
5332 unsigned int nelt;
5333 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5334 {
5335 if (dump_enabled_p ())
5336 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5337 "cannot handle groups of 3 stores for"
5338 " variable-length vectors\n");
5339 return false;
5340 }
5341
5342 vec_perm_builder sel (nelt, nelt, 1);
5343 sel.quick_grow (nelt);
5344 vec_perm_indices indices;
5345 for (j = 0; j < 3; j++)
5346 {
5347 int nelt0 = ((3 - j) * nelt) % 3;
5348 int nelt1 = ((3 - j) * nelt + 1) % 3;
5349 int nelt2 = ((3 - j) * nelt + 2) % 3;
5350 for (i = 0; i < nelt; i++)
5351 {
5352 if (3 * i + nelt0 < nelt)
5353 sel[3 * i + nelt0] = j0++;
5354 if (3 * i + nelt1 < nelt)
5355 sel[3 * i + nelt1] = nelt + j1++;
5356 if (3 * i + nelt2 < nelt)
5357 sel[3 * i + nelt2] = 0;
5358 }
5359 indices.new_vector (sel, 2, nelt);
5360 if (!can_vec_perm_const_p (mode, indices))
5361 {
5362 if (dump_enabled_p ())
5363 dump_printf (MSG_MISSED_OPTIMIZATION,
5364 "permutation op not supported by target.\n");
5365 return false;
5366 }
5367
5368 for (i = 0; i < nelt; i++)
5369 {
5370 if (3 * i + nelt0 < nelt)
5371 sel[3 * i + nelt0] = 3 * i + nelt0;
5372 if (3 * i + nelt1 < nelt)
5373 sel[3 * i + nelt1] = 3 * i + nelt1;
5374 if (3 * i + nelt2 < nelt)
5375 sel[3 * i + nelt2] = nelt + j2++;
5376 }
5377 indices.new_vector (sel, 2, nelt);
5378 if (!can_vec_perm_const_p (mode, indices))
5379 {
5380 if (dump_enabled_p ())
5381 dump_printf (MSG_MISSED_OPTIMIZATION,
5382 "permutation op not supported by target.\n");
5383 return false;
5384 }
5385 }
5386 return true;
5387 }
5388 else
5389 {
5390 /* If length is not equal to 3 then only power of 2 is supported. */
5391 gcc_assert (pow2p_hwi (count));
5392 poly_uint64 nelt = GET_MODE_NUNITS (mode);
5393
5394 /* The encoding has 2 interleaved stepped patterns. */
5395 vec_perm_builder sel (nelt, 2, 3);
5396 sel.quick_grow (6);
5397 for (i = 0; i < 3; i++)
5398 {
5399 sel[i * 2] = i;
5400 sel[i * 2 + 1] = i + nelt;
5401 }
5402 vec_perm_indices indices (sel, 2, nelt);
5403 if (can_vec_perm_const_p (mode, indices))
5404 {
5405 for (i = 0; i < 6; i++)
5406 sel[i] += exact_div (nelt, 2);
5407 indices.new_vector (sel, 2, nelt);
5408 if (can_vec_perm_const_p (mode, indices))
5409 return true;
5410 }
5411 }
5412 }
5413
5414 if (dump_enabled_p ())
5415 dump_printf (MSG_MISSED_OPTIMIZATION,
5416 "permutation op not supported by target.\n");
5417 return false;
5418 }
5419
5420
5421 /* Return TRUE if vec_{mask_}store_lanes is available for COUNT vectors of
5422 type VECTYPE. MASKED_P says whether the masked form is needed. */
5423
5424 bool
5425 vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
5426 bool masked_p)
5427 {
5428 if (masked_p)
5429 return vect_lanes_optab_supported_p ("vec_mask_store_lanes",
5430 vec_mask_store_lanes_optab,
5431 vectype, count);
5432 else
5433 return vect_lanes_optab_supported_p ("vec_store_lanes",
5434 vec_store_lanes_optab,
5435 vectype, count);
5436 }
5437
5438
5439 /* Function vect_permute_store_chain.
5440
5441 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
5442 a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
5443 the data correctly for the stores. Return the final references for stores
5444 in RESULT_CHAIN.
5445
5446 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5447 The input is 4 vectors each containing 8 elements. We assign a number to
5448 each element, the input sequence is:
5449
5450 1st vec: 0 1 2 3 4 5 6 7
5451 2nd vec: 8 9 10 11 12 13 14 15
5452 3rd vec: 16 17 18 19 20 21 22 23
5453 4th vec: 24 25 26 27 28 29 30 31
5454
5455 The output sequence should be:
5456
5457 1st vec: 0 8 16 24 1 9 17 25
5458 2nd vec: 2 10 18 26 3 11 19 27
5459 3rd vec: 4 12 20 28 5 13 21 29
5460 4th vec: 6 14 22 30 7 15 23 31
5461
5462 i.e., we interleave the contents of the four vectors in their order.
5463
5464 We use interleave_high/low instructions to create such output. The input of
5465 each interleave_high/low operation is two vectors:
5466 1st vec 2nd vec
5467 0 1 2 3 4 5 6 7
5468 the even elements of the result vector are obtained left-to-right from the
5469 high/low elements of the first vector. The odd elements of the result are
5470 obtained left-to-right from the high/low elements of the second vector.
5471 The output of interleave_high will be: 0 4 1 5
5472 and of interleave_low: 2 6 3 7
5473
5474
5475 The permutation is done in log LENGTH stages. In each stage interleave_high
5476 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
5477 where the first argument is taken from the first half of DR_CHAIN and the
5478 second argument from its second half.
5479 In our example,
5480
5481 I1: interleave_high (1st vec, 3rd vec)
5482 I2: interleave_low (1st vec, 3rd vec)
5483 I3: interleave_high (2nd vec, 4th vec)
5484 I4: interleave_low (2nd vec, 4th vec)
5485
5486 The output for the first stage is:
5487
5488 I1: 0 16 1 17 2 18 3 19
5489 I2: 4 20 5 21 6 22 7 23
5490 I3: 8 24 9 25 10 26 11 27
5491 I4: 12 28 13 29 14 30 15 31
5492
5493 The output of the second stage, i.e. the final result is:
5494
5495 I1: 0 8 16 24 1 9 17 25
5496 I2: 2 10 18 26 3 11 19 27
5497 I3: 4 12 20 28 5 13 21 29
5498 I4: 6 14 22 30 7 15 23 31. */
5499
5500 void
5501 vect_permute_store_chain (vec_info *vinfo, vec<tree> &dr_chain,
5502 unsigned int length,
5503 stmt_vec_info stmt_info,
5504 gimple_stmt_iterator *gsi,
5505 vec<tree> *result_chain)
5506 {
5507 tree vect1, vect2, high, low;
5508 gimple *perm_stmt;
5509 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5510 tree perm_mask_low, perm_mask_high;
5511 tree data_ref;
5512 tree perm3_mask_low, perm3_mask_high;
5513 unsigned int i, j, n, log_length = exact_log2 (length);
5514
5515 result_chain->quick_grow (length);
5516 memcpy (result_chain->address (), dr_chain.address (),
5517 length * sizeof (tree));
5518
5519 if (length == 3)
5520 {
5521 /* vect_grouped_store_supported ensures that this is constant. */
5522 unsigned int nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5523 unsigned int j0 = 0, j1 = 0, j2 = 0;
5524
5525 vec_perm_builder sel (nelt, nelt, 1);
5526 sel.quick_grow (nelt);
5527 vec_perm_indices indices;
5528 for (j = 0; j < 3; j++)
5529 {
5530 int nelt0 = ((3 - j) * nelt) % 3;
5531 int nelt1 = ((3 - j) * nelt + 1) % 3;
5532 int nelt2 = ((3 - j) * nelt + 2) % 3;
5533
5534 for (i = 0; i < nelt; i++)
5535 {
5536 if (3 * i + nelt0 < nelt)
5537 sel[3 * i + nelt0] = j0++;
5538 if (3 * i + nelt1 < nelt)
5539 sel[3 * i + nelt1] = nelt + j1++;
5540 if (3 * i + nelt2 < nelt)
5541 sel[3 * i + nelt2] = 0;
5542 }
5543 indices.new_vector (sel, 2, nelt);
5544 perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5545
5546 for (i = 0; i < nelt; i++)
5547 {
5548 if (3 * i + nelt0 < nelt)
5549 sel[3 * i + nelt0] = 3 * i + nelt0;
5550 if (3 * i + nelt1 < nelt)
5551 sel[3 * i + nelt1] = 3 * i + nelt1;
5552 if (3 * i + nelt2 < nelt)
5553 sel[3 * i + nelt2] = nelt + j2++;
5554 }
5555 indices.new_vector (sel, 2, nelt);
5556 perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5557
5558 vect1 = dr_chain[0];
5559 vect2 = dr_chain[1];
5560
5561 /* Create interleaving stmt:
5562 low = VEC_PERM_EXPR <vect1, vect2,
5563 {j, nelt, *, j + 1, nelt + j + 1, *,
5564 j + 2, nelt + j + 2, *, ...}> */
5565 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5566 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
5567 vect2, perm3_mask_low);
5568 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
5569
5570 vect1 = data_ref;
5571 vect2 = dr_chain[2];
5572 /* Create interleaving stmt:
5573 low = VEC_PERM_EXPR <vect1, vect2,
5574 {0, 1, nelt + j, 3, 4, nelt + j + 1,
5575 6, 7, nelt + j + 2, ...}> */
5576 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5577 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
5578 vect2, perm3_mask_high);
5579 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
5580 (*result_chain)[j] = data_ref;
5581 }
5582 }
5583 else
5584 {
5585 /* If length is not equal to 3 then only power of 2 is supported. */
5586 gcc_assert (pow2p_hwi (length));
5587
5588 /* The encoding has 2 interleaved stepped patterns. */
5589 poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
5590 vec_perm_builder sel (nelt, 2, 3);
5591 sel.quick_grow (6);
5592 for (i = 0; i < 3; i++)
5593 {
5594 sel[i * 2] = i;
5595 sel[i * 2 + 1] = i + nelt;
5596 }
5597 vec_perm_indices indices (sel, 2, nelt);
5598 perm_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5599
5600 for (i = 0; i < 6; i++)
5601 sel[i] += exact_div (nelt, 2);
5602 indices.new_vector (sel, 2, nelt);
5603 perm_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5604
5605 for (i = 0, n = log_length; i < n; i++)
5606 {
5607 for (j = 0; j < length/2; j++)
5608 {
5609 vect1 = dr_chain[j];
5610 vect2 = dr_chain[j+length/2];
5611
5612 /* Create interleaving stmt:
5613 high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
5614 ...}> */
5615 high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
5616 perm_stmt = gimple_build_assign (high, VEC_PERM_EXPR, vect1,
5617 vect2, perm_mask_high);
5618 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
5619 (*result_chain)[2*j] = high;
5620
5621 /* Create interleaving stmt:
5622 low = VEC_PERM_EXPR <vect1, vect2,
5623 {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
5624 ...}> */
5625 low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
5626 perm_stmt = gimple_build_assign (low, VEC_PERM_EXPR, vect1,
5627 vect2, perm_mask_low);
5628 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
5629 (*result_chain)[2*j+1] = low;
5630 }
5631 memcpy (dr_chain.address (), result_chain->address (),
5632 length * sizeof (tree));
5633 }
5634 }
5635 }
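
/* An illustrative, standalone C model (not part of GCC) of the power-of-two
   interleaving performed by vect_permute_store_chain above, reproducing the
   4 x 8 example from its comment.  The helper names are hypothetical; the
   real code emits VEC_PERM_EXPRs using perm_mask_high/perm_mask_low instead.

     #include <stdio.h>
     #include <string.h>

     #define NELT 8
     #define LEN  4

     // One interleave_high/interleave_low step on two NELT-element vectors.
     static void
     interleave (const int *a, const int *b, int *hi, int *lo)
     {
       for (int i = 0; i < NELT / 2; i++)
         {
           hi[2 * i] = a[i];
           hi[2 * i + 1] = b[i];
           lo[2 * i] = a[NELT / 2 + i];
           lo[2 * i + 1] = b[NELT / 2 + i];
         }
     }

     int
     main (void)
     {
       int chain[LEN][NELT], result[LEN][NELT];
       for (int j = 0; j < LEN; j++)
         for (int i = 0; i < NELT; i++)
           chain[j][i] = j * NELT + i;   // 0..31 as in the comment above

       for (int stage = 0; stage < 2; stage++)   // log2 (LEN) stages
         {
           for (int j = 0; j < LEN / 2; j++)
             interleave (chain[j], chain[j + LEN / 2],
                         result[2 * j], result[2 * j + 1]);
           memcpy (chain, result, sizeof chain);
         }

       for (int j = 0; j < LEN; j++, puts (""))
         for (int i = 0; i < NELT; i++)
           printf (" %2d", chain[j][i]);
       return 0;
     }

   This prints 0 8 16 24 1 9 17 25 / 2 10 18 26 3 11 19 27 /
   4 12 20 28 5 13 21 29 / 6 14 22 30 7 15 23 31, matching the stages
   worked through in the comment above.  */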
5636
5637 /* Function vect_setup_realignment
5638
5639 This function is called when vectorizing an unaligned load using
5640 the dr_explicit_realign[_optimized] scheme.
5641 This function generates the following code at the loop prolog:
5642
5643 p = initial_addr;
5644 x msq_init = *(floor(p)); # prolog load
5645 realignment_token = call target_builtin;
5646 loop:
5647 x msq = phi (msq_init, ---)
5648
5649 The stmts marked with x are generated only for the case of
5650 dr_explicit_realign_optimized.
5651
5652 The code above sets up a new (vector) pointer, pointing to the first
5653 location accessed by STMT_INFO, and a "floor-aligned" load using that
5654 pointer. It also generates code to compute the "realignment-token"
5655 (if the relevant target hook was defined), and creates a phi-node at the
5656 loop-header bb whose arguments are the result of the prolog-load (created
5657 by this function) and the result of a load that takes place in the loop
5658 (to be created by the caller to this function).
5659
5660 For the case of dr_explicit_realign_optimized:
5661 The caller to this function uses the phi-result (msq) to create the
5662 realignment code inside the loop, and sets up the missing phi argument,
5663 as follows:
5664 loop:
5665 msq = phi (msq_init, lsq)
5666 lsq = *(floor(p')); # load in loop
5667 result = realign_load (msq, lsq, realignment_token);
5668
5669 For the case of dr_explicit_realign:
5670 loop:
5671 msq = *(floor(p)); # load in loop
5672 p' = p + (VS-1);
5673 lsq = *(floor(p')); # load in loop
5674 result = realign_load (msq, lsq, realignment_token);
5675
5676 Input:
5677 STMT_INFO - (scalar) load stmt to be vectorized. This load accesses
5678 a memory location that may be unaligned.
5679 BSI - place where new code is to be inserted.
5680 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
5681 is used.
5682
5683 Output:
5684 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
5685 target hook, if defined.
5686 Return value - the result of the loop-header phi node. */
5687
5688 tree
5689 vect_setup_realignment (vec_info *vinfo, stmt_vec_info stmt_info,
5690 gimple_stmt_iterator *gsi, tree *realignment_token,
5691 enum dr_alignment_support alignment_support_scheme,
5692 tree init_addr,
5693 class loop **at_loop)
5694 {
5695 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5696 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
5697 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
5698 struct data_reference *dr = dr_info->dr;
5699 class loop *loop = NULL;
5700 edge pe = NULL;
5701 tree scalar_dest = gimple_assign_lhs (stmt_info->stmt);
5702 tree vec_dest;
5703 gimple *inc;
5704 tree ptr;
5705 tree data_ref;
5706 basic_block new_bb;
5707 tree msq_init = NULL_TREE;
5708 tree new_temp;
5709 gphi *phi_stmt;
5710 tree msq = NULL_TREE;
5711 gimple_seq stmts = NULL;
5712 bool compute_in_loop = false;
5713 bool nested_in_vect_loop = false;
5714 class loop *containing_loop = (gimple_bb (stmt_info->stmt))->loop_father;
5715 class loop *loop_for_initial_load = NULL;
5716
5717 if (loop_vinfo)
5718 {
5719 loop = LOOP_VINFO_LOOP (loop_vinfo);
5720 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt_info);
5721 }
5722
5723 gcc_assert (alignment_support_scheme == dr_explicit_realign
5724 || alignment_support_scheme == dr_explicit_realign_optimized);
5725
5726 /* We need to generate three things:
5727 1. the misalignment computation
5728 2. the extra vector load (for the optimized realignment scheme).
5729 3. the phi node for the two vectors from which the realignment is
5730 done (for the optimized realignment scheme). */
5731
5732 /* 1. Determine where to generate the misalignment computation.
5733
5734 If INIT_ADDR is NULL_TREE, this indicates that the misalignment
5735 calculation will be generated by this function, outside the loop (in the
5736 preheader). Otherwise, INIT_ADDR had already been computed for us by the
5737 caller, inside the loop.
5738
5739 Background: If the misalignment remains fixed throughout the iterations of
5740 the loop, then both realignment schemes are applicable, and also the
5741 misalignment computation can be done outside LOOP. This is because we are
5742 vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
5743 are a multiple of VS (the Vector Size), and therefore the misalignment in
5744 different vectorized LOOP iterations is always the same.
5745 The problem arises only if the memory access is in an inner-loop nested
5746 inside LOOP, which is now being vectorized using outer-loop vectorization.
5747 This is the only case when the misalignment of the memory access may not
5748 remain fixed throughout the iterations of the inner-loop (as explained in
5749 detail in vect_supportable_dr_alignment). In this case, not only is the
5750 optimized realignment scheme not applicable, but also the misalignment
5751 computation (and generation of the realignment token that is passed to
5752 REALIGN_LOAD) have to be done inside the loop.
5753
5754 In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
5755 or not, which in turn determines if the misalignment is computed inside
5756 the inner-loop, or outside LOOP. */
5757
5758 if (init_addr != NULL_TREE || !loop_vinfo)
5759 {
5760 compute_in_loop = true;
5761 gcc_assert (alignment_support_scheme == dr_explicit_realign);
5762 }
5763
5764
5765 /* 2. Determine where to generate the extra vector load.
5766
5767 For the optimized realignment scheme, instead of generating two vector
5768 loads in each iteration, we generate a single extra vector load in the
5769 preheader of the loop, and in each iteration reuse the result of the
5770 vector load from the previous iteration. In case the memory access is in
5771 an inner-loop nested inside LOOP, which is now being vectorized using
5772 outer-loop vectorization, we need to determine whether this initial vector
5773 load should be generated at the preheader of the inner-loop, or can be
5774 generated at the preheader of LOOP. If the memory access has no evolution
5775 in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
5776 to be generated inside LOOP (in the preheader of the inner-loop). */
5777
5778 if (nested_in_vect_loop)
5779 {
5780 tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
5781 bool invariant_in_outerloop =
5782 (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
5783 loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
5784 }
5785 else
5786 loop_for_initial_load = loop;
5787 if (at_loop)
5788 *at_loop = loop_for_initial_load;
5789
5790 if (loop_for_initial_load)
5791 pe = loop_preheader_edge (loop_for_initial_load);
5792
5793 /* 3. For the case of the optimized realignment, create the first vector
5794 load at the loop preheader. */
5795
5796 if (alignment_support_scheme == dr_explicit_realign_optimized)
5797 {
5798 /* Create msq_init = *(floor(p1)) in the loop preheader */
5799 gassign *new_stmt;
5800
5801 gcc_assert (!compute_in_loop);
5802 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5803 ptr = vect_create_data_ref_ptr (vinfo, stmt_info, vectype,
5804 loop_for_initial_load, NULL_TREE,
5805 &init_addr, NULL, &inc, true);
5806 if (TREE_CODE (ptr) == SSA_NAME)
5807 new_temp = copy_ssa_name (ptr);
5808 else
5809 new_temp = make_ssa_name (TREE_TYPE (ptr));
5810 poly_uint64 align = DR_TARGET_ALIGNMENT (dr_info);
5811 tree type = TREE_TYPE (ptr);
5812 new_stmt = gimple_build_assign
5813 (new_temp, BIT_AND_EXPR, ptr,
5814 fold_build2 (MINUS_EXPR, type,
5815 build_int_cst (type, 0),
5816 build_int_cst (type, align)));
5817 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5818 gcc_assert (!new_bb);
5819 data_ref
5820 = build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
5821 build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
5822 vect_copy_ref_info (data_ref, DR_REF (dr));
5823 new_stmt = gimple_build_assign (vec_dest, data_ref);
5824 new_temp = make_ssa_name (vec_dest, new_stmt);
5825 gimple_assign_set_lhs (new_stmt, new_temp);
5826 if (pe)
5827 {
5828 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5829 gcc_assert (!new_bb);
5830 }
5831 else
5832 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5833
5834 msq_init = gimple_assign_lhs (new_stmt);
5835 }
5836
5837 /* 4. Create realignment token using a target builtin, if available.
5838 It is done either inside the containing loop, or before LOOP (as
5839 determined above). */
5840
5841 if (targetm.vectorize.builtin_mask_for_load)
5842 {
5843 gcall *new_stmt;
5844 tree builtin_decl;
5845
5846 /* Compute INIT_ADDR - the initial address accessed by this memref. */
5847 if (!init_addr)
5848 {
5849 /* Generate the INIT_ADDR computation outside LOOP. */
5850 init_addr = vect_create_addr_base_for_vector_ref (vinfo,
5851 stmt_info, &stmts,
5852 NULL_TREE);
5853 if (loop)
5854 {
5855 pe = loop_preheader_edge (loop);
5856 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
5857 gcc_assert (!new_bb);
5858 }
5859 else
5860 gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
5861 }
5862
5863 builtin_decl = targetm.vectorize.builtin_mask_for_load ();
5864 new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
5865 vec_dest =
5866 vect_create_destination_var (scalar_dest,
5867 gimple_call_return_type (new_stmt));
5868 new_temp = make_ssa_name (vec_dest, new_stmt);
5869 gimple_call_set_lhs (new_stmt, new_temp);
5870
5871 if (compute_in_loop)
5872 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5873 else
5874 {
5875 /* Generate the misalignment computation outside LOOP. */
5876 pe = loop_preheader_edge (loop);
5877 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5878 gcc_assert (!new_bb);
5879 }
5880
5881 *realignment_token = gimple_call_lhs (new_stmt);
5882
5883 /* The result of the CALL_EXPR to this builtin is determined from
5884 the value of the parameter and no global variables are touched
5885 which makes the builtin a "const" function. Requiring the
5886 builtin to have the "const" attribute makes it unnecessary
5887 to call mark_call_clobbered. */
5888 gcc_assert (TREE_READONLY (builtin_decl));
5889 }
5890
5891 if (alignment_support_scheme == dr_explicit_realign)
5892 return msq;
5893
5894 gcc_assert (!compute_in_loop);
5895 gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
5896
5897
5898 /* 5. Create msq = phi <msq_init, lsq> in loop */
5899
5900 pe = loop_preheader_edge (containing_loop);
5901 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5902 msq = make_ssa_name (vec_dest);
5903 phi_stmt = create_phi_node (msq, containing_loop->header);
5904 add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);
5905
5906 return msq;
5907 }
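
/* An illustrative, standalone sketch (not part of GCC) of what the explicit
   realignment scheme set up by vect_setup_realignment above computes,
   ignoring target-specific details such as endianness and the exact
   realignment-token encoding.  Names are hypothetical.

     #define NELT 4

     // Conceptual model of REALIGN_LOAD: combine two aligned loads (MSQ from
     // floor (p), LSQ from the next aligned chunk) into the unaligned vector
     // starting at p.  OFS is p's misalignment in elements.
     static void
     realign_load_model (const int *msq, const int *lsq, int ofs, int *res)
     {
       for (int i = 0; i < NELT; i++)
         res[i] = (i + ofs < NELT) ? msq[i + ofs] : lsq[i + ofs - NELT];
     }

   With dr_explicit_realign_optimized, the LSQ loaded in one iteration is
   reused as the MSQ of the next via the phi created in step 5 above, so only
   one new aligned load is emitted per iteration; the very first MSQ is the
   prolog load created in step 3.  */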
5908
5909
5910 /* Function vect_grouped_load_supported.
5911
5912 COUNT is the size of the load group (the number of statements plus the
5913 number of gaps). SINGLE_ELEMENT_P is true if there is actually
5914 only one statement, with a gap of COUNT - 1.
5915
5916 Returns true if a suitable permute exists. */
5917
5918 bool
5919 vect_grouped_load_supported (tree vectype, bool single_element_p,
5920 unsigned HOST_WIDE_INT count)
5921 {
5922 machine_mode mode = TYPE_MODE (vectype);
5923
5924 /* If this is single-element interleaving with an element distance
5925 that leaves unused vector loads around, punt - we at least create
5926 very sub-optimal code in that case (and blow up memory,
5927 see PR65518). */
5928 if (single_element_p && maybe_gt (count, TYPE_VECTOR_SUBPARTS (vectype)))
5929 {
5930 if (dump_enabled_p ())
5931 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5932 "single-element interleaving not supported "
5933 "for not adjacent vector loads\n");
5934 return false;
5935 }
5936
5937 /* vect_permute_load_chain requires the group size to be equal to 3 or
5938 be a power of two. */
5939 if (count != 3 && exact_log2 (count) == -1)
5940 {
5941 if (dump_enabled_p ())
5942 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5943 "the size of the group of accesses"
5944 " is not a power of 2 or not equal to 3\n");
5945 return false;
5946 }
5947
5948 /* Check that the permutation is supported. */
5949 if (VECTOR_MODE_P (mode))
5950 {
5951 unsigned int i, j;
5952 if (count == 3)
5953 {
5954 unsigned int nelt;
5955 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5956 {
5957 if (dump_enabled_p ())
5958 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5959 "cannot handle groups of 3 loads for"
5960 " variable-length vectors\n");
5961 return false;
5962 }
5963
5964 vec_perm_builder sel (nelt, nelt, 1);
5965 sel.quick_grow (nelt);
5966 vec_perm_indices indices;
5967 unsigned int k;
5968 for (k = 0; k < 3; k++)
5969 {
5970 for (i = 0; i < nelt; i++)
5971 if (3 * i + k < 2 * nelt)
5972 sel[i] = 3 * i + k;
5973 else
5974 sel[i] = 0;
5975 indices.new_vector (sel, 2, nelt);
5976 if (!can_vec_perm_const_p (mode, indices))
5977 {
5978 if (dump_enabled_p ())
5979 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5980 "shuffle of 3 loads is not supported by"
5981 " target\n");
5982 return false;
5983 }
5984 for (i = 0, j = 0; i < nelt; i++)
5985 if (3 * i + k < 2 * nelt)
5986 sel[i] = i;
5987 else
5988 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5989 indices.new_vector (sel, 2, nelt);
5990 if (!can_vec_perm_const_p (mode, indices))
5991 {
5992 if (dump_enabled_p ())
5993 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5994 "shuffle of 3 loads is not supported by"
5995 " target\n");
5996 return false;
5997 }
5998 }
5999 return true;
6000 }
6001 else
6002 {
6003 /* If length is not equal to 3 then only power of 2 is supported. */
6004 gcc_assert (pow2p_hwi (count));
6005 poly_uint64 nelt = GET_MODE_NUNITS (mode);
6006
6007 /* The encoding has a single stepped pattern. */
6008 vec_perm_builder sel (nelt, 1, 3);
6009 sel.quick_grow (3);
6010 for (i = 0; i < 3; i++)
6011 sel[i] = i * 2;
6012 vec_perm_indices indices (sel, 2, nelt);
6013 if (can_vec_perm_const_p (mode, indices))
6014 {
6015 for (i = 0; i < 3; i++)
6016 sel[i] = i * 2 + 1;
6017 indices.new_vector (sel, 2, nelt);
6018 if (can_vec_perm_const_p (mode, indices))
6019 return true;
6020 }
6021 }
6022 }
6023
6024 if (dump_enabled_p ())
6025 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6026 "extract even/odd not supported by target\n");
6027 return false;
6028 }
6029
6030 /* Return TRUE if vec_{masked_}load_lanes is available for COUNT vectors of
6031 type VECTYPE. MASKED_P says whether the masked form is needed. */
6032
6033 bool
6034 vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
6035 bool masked_p)
6036 {
6037 if (masked_p)
6038 return vect_lanes_optab_supported_p ("vec_mask_load_lanes",
6039 vec_mask_load_lanes_optab,
6040 vectype, count);
6041 else
6042 return vect_lanes_optab_supported_p ("vec_load_lanes",
6043 vec_load_lanes_optab,
6044 vectype, count);
6045 }
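
/* An illustrative, standalone sketch (not part of GCC) of what a
   vec_load_lanes-style operation provides for COUNT == 3: it loads
   COUNT * NELT consecutive elements and de-interleaves them into COUNT
   vectors in a single (possibly masked) operation, so no separate
   permutation statements are needed.  Names are hypothetical.

     #define NELT  4
     #define COUNT 3

     static void
     load_lanes_model (const int *addr, int lanes[COUNT][NELT])
     {
       for (int i = 0; i < NELT; i++)
         for (int j = 0; j < COUNT; j++)
           lanes[j][i] = addr[i * COUNT + j];   // lane J collects field J
     }
   */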
6046
6047 /* Function vect_permute_load_chain.
6048
6049 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
6050 a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
6051 the input data correctly. Return the final references for loads in
6052 RESULT_CHAIN.
6053
6054 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
6055 The input is 4 vectors each containing 8 elements. We assign a number to each
6056 element, the input sequence is:
6057
6058 1st vec: 0 1 2 3 4 5 6 7
6059 2nd vec: 8 9 10 11 12 13 14 15
6060 3rd vec: 16 17 18 19 20 21 22 23
6061 4th vec: 24 25 26 27 28 29 30 31
6062
6063 The output sequence should be:
6064
6065 1st vec: 0 4 8 12 16 20 24 28
6066 2nd vec: 1 5 9 13 17 21 25 29
6067 3rd vec: 2 6 10 14 18 22 26 30
6068 4th vec: 3 7 11 15 19 23 27 31
6069
6070 i.e., the first output vector should contain the first elements of each
6071 interleaving group, etc.
6072
6073 We use extract_even/odd instructions to create such output. The input of
6074 each extract_even/odd operation is two vectors
6075 1st vec 2nd vec
6076 0 1 2 3 4 5 6 7
6077
6078 and the output is the vector of extracted even/odd elements. The output of
6079 extract_even will be: 0 2 4 6
6080 and of extract_odd: 1 3 5 7
6081
6082
6083 The permutation is done in log LENGTH stages. In each stage extract_even
6084 and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
6085 their order. In our example,
6086
6087 E1: extract_even (1st vec, 2nd vec)
6088 E2: extract_odd (1st vec, 2nd vec)
6089 E3: extract_even (3rd vec, 4th vec)
6090 E4: extract_odd (3rd vec, 4th vec)
6091
6092 The output for the first stage will be:
6093
6094 E1: 0 2 4 6 8 10 12 14
6095 E2: 1 3 5 7 9 11 13 15
6096 E3: 16 18 20 22 24 26 28 30
6097 E4: 17 19 21 23 25 27 29 31
6098
6099 In order to proceed and create the correct sequence for the next stage (or
6100 for the correct output, if the second stage is the last one, as in our
6101 example), we first put the output of extract_even operation and then the
6102 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
6103 The input for the second stage is:
6104
6105 1st vec (E1): 0 2 4 6 8 10 12 14
6106 2nd vec (E3): 16 18 20 22 24 26 28 30
6107 3rd vec (E2): 1 3 5 7 9 11 13 15
6108 4th vec (E4): 17 19 21 23 25 27 29 31
6109
6110 The output of the second stage:
6111
6112 E1: 0 4 8 12 16 20 24 28
6113 E2: 2 6 10 14 18 22 26 30
6114 E3: 1 5 9 13 17 21 25 29
6115 E4: 3 7 11 15 19 23 27 31
6116
6117 And RESULT_CHAIN after reordering:
6118
6119 1st vec (E1): 0 4 8 12 16 20 24 28
6120 2nd vec (E3): 1 5 9 13 17 21 25 29
6121 3rd vec (E2): 2 6 10 14 18 22 26 30
6122 4th vec (E4): 3 7 11 15 19 23 27 31. */
6123
6124 static void
6125 vect_permute_load_chain (vec_info *vinfo, vec<tree> dr_chain,
6126 unsigned int length,
6127 stmt_vec_info stmt_info,
6128 gimple_stmt_iterator *gsi,
6129 vec<tree> *result_chain)
6130 {
6131 tree data_ref, first_vect, second_vect;
6132 tree perm_mask_even, perm_mask_odd;
6133 tree perm3_mask_low, perm3_mask_high;
6134 gimple *perm_stmt;
6135 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6136 unsigned int i, j, log_length = exact_log2 (length);
6137
6138 result_chain->quick_grow (length);
6139 memcpy (result_chain->address (), dr_chain.address (),
6140 length * sizeof (tree));
6141
6142 if (length == 3)
6143 {
6144 /* vect_grouped_load_supported ensures that this is constant. */
6145 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
6146 unsigned int k;
6147
6148 vec_perm_builder sel (nelt, nelt, 1);
6149 sel.quick_grow (nelt);
6150 vec_perm_indices indices;
6151 for (k = 0; k < 3; k++)
6152 {
6153 for (i = 0; i < nelt; i++)
6154 if (3 * i + k < 2 * nelt)
6155 sel[i] = 3 * i + k;
6156 else
6157 sel[i] = 0;
6158 indices.new_vector (sel, 2, nelt);
6159 perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
6160
6161 for (i = 0, j = 0; i < nelt; i++)
6162 if (3 * i + k < 2 * nelt)
6163 sel[i] = i;
6164 else
6165 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
6166 indices.new_vector (sel, 2, nelt);
6167 perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
6168
6169 first_vect = dr_chain[0];
6170 second_vect = dr_chain[1];
6171
6172 /* Create interleaving stmt (low part of):
6173 low = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
6174 ...}> */
6175 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
6176 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
6177 second_vect, perm3_mask_low);
6178 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6179
6180 /* Create interleaving stmt (high part of):
6181 high = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
6182 ...}> */
6183 first_vect = data_ref;
6184 second_vect = dr_chain[2];
6185 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
6186 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
6187 second_vect, perm3_mask_high);
6188 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6189 (*result_chain)[k] = data_ref;
6190 }
6191 }
6192 else
6193 {
6194 /* If length is not equal to 3 then only power of 2 is supported. */
6195 gcc_assert (pow2p_hwi (length));
6196
6197 /* The encoding has a single stepped pattern. */
6198 poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
6199 vec_perm_builder sel (nelt, 1, 3);
6200 sel.quick_grow (3);
6201 for (i = 0; i < 3; ++i)
6202 sel[i] = i * 2;
6203 vec_perm_indices indices (sel, 2, nelt);
6204 perm_mask_even = vect_gen_perm_mask_checked (vectype, indices);
6205
6206 for (i = 0; i < 3; ++i)
6207 sel[i] = i * 2 + 1;
6208 indices.new_vector (sel, 2, nelt);
6209 perm_mask_odd = vect_gen_perm_mask_checked (vectype, indices);
6210
6211 for (i = 0; i < log_length; i++)
6212 {
6213 for (j = 0; j < length; j += 2)
6214 {
6215 first_vect = dr_chain[j];
6216 second_vect = dr_chain[j+1];
6217
6218 /* data_ref = permute_even (first_data_ref, second_data_ref); */
6219 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
6220 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6221 first_vect, second_vect,
6222 perm_mask_even);
6223 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6224 (*result_chain)[j/2] = data_ref;
6225
6226 /* data_ref = permute_odd (first_data_ref, second_data_ref); */
6227 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
6228 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6229 first_vect, second_vect,
6230 perm_mask_odd);
6231 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6232 (*result_chain)[j/2+length/2] = data_ref;
6233 }
6234 memcpy (dr_chain.address (), result_chain->address (),
6235 length * sizeof (tree));
6236 }
6237 }
6238 }
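
/* An illustrative, standalone C model (not part of GCC) of the
   extract-even/odd network built by vect_permute_load_chain above for the
   power-of-two case.  Helper names are hypothetical; the real code emits
   VEC_PERM_EXPRs using perm_mask_even/perm_mask_odd.

     #include <string.h>

     #define NELT 8
     #define LEN  4

     // One extract_even/extract_odd step on the concatenation of A and B.
     static void
     extract_even_odd (const int *a, const int *b, int *even, int *odd)
     {
       for (int i = 0; i < NELT / 2; i++)
         {
           even[i] = a[2 * i];
           odd[i] = a[2 * i + 1];
           even[NELT / 2 + i] = b[2 * i];
           odd[NELT / 2 + i] = b[2 * i + 1];
         }
     }

     // Mirrors the stage loop above: even parts land in the first half of
     // the chain, odd parts in the second half, then the chain is reused.
     static void
     permute_load_chain_model (int chain[LEN][NELT])
     {
       int result[LEN][NELT];
       for (int stage = 0; stage < 2; stage++)   // log2 (LEN) stages
         {
           for (int j = 0; j < LEN; j += 2)
             extract_even_odd (chain[j], chain[j + 1],
                               result[j / 2], result[j / 2 + LEN / 2]);
           memcpy (chain, result, sizeof result);
         }
     }

   Feeding in vectors holding 0..31 as in the comment above yields
   0 4 8 ... 28 / 1 5 9 ... 29 / 2 6 10 ... 30 / 3 7 11 ... 31.  */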
6239
6240 /* Function vect_shift_permute_load_chain.
6241
6242 Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate
6243 sequence of stmts to reorder the input data accordingly.
6244 Return the final references for loads in RESULT_CHAIN.
6245 Return true if successful, false otherwise.
6246
6247 E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
6248 The input is 3 vectors each containing 8 elements. We assign a
6249 number to each element, the input sequence is:
6250
6251 1st vec: 0 1 2 3 4 5 6 7
6252 2nd vec: 8 9 10 11 12 13 14 15
6253 3rd vec: 16 17 18 19 20 21 22 23
6254
6255 The output sequence should be:
6256
6257 1st vec: 0 3 6 9 12 15 18 21
6258 2nd vec: 1 4 7 10 13 16 19 22
6259 3rd vec: 2 5 8 11 14 17 20 23
6260
6261 We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.
6262
6263 First we shuffle all 3 vectors to get correct elements order:
6264
6265 1st vec: ( 0 3 6) ( 1 4 7) ( 2 5)
6266 2nd vec: ( 8 11 14) ( 9 12 15) (10 13)
6267 3rd vec: (16 19 22) (17 20 23) (18 21)
6268
6269 Next we unite and shift vector 3 times:
6270
6271 1st step:
6272 shift right by 6 the concatenation of:
6273 "1st vec" and "2nd vec"
6274 ( 0 3 6) ( 1 4 7) |( 2 5) _ ( 8 11 14) ( 9 12 15)| (10 13)
6275 "2nd vec" and "3rd vec"
6276 ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
6277 "3rd vec" and "1st vec"
6278 (16 19 22) (17 20 23) |(18 21) _ ( 0 3 6) ( 1 4 7)| ( 2 5)
6279 | New vectors |
6280
6281 So that now new vectors are:
6282
6283 1st vec: ( 2 5) ( 8 11 14) ( 9 12 15)
6284 2nd vec: (10 13) (16 19 22) (17 20 23)
6285 3rd vec: (18 21) ( 0 3 6) ( 1 4 7)
6286
6287 2nd step:
6288 shift right by 5 the concatenation of:
6289 "1st vec" and "3rd vec"
6290 ( 2 5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0 3 6)| ( 1 4 7)
6291 "2nd vec" and "1st vec"
6292 (10 13) (16 19 22) |(17 20 23) _ ( 2 5) ( 8 11 14)| ( 9 12 15)
6293 "3rd vec" and "2nd vec"
6294 (18 21) ( 0 3 6) |( 1 4 7) _ (10 13) (16 19 22)| (17 20 23)
6295 | New vectors |
6296
6297 So that now new vectors are:
6298
6299 1st vec: ( 9 12 15) (18 21) ( 0 3 6)
6300 2nd vec: (17 20 23) ( 2 5) ( 8 11 14)
6301 3rd vec: ( 1 4 7) (10 13) (16 19 22) READY
6302
6303 3rd step:
6304 shift right by 5 the concatenation of:
6305 "1st vec" and "1st vec"
6306 ( 9 12 15) (18 21) |( 0 3 6) _ ( 9 12 15) (18 21)| ( 0 3 6)
6307 shift right by 3 the concatenation of:
6308 "2nd vec" and "2nd vec"
6309 (17 20 23) |( 2 5) ( 8 11 14) _ (17 20 23)| ( 2 5) ( 8 11 14)
6310 | New vectors |
6311
6312 So that now all vectors are READY:
6313 1st vec: ( 0 3 6) ( 9 12 15) (18 21)
6314 2nd vec: ( 2 5) ( 8 11 14) (17 20 23)
6315 3rd vec: ( 1 4 7) (10 13) (16 19 22)
6316
6317 This algorithm is faster than one in vect_permute_load_chain if:
6318 1. "shift of a concatination" is faster than general permutation.
6319 This is usually so.
6320 2. The TARGET machine can't execute vector instructions in parallel.
6321 This is because each step of the algorithm depends on the previous one.
6322 The algorithm in vect_permute_load_chain is much more parallel.
6323
6324 The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.
6325 */
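
/* An illustrative, standalone sketch (not part of GCC) of one step of the
   power-of-two path of the function below (the pow2p_hwi (length) && vf > 4
   case), for LENGTH == 2 and 8-element vectors.  Names are hypothetical; the
   real code emits four VEC_PERM_EXPRs per pair using perm2_mask1,
   perm2_mask2, shift1_mask and select_mask.

     #define NELT 8

     static void
     shift_permute_pair_model (const int *a, const int *b, int *even, int *odd)
     {
       int ta[NELT], tb[NELT];
       // perm2_mask1: A's even elements followed by its odd elements.
       // perm2_mask2: B's odd elements followed by its even elements.
       for (int i = 0; i < NELT / 2; i++)
         {
           ta[i] = a[2 * i];
           ta[NELT / 2 + i] = a[2 * i + 1];
           tb[i] = b[2 * i + 1];
           tb[NELT / 2 + i] = b[2 * i];
         }
       for (int i = 0; i < NELT / 2; i++)
         {
           // shift1_mask: high half of TA then low half of TB, i.e. all
           // odd-numbered source elements.
           odd[i] = ta[NELT / 2 + i];
           odd[NELT / 2 + i] = tb[i];
           // select_mask: low half of TA then high half of TB, i.e. all
           // even-numbered source elements.
           even[i] = ta[i];
           even[NELT / 2 + i] = tb[NELT / 2 + i];
         }
     }
   */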
6326
6327 static bool
6328 vect_shift_permute_load_chain (vec_info *vinfo, vec<tree> dr_chain,
6329 unsigned int length,
6330 stmt_vec_info stmt_info,
6331 gimple_stmt_iterator *gsi,
6332 vec<tree> *result_chain)
6333 {
6334 tree vect[3], vect_shift[3], data_ref, first_vect, second_vect;
6335 tree perm2_mask1, perm2_mask2, perm3_mask;
6336 tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask;
6337 gimple *perm_stmt;
6338
6339 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6340 unsigned int i;
6341 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
6342
6343 unsigned HOST_WIDE_INT nelt, vf;
6344 if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nelt)
6345 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&vf))
6346 /* Not supported for variable-length vectors. */
6347 return false;
6348
6349 vec_perm_builder sel (nelt, nelt, 1);
6350 sel.quick_grow (nelt);
6351
6352 result_chain->quick_grow (length);
6353 memcpy (result_chain->address (), dr_chain.address (),
6354 length * sizeof (tree));
6355
6356 if (pow2p_hwi (length) && vf > 4)
6357 {
6358 unsigned int j, log_length = exact_log2 (length);
6359 for (i = 0; i < nelt / 2; ++i)
6360 sel[i] = i * 2;
6361 for (i = 0; i < nelt / 2; ++i)
6362 sel[nelt / 2 + i] = i * 2 + 1;
6363 vec_perm_indices indices (sel, 2, nelt);
6364 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6365 {
6366 if (dump_enabled_p ())
6367 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6368 "shuffle of 2 fields structure is not \
6369 supported by target\n");
6370 return false;
6371 }
6372 perm2_mask1 = vect_gen_perm_mask_checked (vectype, indices);
6373
6374 for (i = 0; i < nelt / 2; ++i)
6375 sel[i] = i * 2 + 1;
6376 for (i = 0; i < nelt / 2; ++i)
6377 sel[nelt / 2 + i] = i * 2;
6378 indices.new_vector (sel, 2, nelt);
6379 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6380 {
6381 if (dump_enabled_p ())
6382 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6383 "shuffle of 2 fields structure is not \
6384 supported by target\n");
6385 return false;
6386 }
6387 perm2_mask2 = vect_gen_perm_mask_checked (vectype, indices);
6388
6389 /* Generating permutation constant to shift all elements.
6390 For vector length 8 it is {4 5 6 7 8 9 10 11}. */
6391 for (i = 0; i < nelt; i++)
6392 sel[i] = nelt / 2 + i;
6393 indices.new_vector (sel, 2, nelt);
6394 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6395 {
6396 if (dump_enabled_p ())
6397 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6398 "shift permutation is not supported by target\n");
6399 return false;
6400 }
6401 shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
6402
6403 /* Generating permutation constant to select vector from 2.
6404 For vector length 8 it is {0 1 2 3 12 13 14 15}. */
6405 for (i = 0; i < nelt / 2; i++)
6406 sel[i] = i;
6407 for (i = nelt / 2; i < nelt; i++)
6408 sel[i] = nelt + i;
6409 indices.new_vector (sel, 2, nelt);
6410 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6411 {
6412 if (dump_enabled_p ())
6413 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6414 "select is not supported by target\n");
6415 return false;
6416 }
6417 select_mask = vect_gen_perm_mask_checked (vectype, indices);
6418
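/* Each pass below processes the chain pairwise: the SELECT permutation
gathers the even-indexed elements of a pair into (*result_chain)[j/2],
the SHIFT permutation gathers the odd-indexed elements into
(*result_chain)[j/2 + length/2], and the result is copied back into
DR_CHAIN for the next pass.  After LOG_LENGTH passes each output vector
holds the elements of a single field of the group.  */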
6419 for (i = 0; i < log_length; i++)
6420 {
6421 for (j = 0; j < length; j += 2)
6422 {
6423 first_vect = dr_chain[j];
6424 second_vect = dr_chain[j + 1];
6425
6426 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
6427 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6428 first_vect, first_vect,
6429 perm2_mask1);
6430 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6431 vect[0] = data_ref;
6432
6433 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
6434 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6435 second_vect, second_vect,
6436 perm2_mask2);
6437 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6438 vect[1] = data_ref;
6439
6440 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift");
6441 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6442 vect[0], vect[1], shift1_mask);
6443 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6444 (*result_chain)[j/2 + length/2] = data_ref;
6445
6446 data_ref = make_temp_ssa_name (vectype, NULL, "vect_select");
6447 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6448 vect[0], vect[1], select_mask);
6449 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6450 (*result_chain)[j/2] = data_ref;
6451 }
6452 memcpy (dr_chain.address (), result_chain->address (),
6453 length * sizeof (tree));
6454 }
6455 return true;
6456 }
6457 if (length == 3 && vf > 2)
6458 {
6459 unsigned int k = 0, l = 0;
6460
6461 /* Generating permutation constant to get all elements in the right order.
6462 For vector length 8 it is {0 3 6 1 4 7 2 5}. */
6463 for (i = 0; i < nelt; i++)
6464 {
6465 if (3 * k + (l % 3) >= nelt)
6466 {
6467 k = 0;
6468 l += (3 - (nelt % 3));
6469 }
6470 sel[i] = 3 * k + (l % 3);
6471 k++;
6472 }
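/* For vector length 8 and an input vector {a0 b0 c0 a1 b1 c1 a2 b2} the
selector built above is {0 3 6 1 4 7 2 5}, which gathers the elements
field by field: {a0 a1 a2 b0 b1 b2 c0 c1}.  */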
6473 vec_perm_indices indices (sel, 2, nelt);
6474 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6475 {
6476 if (dump_enabled_p ())
6477 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6478 "shuffle of 3 fields structure is not "
6479 "supported by target\n");
6480 return false;
6481 }
6482 perm3_mask = vect_gen_perm_mask_checked (vectype, indices);
6483
6484 /* Generating permutation constant to shift all elements.
6485 For vector length 8 it is {6 7 8 9 10 11 12 13}. */
6486 for (i = 0; i < nelt; i++)
6487 sel[i] = 2 * (nelt / 3) + (nelt % 3) + i;
6488 indices.new_vector (sel, 2, nelt);
6489 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6490 {
6491 if (dump_enabled_p ())
6492 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6493 "shift permutation is not supported by target\n");
6494 return false;
6495 }
6496 shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
6497
6498 /* Generating permutation constant to shift all elements.
6499 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
6500 for (i = 0; i < nelt; i++)
6501 sel[i] = 2 * (nelt / 3) + 1 + i;
6502 indices.new_vector (sel, 2, nelt);
6503 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6504 {
6505 if (dump_enabled_p ())
6506 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6507 "shift permutation is not supported by target\n");
6508 return false;
6509 }
6510 shift2_mask = vect_gen_perm_mask_checked (vectype, indices);
6511
6512 /* Generating permutation constant to shift all elements.
6513 For vector length 8 it is {3 4 5 6 7 8 9 10}. */
6514 for (i = 0; i < nelt; i++)
6515 sel[i] = (nelt / 3) + (nelt % 3) / 2 + i;
6516 indices.new_vector (sel, 2, nelt);
6517 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6518 {
6519 if (dump_enabled_p ())
6520 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6521 "shift permutation is not supported by target\n");
6522 return false;
6523 }
6524 shift3_mask = vect_gen_perm_mask_checked (vectype, indices);
6525
6526 /* Generating permutation constant to shift all elements.
6527 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
6528 for (i = 0; i < nelt; i++)
6529 sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i;
6530 indices.new_vector (sel, 2, nelt);
6531 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6532 {
6533 if (dump_enabled_p ())
6534 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6535 "shift permutation is not supported by target\n");
6536 return false;
6537 }
6538 shift4_mask = vect_gen_perm_mask_checked (vectype, indices);
6539
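/* For illustration, with vector length 8 and fields a, b, c the three
input vectors are
{a0 b0 c0 a1 b1 c1 a2 b2} {c2 a3 b3 c3 a4 b4 c4 a5} {b5 c5 a6 b6 c6 a7 b7 c7}.
After the PERM3 shuffle they become
{a0 a1 a2 b0 b1 b2 c0 c1} {c2 c3 c4 a3 a4 a5 b3 b4} {b5 b6 b7 c5 c6 c7 a6 a7},
the SHIFT1 permutations regroup them as
{c0 c1 c2 c3 c4 a3 a4 a5} {b3 b4 b5 b6 b7 c5 c6 c7} {a6 a7 a0 a1 a2 b0 b1 b2},
and the SHIFT2 permutations produce rotated single-field vectors
{c5 c6 c7 c0 c1 c2 c3 c4} {a3 a4 a5 a6 a7 a0 a1 a2} {b0 b1 b2 b3 b4 b5 b6 b7}.
The final SHIFT3/SHIFT4 permutations rotate the c and a vectors into
element order, so RESULT_CHAIN ends up holding the a, b and c fields in
order.  */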
6540 for (k = 0; k < 3; k++)
6541 {
6542 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3");
6543 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6544 dr_chain[k], dr_chain[k],
6545 perm3_mask);
6546 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6547 vect[k] = data_ref;
6548 }
6549
6550 for (k = 0; k < 3; k++)
6551 {
6552 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1");
6553 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6554 vect[k % 3], vect[(k + 1) % 3],
6555 shift1_mask);
6556 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6557 vect_shift[k] = data_ref;
6558 }
6559
6560 for (k = 0; k < 3; k++)
6561 {
6562 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2");
6563 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6564 vect_shift[(4 - k) % 3],
6565 vect_shift[(3 - k) % 3],
6566 shift2_mask);
6567 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6568 vect[k] = data_ref;
6569 }
6570
6571 (*result_chain)[3 - (nelt % 3)] = vect[2];
6572
6573 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3");
6574 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[0],
6575 vect[0], shift3_mask);
6576 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6577 (*result_chain)[nelt % 3] = data_ref;
6578
6579 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4");
6580 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[1],
6581 vect[1], shift4_mask);
6582 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6583 (*result_chain)[0] = data_ref;
6584 return true;
6585 }
6586 return false;
6587 }
6588
6589 /* Function vect_transform_grouped_load.
6590
6591 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
6592 to perform their permutation and ascribe the resulting vectorized statements to
6593 the scalar statements.
6594 */
6595
6596 void
6597 vect_transform_grouped_load (vec_info *vinfo, stmt_vec_info stmt_info,
6598 vec<tree> dr_chain,
6599 int size, gimple_stmt_iterator *gsi)
6600 {
6601 machine_mode mode;
6602 vec<tree> result_chain = vNULL;
6603
6604 /* DR_CHAIN contains input data-refs that are a part of the interleaving.
6605 RESULT_CHAIN is the output of vect_permute_load_chain; it contains permuted
6606 vectors that are ready for vector computation.  */
6607 result_chain.create (size);
6608
6609 /* If the reassociation width for the vector type is 2 or greater, the target
6610 machine can execute 2 or more vector instructions in parallel.  Otherwise
6611 try to get the chain for the load group using vect_shift_permute_load_chain.  */
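/* For example, a power-of-two group size always uses vect_permute_load_chain,
while a group of 3 loads on a target whose reassociation width for
VEC_PERM_EXPR is 1 first tries the shift-based variant above.  */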
6612 mode = TYPE_MODE (STMT_VINFO_VECTYPE (stmt_info));
6613 if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1
6614 || pow2p_hwi (size)
6615 || !vect_shift_permute_load_chain (vinfo, dr_chain, size, stmt_info,
6616 gsi, &result_chain))
6617 vect_permute_load_chain (vinfo, dr_chain,
6618 size, stmt_info, gsi, &result_chain);
6619 vect_record_grouped_load_vectors (vinfo, stmt_info, result_chain);
6620 result_chain.release ();
6621 }
6622
6623 /* RESULT_CHAIN contains the output of a group of grouped loads that were
6624 generated as part of the vectorization of STMT_INFO. Assign the statement
6625 for each vector to the associated scalar statement. */
6626
6627 void
6628 vect_record_grouped_load_vectors (vec_info *, stmt_vec_info stmt_info,
6629 vec<tree> result_chain)
6630 {
6631 stmt_vec_info first_stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
6632 unsigned int i, gap_count;
6633 tree tmp_data_ref;
6634
6635 /* Put each permuted data-ref in the STMT_VINFO_VEC_STMTS of its scalar stmt.
6636 Since we scan the chain starting from its first node, their order
6637 corresponds to the order of data-refs in RESULT_CHAIN.  */
6638 stmt_vec_info next_stmt_info = first_stmt_info;
6639 gap_count = 1;
6640 FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
6641 {
6642 if (!next_stmt_info)
6643 break;
6644
6645 /* Skip the gaps.  Loads created for the gaps will be removed by the dead
6646 code elimination pass later.  No need to check for the first stmt in
6647 the group, since it always exists.
6648 DR_GROUP_GAP is the number of steps in elements from the previous
6649 access (if there is no gap DR_GROUP_GAP is 1). We skip loads that
6650 correspond to the gaps. */
6651 if (next_stmt_info != first_stmt_info
6652 && gap_count < DR_GROUP_GAP (next_stmt_info))
6653 {
6654 gap_count++;
6655 continue;
6656 }
6657
6658 /* ??? The following needs cleanup after the removal of
6659 DR_GROUP_SAME_DR_STMT. */
6660 if (next_stmt_info)
6661 {
6662 gimple *new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
6663 /* We assume that if VEC_STMTS is not empty, this is a case of multiple
6664 copies, and we put the new vector statement last.  */
6665 STMT_VINFO_VEC_STMTS (next_stmt_info).safe_push (new_stmt);
6666
6667 next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
6668 gap_count = 1;
6669 }
6670 }
6671 }
6672
6673 /* Function vect_can_force_dr_alignment_p.
6674
6675 Return whether the alignment of DECL can be forced to be aligned
6676 on an ALIGNMENT-bit boundary.  */
6677
6678 bool
6679 vect_can_force_dr_alignment_p (const_tree decl, poly_uint64 alignment)
6680 {
6681 if (!VAR_P (decl))
6682 return false;
6683
6684 if (decl_in_symtab_p (decl)
6685 && !symtab_node::get (decl)->can_increase_alignment_p ())
6686 return false;
6687
6688 if (TREE_STATIC (decl))
6689 return (known_le (alignment,
6690 (unsigned HOST_WIDE_INT) MAX_OFILE_ALIGNMENT));
6691 else
6692 return (known_le (alignment, (unsigned HOST_WIDE_INT) MAX_STACK_ALIGNMENT));
6693 }
6694
6695 /* Return whether the data reference DR_INFO is supported with respect to its
6696 alignment.  VECTYPE is the vector type used to access DR_INFO and
6697 MISALIGNMENT is the known misalignment of the access relative to that
6698 type, or DR_MISALIGNMENT_UNKNOWN if the misalignment is not known at
6699 compile time.  */
6700
6701 enum dr_alignment_support
6702 vect_supportable_dr_alignment (vec_info *vinfo, dr_vec_info *dr_info,
6703 tree vectype, int misalignment)
6704 {
6705 data_reference *dr = dr_info->dr;
6706 stmt_vec_info stmt_info = dr_info->stmt;
6707 machine_mode mode = TYPE_MODE (vectype);
6708 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
6709 class loop *vect_loop = NULL;
6710 bool nested_in_vect_loop = false;
6711
6712 if (misalignment == 0)
6713 return dr_aligned;
6714
6715 /* For now assume all conditional loads/stores support unaligned
6716 access without any special code. */
6717 if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
6718 if (gimple_call_internal_p (stmt)
6719 && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
6720 || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
6721 return dr_unaligned_supported;
6722
6723 if (loop_vinfo)
6724 {
6725 vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
6726 nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt_info);
6727 }
6728
6729 /* Possibly unaligned access. */
6730
6731 /* We can choose between using the implicit realignment scheme (generating
6732 a misaligned_move stmt) and the explicit realignment scheme (generating
6733 aligned loads with a REALIGN_LOAD). There are two variants to the
6734 explicit realignment scheme: optimized, and unoptimized.
6735 We can optimize the realignment only if the step between consecutive
6736 vector loads is equal to the vector size. Since the vector memory
6737 accesses advance in steps of VS (Vector Size) in the vectorized loop, it
6738 is guaranteed that the misalignment amount remains the same throughout the
6739 execution of the vectorized loop. Therefore, we can create the
6740 "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
6741 at the loop preheader.
6742
6743 However, in the case of outer-loop vectorization, when vectorizing a
6744 memory access in the inner-loop nested within the LOOP that is now being
6745 vectorized, while it is guaranteed that the misalignment of the
6746 vectorized memory access will remain the same in different outer-loop
6747 iterations, it is *not* guaranteed that it will remain the same throughout
6748 the execution of the inner-loop. This is because the inner-loop advances
6749 with the original scalar step (and not in steps of VS). If the inner-loop
6750 step happens to be a multiple of VS, then the misalignment remains fixed
6751 and we can use the optimized realignment scheme. For example:
6752
6753 for (i=0; i<N; i++)
6754 for (j=0; j<M; j++)
6755 s += a[i+j];
6756
6757 When vectorizing the i-loop in the above example, the step between
6758 consecutive vector loads is 1, and so the misalignment does not remain
6759 fixed across the execution of the inner-loop, and the realignment cannot
6760 be optimized (as illustrated in the following pseudo vectorized loop):
6761
6762 for (i=0; i<N; i+=4)
6763 for (j=0; j<M; j++){
6764 vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
6765 // when j is {0,1,2,3,4,5,6,7,...} respectively.
6766 // (assuming that we start from an aligned address).
6767 }
6768
6769 We therefore have to use the unoptimized realignment scheme:
6770
6771 for (i=0; i<N; i+=4)
6772 for (j=k; j<M; j+=4)
6773 vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
6774 // that the misalignment of the initial address is
6775 // 0).
6776
6777 The loop can then be vectorized as follows:
6778
6779 for (k=0; k<4; k++){
6780 rt = get_realignment_token (&vp[k]);
6781 for (i=0; i<N; i+=4){
6782 v1 = vp[i+k];
6783 for (j=k; j<M; j+=4){
6784 v2 = vp[i+j+VS-1];
6785 va = REALIGN_LOAD <v1,v2,rt>;
6786 vs += va;
6787 v1 = v2;
6788 }
6789 }
6790 } */
6791
6792 if (DR_IS_READ (dr))
6793 {
6794 if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
6795 && (!targetm.vectorize.builtin_mask_for_load
6796 || targetm.vectorize.builtin_mask_for_load ()))
6797 {
6798 /* If we are doing SLP then the accesses need not have the
6799 same alignment, instead it depends on the SLP group size. */
6800 if (loop_vinfo
6801 && STMT_SLP_TYPE (stmt_info)
6802 && !multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
6803 * (DR_GROUP_SIZE
6804 (DR_GROUP_FIRST_ELEMENT (stmt_info))),
6805 TYPE_VECTOR_SUBPARTS (vectype)))
6806 ;
6807 else if (!loop_vinfo
6808 || (nested_in_vect_loop
6809 && maybe_ne (TREE_INT_CST_LOW (DR_STEP (dr)),
6810 GET_MODE_SIZE (TYPE_MODE (vectype)))))
6811 return dr_explicit_realign;
6812 else
6813 return dr_explicit_realign_optimized;
6814 }
6815 }
6816
6817 bool is_packed = false;
6818 tree type = TREE_TYPE (DR_REF (dr));
6819 if (misalignment == DR_MISALIGNMENT_UNKNOWN)
6820 is_packed = not_size_aligned (DR_REF (dr));
6821 if (targetm.vectorize.support_vector_misalignment (mode, type, misalignment,
6822 is_packed))
6823 return dr_unaligned_supported;
6824
6825 /* Unsupported. */
6826 return dr_unaligned_unsupported;
6827 }
6828