1 /* Data References Analysis and Manipulation Utilities for Vectorization. 2 Copyright (C) 2003-2019 Free Software Foundation, Inc. 3 Contributed by Dorit Naishlos <dorit@il.ibm.com> 4 and Ira Rosen <irar@il.ibm.com> 5 6 This file is part of GCC. 7 8 GCC is free software; you can redistribute it and/or modify it under 9 the terms of the GNU General Public License as published by the Free 10 Software Foundation; either version 3, or (at your option) any later 11 version. 12 13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY 14 WARRANTY; without even the implied warranty of MERCHANTABILITY or 15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 16 for more details. 17 18 You should have received a copy of the GNU General Public License 19 along with GCC; see the file COPYING3. If not see 20 <http://www.gnu.org/licenses/>. */ 21 22 #include "config.h" 23 #include "system.h" 24 #include "coretypes.h" 25 #include "backend.h" 26 #include "target.h" 27 #include "rtl.h" 28 #include "tree.h" 29 #include "gimple.h" 30 #include "predict.h" 31 #include "memmodel.h" 32 #include "tm_p.h" 33 #include "ssa.h" 34 #include "optabs-tree.h" 35 #include "cgraph.h" 36 #include "dumpfile.h" 37 #include "alias.h" 38 #include "fold-const.h" 39 #include "stor-layout.h" 40 #include "tree-eh.h" 41 #include "gimplify.h" 42 #include "gimple-iterator.h" 43 #include "gimplify-me.h" 44 #include "tree-ssa-loop-ivopts.h" 45 #include "tree-ssa-loop-manip.h" 46 #include "tree-ssa-loop.h" 47 #include "cfgloop.h" 48 #include "tree-scalar-evolution.h" 49 #include "tree-vectorizer.h" 50 #include "expr.h" 51 #include "builtins.h" 52 #include "params.h" 53 #include "tree-cfg.h" 54 #include "tree-hash-traits.h" 55 #include "vec-perm-indices.h" 56 #include "internal-fn.h" 57 58 /* Return true if load- or store-lanes optab OPTAB is implemented for 59 COUNT vectors of type VECTYPE. NAME is the name of OPTAB. */ 60 61 static bool 62 vect_lanes_optab_supported_p (const char *name, convert_optab optab, 63 tree vectype, unsigned HOST_WIDE_INT count) 64 { 65 machine_mode mode, array_mode; 66 bool limit_p; 67 68 mode = TYPE_MODE (vectype); 69 if (!targetm.array_mode (mode, count).exists (&array_mode)) 70 { 71 poly_uint64 bits = count * GET_MODE_BITSIZE (mode); 72 limit_p = !targetm.array_mode_supported_p (mode, count); 73 if (!int_mode_for_size (bits, limit_p).exists (&array_mode)) 74 { 75 if (dump_enabled_p ()) 76 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 77 "no array mode for %s[%wu]\n", 78 GET_MODE_NAME (mode), count); 79 return false; 80 } 81 } 82 83 if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing) 84 { 85 if (dump_enabled_p ()) 86 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 87 "cannot use %s<%s><%s>\n", name, 88 GET_MODE_NAME (array_mode), GET_MODE_NAME (mode)); 89 return false; 90 } 91 92 if (dump_enabled_p ()) 93 dump_printf_loc (MSG_NOTE, vect_location, 94 "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode), 95 GET_MODE_NAME (mode)); 96 97 return true; 98 } 99 100 101 /* Return the smallest scalar part of STMT_INFO. 102 This is used to determine the vectype of the stmt. We generally set the 103 vectype according to the type of the result (lhs). For stmts whose 104 result-type is different than the type of the arguments (e.g., demotion, 105 promotion), vectype will be reset appropriately (later). Note that we have 106 to visit the smallest datatype in this function, because that determines the 107 VF. 
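   (As an illustration, assuming 128-bit vectors, a loop body such as
      int_c[i] = int_a[i] + (int) short_b[i];
   must be vectorized with VF = 8 so that a whole vector of the 16-bit
   elements of short_b is consumed per vector iteration; the int
   operations then use two vectors per iteration.)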
If the smallest datatype in the loop is present only as the rhs of a 108 promotion operation - we'd miss it. 109 Such a case, where a variable of this datatype does not appear in the lhs 110 anywhere in the loop, can only occur if it's an invariant: e.g.: 111 'int_x = (int) short_inv', which we'd expect to have been optimized away by 112 invariant motion. However, we cannot rely on invariant motion to always 113 take invariants out of the loop, and so in the case of promotion we also 114 have to check the rhs. 115 LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding 116 types. */ 117 118 tree 119 vect_get_smallest_scalar_type (stmt_vec_info stmt_info, 120 HOST_WIDE_INT *lhs_size_unit, 121 HOST_WIDE_INT *rhs_size_unit) 122 { 123 tree scalar_type = gimple_expr_type (stmt_info->stmt); 124 HOST_WIDE_INT lhs, rhs; 125 126 /* During the analysis phase, this function is called on arbitrary 127 statements that might not have scalar results. */ 128 if (!tree_fits_uhwi_p (TYPE_SIZE_UNIT (scalar_type))) 129 return scalar_type; 130 131 lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type)); 132 133 gassign *assign = dyn_cast <gassign *> (stmt_info->stmt); 134 if (assign 135 && (gimple_assign_cast_p (assign) 136 || gimple_assign_rhs_code (assign) == DOT_PROD_EXPR 137 || gimple_assign_rhs_code (assign) == WIDEN_SUM_EXPR 138 || gimple_assign_rhs_code (assign) == WIDEN_MULT_EXPR 139 || gimple_assign_rhs_code (assign) == WIDEN_LSHIFT_EXPR 140 || gimple_assign_rhs_code (assign) == FLOAT_EXPR)) 141 { 142 tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (assign)); 143 144 rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type)); 145 if (rhs < lhs) 146 scalar_type = rhs_type; 147 } 148 else if (gcall *call = dyn_cast <gcall *> (stmt_info->stmt)) 149 { 150 unsigned int i = 0; 151 if (gimple_call_internal_p (call)) 152 { 153 internal_fn ifn = gimple_call_internal_fn (call); 154 if (internal_load_fn_p (ifn) || internal_store_fn_p (ifn)) 155 /* gimple_expr_type already picked the type of the loaded 156 or stored data. */ 157 i = ~0U; 158 else if (internal_fn_mask_index (ifn) == 0) 159 i = 1; 160 } 161 if (i < gimple_call_num_args (call)) 162 { 163 tree rhs_type = TREE_TYPE (gimple_call_arg (call, i)); 164 if (tree_fits_uhwi_p (TYPE_SIZE_UNIT (rhs_type))) 165 { 166 rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type)); 167 if (rhs < lhs) 168 scalar_type = rhs_type; 169 } 170 } 171 } 172 173 *lhs_size_unit = lhs; 174 *rhs_size_unit = rhs; 175 return scalar_type; 176 } 177 178 179 /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be 180 tested at run-time. Return TRUE if DDR was successfully inserted. 181 Return false if versioning is not supported. */ 182 183 static opt_result 184 vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo) 185 { 186 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); 187 188 if ((unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS) == 0) 189 return opt_result::failure_at (vect_location, 190 "will not create alias checks, as" 191 " --param vect-max-version-for-alias-checks" 192 " == 0\n"); 193 194 opt_result res 195 = runtime_alias_check_p (ddr, loop, 196 optimize_loop_nest_for_speed_p (loop)); 197 if (!res) 198 return res; 199 200 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr); 201 return opt_result::success (); 202 } 203 204 /* Record that loop LOOP_VINFO needs to check that VALUE is nonzero. 
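   This is used, for instance, for the zero-step handling in
   vect_analyze_data_ref_dependence below: a dependence of distance zero
   is only harmless if the (runtime) step of an access such as a[i * s]
   is not zero, so the step indicator is recorded here and the loop is
   then versioned on it being nonzero.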
*/ 205 206 static void 207 vect_check_nonzero_value (loop_vec_info loop_vinfo, tree value) 208 { 209 vec<tree> checks = LOOP_VINFO_CHECK_NONZERO (loop_vinfo); 210 for (unsigned int i = 0; i < checks.length(); ++i) 211 if (checks[i] == value) 212 return; 213 214 if (dump_enabled_p ()) 215 dump_printf_loc (MSG_NOTE, vect_location, 216 "need run-time check that %T is nonzero\n", 217 value); 218 LOOP_VINFO_CHECK_NONZERO (loop_vinfo).safe_push (value); 219 } 220 221 /* Return true if we know that the order of vectorized DR_INFO_A and 222 vectorized DR_INFO_B will be the same as the order of DR_INFO_A and 223 DR_INFO_B. At least one of the accesses is a write. */ 224 225 static bool 226 vect_preserves_scalar_order_p (dr_vec_info *dr_info_a, dr_vec_info *dr_info_b) 227 { 228 stmt_vec_info stmtinfo_a = dr_info_a->stmt; 229 stmt_vec_info stmtinfo_b = dr_info_b->stmt; 230 231 /* Single statements are always kept in their original order. */ 232 if (!STMT_VINFO_GROUPED_ACCESS (stmtinfo_a) 233 && !STMT_VINFO_GROUPED_ACCESS (stmtinfo_b)) 234 return true; 235 236 /* STMT_A and STMT_B belong to overlapping groups. All loads in a 237 SLP group are emitted at the position of the last scalar load and 238 all loads in an interleaving group are emitted at the position 239 of the first scalar load. 240 Stores in a group are emitted at the position of the last scalar store. 241 Compute that position and check whether the resulting order matches 242 the current one. 243 We have not yet decided between SLP and interleaving so we have 244 to conservatively assume both. */ 245 stmt_vec_info il_a; 246 stmt_vec_info last_a = il_a = DR_GROUP_FIRST_ELEMENT (stmtinfo_a); 247 if (last_a) 248 { 249 for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (last_a); s; 250 s = DR_GROUP_NEXT_ELEMENT (s)) 251 last_a = get_later_stmt (last_a, s); 252 if (!DR_IS_WRITE (STMT_VINFO_DATA_REF (stmtinfo_a))) 253 { 254 for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_a); s; 255 s = DR_GROUP_NEXT_ELEMENT (s)) 256 if (get_later_stmt (il_a, s) == il_a) 257 il_a = s; 258 } 259 else 260 il_a = last_a; 261 } 262 else 263 last_a = il_a = stmtinfo_a; 264 stmt_vec_info il_b; 265 stmt_vec_info last_b = il_b = DR_GROUP_FIRST_ELEMENT (stmtinfo_b); 266 if (last_b) 267 { 268 for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (last_b); s; 269 s = DR_GROUP_NEXT_ELEMENT (s)) 270 last_b = get_later_stmt (last_b, s); 271 if (!DR_IS_WRITE (STMT_VINFO_DATA_REF (stmtinfo_b))) 272 { 273 for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_b); s; 274 s = DR_GROUP_NEXT_ELEMENT (s)) 275 if (get_later_stmt (il_b, s) == il_b) 276 il_b = s; 277 } 278 else 279 il_b = last_b; 280 } 281 else 282 last_b = il_b = stmtinfo_b; 283 bool a_after_b = (get_later_stmt (stmtinfo_a, stmtinfo_b) == stmtinfo_a); 284 return (/* SLP */ 285 (get_later_stmt (last_a, last_b) == last_a) == a_after_b 286 /* Interleaving */ 287 && (get_later_stmt (il_a, il_b) == il_a) == a_after_b 288 /* Mixed */ 289 && (get_later_stmt (il_a, last_b) == il_a) == a_after_b 290 && (get_later_stmt (last_a, il_b) == last_a) == a_after_b); 291 } 292 293 /* A subroutine of vect_analyze_data_ref_dependence. Handle 294 DDR_COULD_BE_INDEPENDENT_P ddr DDR that has a known set of dependence 295 distances. These distances are conservatively correct but they don't 296 reflect a guaranteed dependence. 297 298 Return true if this function does all the work necessary to avoid 299 an alias or false if the caller should use the dependence distances 300 to limit the vectorization factor in the usual way. 
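   A typical case (illustrative only) is a pair of accesses a[i] and b[i]
   whose bases might or might not be the same object: the recorded
   distances describe the dependence that would exist if the bases were
   equal, while a runtime alias check can show that they differ, in which
   case there is no dependence at all.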
LOOP_DEPTH is 301 the depth of the loop described by LOOP_VINFO and the other arguments 302 are as for vect_analyze_data_ref_dependence. */ 303 304 static bool 305 vect_analyze_possibly_independent_ddr (data_dependence_relation *ddr, 306 loop_vec_info loop_vinfo, 307 int loop_depth, unsigned int *max_vf) 308 { 309 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); 310 lambda_vector dist_v; 311 unsigned int i; 312 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v) 313 { 314 int dist = dist_v[loop_depth]; 315 if (dist != 0 && !(dist > 0 && DDR_REVERSED_P (ddr))) 316 { 317 /* If the user asserted safelen >= DIST consecutive iterations 318 can be executed concurrently, assume independence. 319 320 ??? An alternative would be to add the alias check even 321 in this case, and vectorize the fallback loop with the 322 maximum VF set to safelen. However, if the user has 323 explicitly given a length, it's less likely that that 324 would be a win. */ 325 if (loop->safelen >= 2 && abs_hwi (dist) <= loop->safelen) 326 { 327 if ((unsigned int) loop->safelen < *max_vf) 328 *max_vf = loop->safelen; 329 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false; 330 continue; 331 } 332 333 /* For dependence distances of 2 or more, we have the option 334 of limiting VF or checking for an alias at runtime. 335 Prefer to check at runtime if we can, to avoid limiting 336 the VF unnecessarily when the bases are in fact independent. 337 338 Note that the alias checks will be removed if the VF ends up 339 being small enough. */ 340 dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (DDR_A (ddr)); 341 dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (DDR_B (ddr)); 342 return (!STMT_VINFO_GATHER_SCATTER_P (dr_info_a->stmt) 343 && !STMT_VINFO_GATHER_SCATTER_P (dr_info_b->stmt) 344 && vect_mark_for_runtime_alias_test (ddr, loop_vinfo)); 345 } 346 } 347 return true; 348 } 349 350 351 /* Function vect_analyze_data_ref_dependence. 352 353 FIXME: I needed to change the sense of the returned flag. 354 355 Return FALSE if there (might) exist a dependence between a memory-reference 356 DRA and a memory-reference DRB. When versioning for alias may check a 357 dependence at run-time, return TRUE. Adjust *MAX_VF according to 358 the data dependence. */ 359 360 static opt_result 361 vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr, 362 loop_vec_info loop_vinfo, 363 unsigned int *max_vf) 364 { 365 unsigned int i; 366 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); 367 struct data_reference *dra = DDR_A (ddr); 368 struct data_reference *drb = DDR_B (ddr); 369 dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (dra); 370 dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (drb); 371 stmt_vec_info stmtinfo_a = dr_info_a->stmt; 372 stmt_vec_info stmtinfo_b = dr_info_b->stmt; 373 lambda_vector dist_v; 374 unsigned int loop_depth; 375 376 /* In loop analysis all data references should be vectorizable. */ 377 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a) 378 || !STMT_VINFO_VECTORIZABLE (stmtinfo_b)) 379 gcc_unreachable (); 380 381 /* Independent data accesses. */ 382 if (DDR_ARE_DEPENDENT (ddr) == chrec_known) 383 return opt_result::success (); 384 385 if (dra == drb 386 || (DR_IS_READ (dra) && DR_IS_READ (drb))) 387 return opt_result::success (); 388 389 /* We do not have to consider dependences between accesses that belong 390 to the same group, unless the stride could be smaller than the 391 group size. 
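     For example, the members of a constant-stride group
     { a[4*i], a[4*i+1], a[4*i+2], a[4*i+3] } can never overlap each
     other, but if the stride is a runtime value s, a group such as
     { a[s*i], a[s*i+1], a[s*i+2], a[s*i+3] } may overlap itself across
     iterations (e.g. when s == 2), so the dependence still has to be
     considered.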
*/ 392 if (DR_GROUP_FIRST_ELEMENT (stmtinfo_a) 393 && (DR_GROUP_FIRST_ELEMENT (stmtinfo_a) 394 == DR_GROUP_FIRST_ELEMENT (stmtinfo_b)) 395 && !STMT_VINFO_STRIDED_P (stmtinfo_a)) 396 return opt_result::success (); 397 398 /* Even if we have an anti-dependence then, as the vectorized loop covers at 399 least two scalar iterations, there is always also a true dependence. 400 As the vectorizer does not re-order loads and stores we can ignore 401 the anti-dependence if TBAA can disambiguate both DRs similar to the 402 case with known negative distance anti-dependences (positive 403 distance anti-dependences would violate TBAA constraints). */ 404 if (((DR_IS_READ (dra) && DR_IS_WRITE (drb)) 405 || (DR_IS_WRITE (dra) && DR_IS_READ (drb))) 406 && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)), 407 get_alias_set (DR_REF (drb)))) 408 return opt_result::success (); 409 410 /* Unknown data dependence. */ 411 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know) 412 { 413 /* If user asserted safelen consecutive iterations can be 414 executed concurrently, assume independence. */ 415 if (loop->safelen >= 2) 416 { 417 if ((unsigned int) loop->safelen < *max_vf) 418 *max_vf = loop->safelen; 419 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false; 420 return opt_result::success (); 421 } 422 423 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a) 424 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b)) 425 return opt_result::failure_at 426 (stmtinfo_a->stmt, 427 "versioning for alias not supported for: " 428 "can't determine dependence between %T and %T\n", 429 DR_REF (dra), DR_REF (drb)); 430 431 if (dump_enabled_p ()) 432 dump_printf_loc (MSG_MISSED_OPTIMIZATION, stmtinfo_a->stmt, 433 "versioning for alias required: " 434 "can't determine dependence between %T and %T\n", 435 DR_REF (dra), DR_REF (drb)); 436 437 /* Add to list of ddrs that need to be tested at run-time. */ 438 return vect_mark_for_runtime_alias_test (ddr, loop_vinfo); 439 } 440 441 /* Known data dependence. */ 442 if (DDR_NUM_DIST_VECTS (ddr) == 0) 443 { 444 /* If user asserted safelen consecutive iterations can be 445 executed concurrently, assume independence. */ 446 if (loop->safelen >= 2) 447 { 448 if ((unsigned int) loop->safelen < *max_vf) 449 *max_vf = loop->safelen; 450 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false; 451 return opt_result::success (); 452 } 453 454 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a) 455 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b)) 456 return opt_result::failure_at 457 (stmtinfo_a->stmt, 458 "versioning for alias not supported for: " 459 "bad dist vector for %T and %T\n", 460 DR_REF (dra), DR_REF (drb)); 461 462 if (dump_enabled_p ()) 463 dump_printf_loc (MSG_MISSED_OPTIMIZATION, stmtinfo_a->stmt, 464 "versioning for alias required: " 465 "bad dist vector for %T and %T\n", 466 DR_REF (dra), DR_REF (drb)); 467 /* Add to list of ddrs that need to be tested at run-time. 
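         The check that is eventually emitted is, conceptually, a
         segment-overlap test such as (a + len_a <= b || b + len_b <= a)
         on the two access ranges, and the loop is versioned on its
         result (a sketch only; the exact form is created when the alias
         checks are lowered).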
*/ 468 return vect_mark_for_runtime_alias_test (ddr, loop_vinfo); 469 } 470 471 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr)); 472 473 if (DDR_COULD_BE_INDEPENDENT_P (ddr) 474 && vect_analyze_possibly_independent_ddr (ddr, loop_vinfo, 475 loop_depth, max_vf)) 476 return opt_result::success (); 477 478 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v) 479 { 480 int dist = dist_v[loop_depth]; 481 482 if (dump_enabled_p ()) 483 dump_printf_loc (MSG_NOTE, vect_location, 484 "dependence distance = %d.\n", dist); 485 486 if (dist == 0) 487 { 488 if (dump_enabled_p ()) 489 dump_printf_loc (MSG_NOTE, vect_location, 490 "dependence distance == 0 between %T and %T\n", 491 DR_REF (dra), DR_REF (drb)); 492 493 /* When we perform grouped accesses and perform implicit CSE 494 by detecting equal accesses and doing disambiguation with 495 runtime alias tests like for 496 .. = a[i]; 497 .. = a[i+1]; 498 a[i] = ..; 499 a[i+1] = ..; 500 *p = ..; 501 .. = a[i]; 502 .. = a[i+1]; 503 where we will end up loading { a[i], a[i+1] } once, make 504 sure that inserting group loads before the first load and 505 stores after the last store will do the right thing. 506 Similar for groups like 507 a[i] = ...; 508 ... = a[i]; 509 a[i+1] = ...; 510 where loads from the group interleave with the store. */ 511 if (!vect_preserves_scalar_order_p (dr_info_a, dr_info_b)) 512 return opt_result::failure_at (stmtinfo_a->stmt, 513 "READ_WRITE dependence" 514 " in interleaving.\n"); 515 516 if (loop->safelen < 2) 517 { 518 tree indicator = dr_zero_step_indicator (dra); 519 if (!indicator || integer_zerop (indicator)) 520 return opt_result::failure_at (stmtinfo_a->stmt, 521 "access also has a zero step\n"); 522 else if (TREE_CODE (indicator) != INTEGER_CST) 523 vect_check_nonzero_value (loop_vinfo, indicator); 524 } 525 continue; 526 } 527 528 if (dist > 0 && DDR_REVERSED_P (ddr)) 529 { 530 /* If DDR_REVERSED_P the order of the data-refs in DDR was 531 reversed (to make distance vector positive), and the actual 532 distance is negative. */ 533 if (dump_enabled_p ()) 534 dump_printf_loc (MSG_NOTE, vect_location, 535 "dependence distance negative.\n"); 536 /* When doing outer loop vectorization, we need to check if there is 537 a backward dependence at the inner loop level if the dependence 538 at the outer loop is reversed. See PR81740. */ 539 if (nested_in_vect_loop_p (loop, stmtinfo_a) 540 || nested_in_vect_loop_p (loop, stmtinfo_b)) 541 { 542 unsigned inner_depth = index_in_loop_nest (loop->inner->num, 543 DDR_LOOP_NEST (ddr)); 544 if (dist_v[inner_depth] < 0) 545 return opt_result::failure_at (stmtinfo_a->stmt, 546 "not vectorized, dependence " 547 "between data-refs %T and %T\n", 548 DR_REF (dra), DR_REF (drb)); 549 } 550 /* Record a negative dependence distance to later limit the 551 amount of stmt copying / unrolling we can perform. 552 Only need to handle read-after-write dependence. */ 553 if (DR_IS_READ (drb) 554 && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0 555 || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist)) 556 STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist; 557 continue; 558 } 559 560 unsigned int abs_dist = abs (dist); 561 if (abs_dist >= 2 && abs_dist < *max_vf) 562 { 563 /* The dependence distance requires reduction of the maximal 564 vectorization factor. 
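             For example, for a loop body a[i + 3] = a[i] + 1 the distance
             is 3; with VF > 3 a single vector iteration would read
             elements that it has not yet stored, so *max_vf is clamped
             to 3 here.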
*/ 565 *max_vf = abs_dist; 566 if (dump_enabled_p ()) 567 dump_printf_loc (MSG_NOTE, vect_location, 568 "adjusting maximal vectorization factor to %i\n", 569 *max_vf); 570 } 571 572 if (abs_dist >= *max_vf) 573 { 574 /* Dependence distance does not create dependence, as far as 575 vectorization is concerned, in this case. */ 576 if (dump_enabled_p ()) 577 dump_printf_loc (MSG_NOTE, vect_location, 578 "dependence distance >= VF.\n"); 579 continue; 580 } 581 582 return opt_result::failure_at (stmtinfo_a->stmt, 583 "not vectorized, possible dependence " 584 "between data-refs %T and %T\n", 585 DR_REF (dra), DR_REF (drb)); 586 } 587 588 return opt_result::success (); 589 } 590 591 /* Function vect_analyze_data_ref_dependences. 592 593 Examine all the data references in the loop, and make sure there do not 594 exist any data dependences between them. Set *MAX_VF according to 595 the maximum vectorization factor the data dependences allow. */ 596 597 opt_result 598 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo, 599 unsigned int *max_vf) 600 { 601 unsigned int i; 602 struct data_dependence_relation *ddr; 603 604 DUMP_VECT_SCOPE ("vect_analyze_data_ref_dependences"); 605 606 if (!LOOP_VINFO_DDRS (loop_vinfo).exists ()) 607 { 608 LOOP_VINFO_DDRS (loop_vinfo) 609 .create (LOOP_VINFO_DATAREFS (loop_vinfo).length () 610 * LOOP_VINFO_DATAREFS (loop_vinfo).length ()); 611 /* We need read-read dependences to compute 612 STMT_VINFO_SAME_ALIGN_REFS. */ 613 bool res = compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo), 614 &LOOP_VINFO_DDRS (loop_vinfo), 615 LOOP_VINFO_LOOP_NEST (loop_vinfo), 616 true); 617 gcc_assert (res); 618 } 619 620 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true; 621 622 /* For epilogues we either have no aliases or alias versioning 623 was applied to original loop. Therefore we may just get max_vf 624 using VF of original loop. */ 625 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo)) 626 *max_vf = LOOP_VINFO_ORIG_MAX_VECT_FACTOR (loop_vinfo); 627 else 628 FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr) 629 { 630 opt_result res 631 = vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf); 632 if (!res) 633 return res; 634 } 635 636 return opt_result::success (); 637 } 638 639 640 /* Function vect_slp_analyze_data_ref_dependence. 641 642 Return TRUE if there (might) exist a dependence between a memory-reference 643 DRA and a memory-reference DRB for VINFO. When versioning for alias 644 may check a dependence at run-time, return FALSE. Adjust *MAX_VF 645 according to the data dependence. */ 646 647 static bool 648 vect_slp_analyze_data_ref_dependence (vec_info *vinfo, 649 struct data_dependence_relation *ddr) 650 { 651 struct data_reference *dra = DDR_A (ddr); 652 struct data_reference *drb = DDR_B (ddr); 653 dr_vec_info *dr_info_a = vinfo->lookup_dr (dra); 654 dr_vec_info *dr_info_b = vinfo->lookup_dr (drb); 655 656 /* We need to check dependences of statements marked as unvectorizable 657 as well, they still can prohibit vectorization. */ 658 659 /* Independent data accesses. */ 660 if (DDR_ARE_DEPENDENT (ddr) == chrec_known) 661 return false; 662 663 if (dra == drb) 664 return false; 665 666 /* Read-read is OK. */ 667 if (DR_IS_READ (dra) && DR_IS_READ (drb)) 668 return false; 669 670 /* If dra and drb are part of the same interleaving chain consider 671 them independent. 
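     For example, the two stores a[2*i] = x and a[2*i+1] = y of one SLP
     group share the same DR_GROUP_FIRST_ELEMENT and are emitted together
     when the group is vectorized, so no dependence between them needs to
     be checked here.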
 */
  if (STMT_VINFO_GROUPED_ACCESS (dr_info_a->stmt)
      && (DR_GROUP_FIRST_ELEMENT (dr_info_a->stmt)
          == DR_GROUP_FIRST_ELEMENT (dr_info_b->stmt)))
    return false;

  /* Unknown data dependence.  */
  if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "can't determine dependence between %T and %T\n",
                         DR_REF (dra), DR_REF (drb));
    }
  else if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
                     "determined dependence between %T and %T\n",
                     DR_REF (dra), DR_REF (drb));

  return true;
}


/* Analyze dependences involved in the transform of SLP NODE.  STORES
   contain the vector of scalar stores of this instance if we are
   disambiguating the loads.  */

static bool
vect_slp_analyze_node_dependences (slp_instance instance, slp_tree node,
                                   vec<stmt_vec_info> stores,
                                   stmt_vec_info last_store_info)
{
  /* This walks over all stmts involved in the SLP load/store done
     in NODE verifying we can sink them up to the last stmt in the
     group.  */
  stmt_vec_info last_access_info = vect_find_last_scalar_stmt_in_slp (node);
  vec_info *vinfo = last_access_info->vinfo;
  for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
    {
      stmt_vec_info access_info = SLP_TREE_SCALAR_STMTS (node)[k];
      if (access_info == last_access_info)
        continue;
      data_reference *dr_a = STMT_VINFO_DATA_REF (access_info);
      ao_ref ref;
      bool ref_initialized_p = false;
      for (gimple_stmt_iterator gsi = gsi_for_stmt (access_info->stmt);
           gsi_stmt (gsi) != last_access_info->stmt; gsi_next (&gsi))
        {
          gimple *stmt = gsi_stmt (gsi);
          if (! gimple_vuse (stmt)
              || (DR_IS_READ (dr_a) && ! gimple_vdef (stmt)))
            continue;

          /* If we couldn't record a (single) data reference for this
             stmt we have to resort to the alias oracle.  */
          stmt_vec_info stmt_info = vinfo->lookup_stmt (stmt);
          data_reference *dr_b = STMT_VINFO_DATA_REF (stmt_info);
          if (!dr_b)
            {
              /* We are moving a store or sinking a load - this means
                 we cannot use TBAA for disambiguation.  */
              if (!ref_initialized_p)
                ao_ref_init (&ref, DR_REF (dr_a));
              if (stmt_may_clobber_ref_p_1 (stmt, &ref, false)
                  || ref_maybe_used_by_stmt_p (stmt, &ref, false))
                return false;
              continue;
            }

          bool dependent = false;
          /* If we run into a store of this same instance (we've just
             marked those) then delay dependence checking until we run
             into the last store because this is where it will have
             been sunk to (and we verify if we can do that as well).  */
          if (gimple_visited_p (stmt))
            {
              if (stmt_info != last_store_info)
                continue;
              unsigned i;
              stmt_vec_info store_info;
              FOR_EACH_VEC_ELT (stores, i, store_info)
                {
                  data_reference *store_dr = STMT_VINFO_DATA_REF (store_info);
                  ddr_p ddr = initialize_data_dependence_relation
                                (dr_a, store_dr, vNULL);
                  dependent
                    = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
                  free_dependence_relation (ddr);
                  if (dependent)
                    break;
                }
            }
          else
            {
              ddr_p ddr = initialize_data_dependence_relation (dr_a,
                                                               dr_b, vNULL);
              dependent = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
              free_dependence_relation (ddr);
            }
          if (dependent)
            return false;
        }
    }
  return true;
}


/* Function vect_slp_analyze_instance_dependence.

   Examine all the data references in the SLP instance, and make sure
   there do not exist any data dependences that would prevent moving the
   loads and stores of the instance to the vectorized statement insert
   location.  Return true if that is the case.  */

bool
vect_slp_analyze_instance_dependence (slp_instance instance)
{
  DUMP_VECT_SCOPE ("vect_slp_analyze_instance_dependence");

  /* The stores of this instance are at the root of the SLP tree.  */
  slp_tree store = SLP_INSTANCE_TREE (instance);
  if (! STMT_VINFO_DATA_REF (SLP_TREE_SCALAR_STMTS (store)[0]))
    store = NULL;

  /* Verify we can sink stores to the vectorized stmt insert location.  */
  stmt_vec_info last_store_info = NULL;
  if (store)
    {
      if (! vect_slp_analyze_node_dependences (instance, store, vNULL, NULL))
        return false;

      /* Mark stores in this instance and remember the last one.  */
      last_store_info = vect_find_last_scalar_stmt_in_slp (store);
      for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
        gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k]->stmt, true);
    }

  bool res = true;

  /* Verify we can sink loads to the vectorized stmt insert location,
     special-casing stores of this instance.  */
  slp_tree load;
  unsigned int i;
  FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load)
    if (! vect_slp_analyze_node_dependences (instance, load,
                                             store
                                             ? SLP_TREE_SCALAR_STMTS (store)
                                             : vNULL, last_store_info))
      {
        res = false;
        break;
      }

  /* Unset the visited flag.  */
  if (store)
    for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
      gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k]->stmt, false);

  return res;
}

/* Record the base alignment guarantee given by DRB, which occurs
   in STMT_INFO.  */

static void
vect_record_base_alignment (stmt_vec_info stmt_info,
                            innermost_loop_behavior *drb)
{
  vec_info *vinfo = stmt_info->vinfo;
  bool existed;
  innermost_loop_behavior *&entry
    = vinfo->base_alignments.get_or_insert (drb->base_address, &existed);
  if (!existed || entry->base_alignment < drb->base_alignment)
    {
      entry = drb;
      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location,
                         "recording new base alignment for %T\n"
                         " alignment: %d\n"
                         " misalignment: %d\n"
                         " based on: %G",
                         drb->base_address,
                         drb->base_alignment,
                         drb->base_misalignment,
                         stmt_info->stmt);
    }
}

/* If the region we're going to vectorize is reached, all unconditional
   data references occur at least once.  We can therefore pool the base
   alignment guarantees from each unconditional reference.  Do this by
   going through all the data references in VINFO and checking whether
   the containing statement makes the reference unconditionally.  If so,
   record the alignment of the base address in VINFO so that it can be
   used for all other references with the same base.  */

void
vect_record_base_alignments (vec_info *vinfo)
{
  loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
  struct loop *loop = loop_vinfo ?
LOOP_VINFO_LOOP (loop_vinfo) : NULL; 871 data_reference *dr; 872 unsigned int i; 873 FOR_EACH_VEC_ELT (vinfo->shared->datarefs, i, dr) 874 { 875 dr_vec_info *dr_info = vinfo->lookup_dr (dr); 876 stmt_vec_info stmt_info = dr_info->stmt; 877 if (!DR_IS_CONDITIONAL_IN_STMT (dr) 878 && STMT_VINFO_VECTORIZABLE (stmt_info) 879 && !STMT_VINFO_GATHER_SCATTER_P (stmt_info)) 880 { 881 vect_record_base_alignment (stmt_info, &DR_INNERMOST (dr)); 882 883 /* If DR is nested in the loop that is being vectorized, we can also 884 record the alignment of the base wrt the outer loop. */ 885 if (loop && nested_in_vect_loop_p (loop, stmt_info)) 886 vect_record_base_alignment 887 (stmt_info, &STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info)); 888 } 889 } 890 } 891 892 /* Return the target alignment for the vectorized form of DR_INFO. */ 893 894 static poly_uint64 895 vect_calculate_target_alignment (dr_vec_info *dr_info) 896 { 897 tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt); 898 return targetm.vectorize.preferred_vector_alignment (vectype); 899 } 900 901 /* Function vect_compute_data_ref_alignment 902 903 Compute the misalignment of the data reference DR_INFO. 904 905 Output: 906 1. DR_MISALIGNMENT (DR_INFO) is defined. 907 908 FOR NOW: No analysis is actually performed. Misalignment is calculated 909 only for trivial cases. TODO. */ 910 911 static void 912 vect_compute_data_ref_alignment (dr_vec_info *dr_info) 913 { 914 stmt_vec_info stmt_info = dr_info->stmt; 915 vec_base_alignments *base_alignments = &stmt_info->vinfo->base_alignments; 916 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); 917 struct loop *loop = NULL; 918 tree ref = DR_REF (dr_info->dr); 919 tree vectype = STMT_VINFO_VECTYPE (stmt_info); 920 921 if (dump_enabled_p ()) 922 dump_printf_loc (MSG_NOTE, vect_location, 923 "vect_compute_data_ref_alignment:\n"); 924 925 if (loop_vinfo) 926 loop = LOOP_VINFO_LOOP (loop_vinfo); 927 928 /* Initialize misalignment to unknown. */ 929 SET_DR_MISALIGNMENT (dr_info, DR_MISALIGNMENT_UNKNOWN); 930 931 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info)) 932 return; 933 934 innermost_loop_behavior *drb = vect_dr_behavior (dr_info); 935 bool step_preserves_misalignment_p; 936 937 poly_uint64 vector_alignment 938 = exact_div (vect_calculate_target_alignment (dr_info), BITS_PER_UNIT); 939 DR_TARGET_ALIGNMENT (dr_info) = vector_alignment; 940 941 unsigned HOST_WIDE_INT vect_align_c; 942 if (!vector_alignment.is_constant (&vect_align_c)) 943 return; 944 945 /* No step for BB vectorization. */ 946 if (!loop) 947 { 948 gcc_assert (integer_zerop (drb->step)); 949 step_preserves_misalignment_p = true; 950 } 951 952 /* In case the dataref is in an inner-loop of the loop that is being 953 vectorized (LOOP), we use the base and misalignment information 954 relative to the outer-loop (LOOP). This is ok only if the misalignment 955 stays the same throughout the execution of the inner-loop, which is why 956 we have to check that the stride of the dataref in the inner-loop evenly 957 divides by the vector alignment. 
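     As a concrete (made-up) example, with a 16-byte vector alignment an
     inner-loop step of 32 bytes leaves the misalignment seen by the
     outer loop unchanged on every inner iteration, whereas a step of
     24 bytes would change it from one inner iteration to the next.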
*/ 958 else if (nested_in_vect_loop_p (loop, stmt_info)) 959 { 960 step_preserves_misalignment_p 961 = (DR_STEP_ALIGNMENT (dr_info->dr) % vect_align_c) == 0; 962 963 if (dump_enabled_p ()) 964 { 965 if (step_preserves_misalignment_p) 966 dump_printf_loc (MSG_NOTE, vect_location, 967 "inner step divides the vector alignment.\n"); 968 else 969 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 970 "inner step doesn't divide the vector" 971 " alignment.\n"); 972 } 973 } 974 975 /* Similarly we can only use base and misalignment information relative to 976 an innermost loop if the misalignment stays the same throughout the 977 execution of the loop. As above, this is the case if the stride of 978 the dataref evenly divides by the alignment. */ 979 else 980 { 981 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo); 982 step_preserves_misalignment_p 983 = multiple_p (DR_STEP_ALIGNMENT (dr_info->dr) * vf, vect_align_c); 984 985 if (!step_preserves_misalignment_p && dump_enabled_p ()) 986 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 987 "step doesn't divide the vector alignment.\n"); 988 } 989 990 unsigned int base_alignment = drb->base_alignment; 991 unsigned int base_misalignment = drb->base_misalignment; 992 993 /* Calculate the maximum of the pooled base address alignment and the 994 alignment that we can compute for DR itself. */ 995 innermost_loop_behavior **entry = base_alignments->get (drb->base_address); 996 if (entry && base_alignment < (*entry)->base_alignment) 997 { 998 base_alignment = (*entry)->base_alignment; 999 base_misalignment = (*entry)->base_misalignment; 1000 } 1001 1002 if (drb->offset_alignment < vect_align_c 1003 || !step_preserves_misalignment_p 1004 /* We need to know whether the step wrt the vectorized loop is 1005 negative when computing the starting misalignment below. */ 1006 || TREE_CODE (drb->step) != INTEGER_CST) 1007 { 1008 if (dump_enabled_p ()) 1009 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 1010 "Unknown alignment for access: %T\n", ref); 1011 return; 1012 } 1013 1014 if (base_alignment < vect_align_c) 1015 { 1016 unsigned int max_alignment; 1017 tree base = get_base_for_alignment (drb->base_address, &max_alignment); 1018 if (max_alignment < vect_align_c 1019 || !vect_can_force_dr_alignment_p (base, 1020 vect_align_c * BITS_PER_UNIT)) 1021 { 1022 if (dump_enabled_p ()) 1023 dump_printf_loc (MSG_NOTE, vect_location, 1024 "can't force alignment of ref: %T\n", ref); 1025 return; 1026 } 1027 1028 /* Force the alignment of the decl. 1029 NOTE: This is the only change to the code we make during 1030 the analysis phase, before deciding to vectorize the loop. */ 1031 if (dump_enabled_p ()) 1032 dump_printf_loc (MSG_NOTE, vect_location, 1033 "force alignment of %T\n", ref); 1034 1035 dr_info->base_decl = base; 1036 dr_info->base_misaligned = true; 1037 base_misalignment = 0; 1038 } 1039 poly_int64 misalignment 1040 = base_misalignment + wi::to_poly_offset (drb->init).force_shwi (); 1041 1042 /* If this is a backward running DR then first access in the larger 1043 vectype actually is N-1 elements before the address in the DR. 1044 Adjust misalign accordingly. */ 1045 if (tree_int_cst_sgn (drb->step) < 0) 1046 /* PLUS because STEP is negative. 
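       For instance, for a V4SI access with a step of -4 bytes the first
       element touched by the vector is 3 * 4 = 12 bytes before the
       scalar address in the DR, and adding (4 - 1) * -4 = -12 accounts
       for exactly that.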
 */
    misalignment += ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
                     * TREE_INT_CST_LOW (drb->step));

  unsigned int const_misalignment;
  if (!known_misalignment (misalignment, vect_align_c, &const_misalignment))
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "Non-constant misalignment for access: %T\n", ref);
      return;
    }

  SET_DR_MISALIGNMENT (dr_info, const_misalignment);

  if (dump_enabled_p ())
    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                     "misalign = %d bytes of ref %T\n",
                     DR_MISALIGNMENT (dr_info), ref);

  return;
}

/* Function vect_update_misalignment_for_peel.
   Sets DR_INFO's misalignment
   - to 0 if it has the same alignment as DR_PEEL_INFO,
   - to the misalignment computed using NPEEL if DR_INFO's misalignment
     is known,
   - to -1 (unknown) otherwise.

   DR_INFO - the data reference whose misalignment is to be adjusted.
   DR_PEEL_INFO - the data reference whose misalignment is being made
                  zero in the vector loop by the peel.
   NPEEL - the number of iterations in the peel loop if the misalignment
           of DR_PEEL_INFO is known at compile time.  */

static void
vect_update_misalignment_for_peel (dr_vec_info *dr_info,
                                   dr_vec_info *dr_peel_info, int npeel)
{
  unsigned int i;
  vec<dr_p> same_aligned_drs;
  struct data_reference *current_dr;
  stmt_vec_info peel_stmt_info = dr_peel_info->stmt;

  /* It can be assumed that if dr_info has the same alignment as dr_peel,
     it is aligned in the vector loop.  */
  same_aligned_drs = STMT_VINFO_SAME_ALIGN_REFS (peel_stmt_info);
  FOR_EACH_VEC_ELT (same_aligned_drs, i, current_dr)
    {
      if (current_dr != dr_info->dr)
        continue;
      gcc_assert (!known_alignment_for_access_p (dr_info)
                  || !known_alignment_for_access_p (dr_peel_info)
                  || (DR_MISALIGNMENT (dr_info)
                      == DR_MISALIGNMENT (dr_peel_info)));
      SET_DR_MISALIGNMENT (dr_info, 0);
      return;
    }

  unsigned HOST_WIDE_INT alignment;
  if (DR_TARGET_ALIGNMENT (dr_info).is_constant (&alignment)
      && known_alignment_for_access_p (dr_info)
      && known_alignment_for_access_p (dr_peel_info))
    {
      int misal = DR_MISALIGNMENT (dr_info);
      misal += npeel * TREE_INT_CST_LOW (DR_STEP (dr_info->dr));
      misal &= alignment - 1;
      SET_DR_MISALIGNMENT (dr_info, misal);
      return;
    }

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment " \
                     "to unknown (-1).\n");
  SET_DR_MISALIGNMENT (dr_info, DR_MISALIGNMENT_UNKNOWN);
}


/* Function verify_data_ref_alignment

   Return TRUE if DR_INFO can be handled with respect to alignment.  */

static opt_result
verify_data_ref_alignment (dr_vec_info *dr_info)
{
  enum dr_alignment_support supportable_dr_alignment
    = vect_supportable_dr_alignment (dr_info, false);
  if (!supportable_dr_alignment)
    return opt_result::failure_at
      (dr_info->stmt->stmt,
       DR_IS_READ (dr_info->dr)
       ?
"not vectorized: unsupported unaligned load: %T\n" 1138 : "not vectorized: unsupported unaligned store: %T\n", 1139 DR_REF (dr_info->dr)); 1140 1141 if (supportable_dr_alignment != dr_aligned && dump_enabled_p ()) 1142 dump_printf_loc (MSG_NOTE, vect_location, 1143 "Vectorizing an unaligned access.\n"); 1144 1145 return opt_result::success (); 1146 } 1147 1148 /* Function vect_verify_datarefs_alignment 1149 1150 Return TRUE if all data references in the loop can be 1151 handled with respect to alignment. */ 1152 1153 opt_result 1154 vect_verify_datarefs_alignment (loop_vec_info vinfo) 1155 { 1156 vec<data_reference_p> datarefs = vinfo->shared->datarefs; 1157 struct data_reference *dr; 1158 unsigned int i; 1159 1160 FOR_EACH_VEC_ELT (datarefs, i, dr) 1161 { 1162 dr_vec_info *dr_info = vinfo->lookup_dr (dr); 1163 stmt_vec_info stmt_info = dr_info->stmt; 1164 1165 if (!STMT_VINFO_RELEVANT_P (stmt_info)) 1166 continue; 1167 1168 /* For interleaving, only the alignment of the first access matters. */ 1169 if (STMT_VINFO_GROUPED_ACCESS (stmt_info) 1170 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info) 1171 continue; 1172 1173 /* Strided accesses perform only component accesses, alignment is 1174 irrelevant for them. */ 1175 if (STMT_VINFO_STRIDED_P (stmt_info) 1176 && !STMT_VINFO_GROUPED_ACCESS (stmt_info)) 1177 continue; 1178 1179 opt_result res = verify_data_ref_alignment (dr_info); 1180 if (!res) 1181 return res; 1182 } 1183 1184 return opt_result::success (); 1185 } 1186 1187 /* Given an memory reference EXP return whether its alignment is less 1188 than its size. */ 1189 1190 static bool 1191 not_size_aligned (tree exp) 1192 { 1193 if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp)))) 1194 return true; 1195 1196 return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp))) 1197 > get_object_alignment (exp)); 1198 } 1199 1200 /* Function vector_alignment_reachable_p 1201 1202 Return true if vector alignment for DR_INFO is reachable by peeling 1203 a few loop iterations. Return false otherwise. */ 1204 1205 static bool 1206 vector_alignment_reachable_p (dr_vec_info *dr_info) 1207 { 1208 stmt_vec_info stmt_info = dr_info->stmt; 1209 tree vectype = STMT_VINFO_VECTYPE (stmt_info); 1210 1211 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)) 1212 { 1213 /* For interleaved access we peel only if number of iterations in 1214 the prolog loop ({VF - misalignment}), is a multiple of the 1215 number of the interleaved accesses. */ 1216 int elem_size, mis_in_elements; 1217 1218 /* FORNOW: handle only known alignment. */ 1219 if (!known_alignment_for_access_p (dr_info)) 1220 return false; 1221 1222 poly_uint64 nelements = TYPE_VECTOR_SUBPARTS (vectype); 1223 poly_uint64 vector_size = GET_MODE_SIZE (TYPE_MODE (vectype)); 1224 elem_size = vector_element_size (vector_size, nelements); 1225 mis_in_elements = DR_MISALIGNMENT (dr_info) / elem_size; 1226 1227 if (!multiple_p (nelements - mis_in_elements, DR_GROUP_SIZE (stmt_info))) 1228 return false; 1229 } 1230 1231 /* If misalignment is known at the compile time then allow peeling 1232 only if natural alignment is reachable through peeling. */ 1233 if (known_alignment_for_access_p (dr_info) && !aligned_access_p (dr_info)) 1234 { 1235 HOST_WIDE_INT elmsize = 1236 int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype))); 1237 if (dump_enabled_p ()) 1238 { 1239 dump_printf_loc (MSG_NOTE, vect_location, 1240 "data size = %wd. 
misalignment = %d.\n", elmsize, 1241 DR_MISALIGNMENT (dr_info)); 1242 } 1243 if (DR_MISALIGNMENT (dr_info) % elmsize) 1244 { 1245 if (dump_enabled_p ()) 1246 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 1247 "data size does not divide the misalignment.\n"); 1248 return false; 1249 } 1250 } 1251 1252 if (!known_alignment_for_access_p (dr_info)) 1253 { 1254 tree type = TREE_TYPE (DR_REF (dr_info->dr)); 1255 bool is_packed = not_size_aligned (DR_REF (dr_info->dr)); 1256 if (dump_enabled_p ()) 1257 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 1258 "Unknown misalignment, %snaturally aligned\n", 1259 is_packed ? "not " : ""); 1260 return targetm.vectorize.vector_alignment_reachable (type, is_packed); 1261 } 1262 1263 return true; 1264 } 1265 1266 1267 /* Calculate the cost of the memory access represented by DR_INFO. */ 1268 1269 static void 1270 vect_get_data_access_cost (dr_vec_info *dr_info, 1271 unsigned int *inside_cost, 1272 unsigned int *outside_cost, 1273 stmt_vector_for_cost *body_cost_vec, 1274 stmt_vector_for_cost *prologue_cost_vec) 1275 { 1276 stmt_vec_info stmt_info = dr_info->stmt; 1277 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); 1278 int ncopies; 1279 1280 if (PURE_SLP_STMT (stmt_info)) 1281 ncopies = 1; 1282 else 1283 ncopies = vect_get_num_copies (loop_vinfo, STMT_VINFO_VECTYPE (stmt_info)); 1284 1285 if (DR_IS_READ (dr_info->dr)) 1286 vect_get_load_cost (stmt_info, ncopies, true, inside_cost, outside_cost, 1287 prologue_cost_vec, body_cost_vec, false); 1288 else 1289 vect_get_store_cost (stmt_info, ncopies, inside_cost, body_cost_vec); 1290 1291 if (dump_enabled_p ()) 1292 dump_printf_loc (MSG_NOTE, vect_location, 1293 "vect_get_data_access_cost: inside_cost = %d, " 1294 "outside_cost = %d.\n", *inside_cost, *outside_cost); 1295 } 1296 1297 1298 typedef struct _vect_peel_info 1299 { 1300 dr_vec_info *dr_info; 1301 int npeel; 1302 unsigned int count; 1303 } *vect_peel_info; 1304 1305 typedef struct _vect_peel_extended_info 1306 { 1307 struct _vect_peel_info peel_info; 1308 unsigned int inside_cost; 1309 unsigned int outside_cost; 1310 } *vect_peel_extended_info; 1311 1312 1313 /* Peeling hashtable helpers. */ 1314 1315 struct peel_info_hasher : free_ptr_hash <_vect_peel_info> 1316 { 1317 static inline hashval_t hash (const _vect_peel_info *); 1318 static inline bool equal (const _vect_peel_info *, const _vect_peel_info *); 1319 }; 1320 1321 inline hashval_t 1322 peel_info_hasher::hash (const _vect_peel_info *peel_info) 1323 { 1324 return (hashval_t) peel_info->npeel; 1325 } 1326 1327 inline bool 1328 peel_info_hasher::equal (const _vect_peel_info *a, const _vect_peel_info *b) 1329 { 1330 return (a->npeel == b->npeel); 1331 } 1332 1333 1334 /* Insert DR_INFO into peeling hash table with NPEEL as key. 
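   Entries with the same NPEEL are shared: if, say, three data references
   would all become aligned after peeling two scalar iterations, they end
   up in a single { npeel = 2, count = 3 } entry, and COUNT is later used
   to pick the peeling option that aligns the most accesses.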
*/ 1335 1336 static void 1337 vect_peeling_hash_insert (hash_table<peel_info_hasher> *peeling_htab, 1338 loop_vec_info loop_vinfo, dr_vec_info *dr_info, 1339 int npeel) 1340 { 1341 struct _vect_peel_info elem, *slot; 1342 _vect_peel_info **new_slot; 1343 bool supportable_dr_alignment 1344 = vect_supportable_dr_alignment (dr_info, true); 1345 1346 elem.npeel = npeel; 1347 slot = peeling_htab->find (&elem); 1348 if (slot) 1349 slot->count++; 1350 else 1351 { 1352 slot = XNEW (struct _vect_peel_info); 1353 slot->npeel = npeel; 1354 slot->dr_info = dr_info; 1355 slot->count = 1; 1356 new_slot = peeling_htab->find_slot (slot, INSERT); 1357 *new_slot = slot; 1358 } 1359 1360 if (!supportable_dr_alignment 1361 && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo))) 1362 slot->count += VECT_MAX_COST; 1363 } 1364 1365 1366 /* Traverse peeling hash table to find peeling option that aligns maximum 1367 number of data accesses. */ 1368 1369 int 1370 vect_peeling_hash_get_most_frequent (_vect_peel_info **slot, 1371 _vect_peel_extended_info *max) 1372 { 1373 vect_peel_info elem = *slot; 1374 1375 if (elem->count > max->peel_info.count 1376 || (elem->count == max->peel_info.count 1377 && max->peel_info.npeel > elem->npeel)) 1378 { 1379 max->peel_info.npeel = elem->npeel; 1380 max->peel_info.count = elem->count; 1381 max->peel_info.dr_info = elem->dr_info; 1382 } 1383 1384 return 1; 1385 } 1386 1387 /* Get the costs of peeling NPEEL iterations for LOOP_VINFO, checking 1388 data access costs for all data refs. If UNKNOWN_MISALIGNMENT is true, 1389 we assume DR0_INFO's misalignment will be zero after peeling. */ 1390 1391 static void 1392 vect_get_peeling_costs_all_drs (loop_vec_info loop_vinfo, 1393 dr_vec_info *dr0_info, 1394 unsigned int *inside_cost, 1395 unsigned int *outside_cost, 1396 stmt_vector_for_cost *body_cost_vec, 1397 stmt_vector_for_cost *prologue_cost_vec, 1398 unsigned int npeel, 1399 bool unknown_misalignment) 1400 { 1401 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo); 1402 unsigned i; 1403 data_reference *dr; 1404 1405 FOR_EACH_VEC_ELT (datarefs, i, dr) 1406 { 1407 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr); 1408 stmt_vec_info stmt_info = dr_info->stmt; 1409 if (!STMT_VINFO_RELEVANT_P (stmt_info)) 1410 continue; 1411 1412 /* For interleaving, only the alignment of the first access 1413 matters. */ 1414 if (STMT_VINFO_GROUPED_ACCESS (stmt_info) 1415 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info) 1416 continue; 1417 1418 /* Strided accesses perform only component accesses, alignment is 1419 irrelevant for them. */ 1420 if (STMT_VINFO_STRIDED_P (stmt_info) 1421 && !STMT_VINFO_GROUPED_ACCESS (stmt_info)) 1422 continue; 1423 1424 int save_misalignment; 1425 save_misalignment = DR_MISALIGNMENT (dr_info); 1426 if (npeel == 0) 1427 ; 1428 else if (unknown_misalignment && dr_info == dr0_info) 1429 SET_DR_MISALIGNMENT (dr_info, 0); 1430 else 1431 vect_update_misalignment_for_peel (dr_info, dr0_info, npeel); 1432 vect_get_data_access_cost (dr_info, inside_cost, outside_cost, 1433 body_cost_vec, prologue_cost_vec); 1434 SET_DR_MISALIGNMENT (dr_info, save_misalignment); 1435 } 1436 } 1437 1438 /* Traverse peeling hash table and calculate cost for each peeling option. 1439 Find the one with the lowest cost. 
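   Options are compared first on their inside (per-iteration) cost and
   only then on their outside (setup) cost, so e.g. { inside 10,
   outside 50 } is preferred over { inside 12, outside 0 }.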
*/ 1440 1441 int 1442 vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot, 1443 _vect_peel_extended_info *min) 1444 { 1445 vect_peel_info elem = *slot; 1446 int dummy; 1447 unsigned int inside_cost = 0, outside_cost = 0; 1448 stmt_vec_info stmt_info = elem->dr_info->stmt; 1449 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); 1450 stmt_vector_for_cost prologue_cost_vec, body_cost_vec, 1451 epilogue_cost_vec; 1452 1453 prologue_cost_vec.create (2); 1454 body_cost_vec.create (2); 1455 epilogue_cost_vec.create (2); 1456 1457 vect_get_peeling_costs_all_drs (loop_vinfo, elem->dr_info, &inside_cost, 1458 &outside_cost, &body_cost_vec, 1459 &prologue_cost_vec, elem->npeel, false); 1460 1461 body_cost_vec.release (); 1462 1463 outside_cost += vect_get_known_peeling_cost 1464 (loop_vinfo, elem->npeel, &dummy, 1465 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), 1466 &prologue_cost_vec, &epilogue_cost_vec); 1467 1468 /* Prologue and epilogue costs are added to the target model later. 1469 These costs depend only on the scalar iteration cost, the 1470 number of peeling iterations finally chosen, and the number of 1471 misaligned statements. So discard the information found here. */ 1472 prologue_cost_vec.release (); 1473 epilogue_cost_vec.release (); 1474 1475 if (inside_cost < min->inside_cost 1476 || (inside_cost == min->inside_cost 1477 && outside_cost < min->outside_cost)) 1478 { 1479 min->inside_cost = inside_cost; 1480 min->outside_cost = outside_cost; 1481 min->peel_info.dr_info = elem->dr_info; 1482 min->peel_info.npeel = elem->npeel; 1483 min->peel_info.count = elem->count; 1484 } 1485 1486 return 1; 1487 } 1488 1489 1490 /* Choose best peeling option by traversing peeling hash table and either 1491 choosing an option with the lowest cost (if cost model is enabled) or the 1492 option that aligns as many accesses as possible. */ 1493 1494 static struct _vect_peel_extended_info 1495 vect_peeling_hash_choose_best_peeling (hash_table<peel_info_hasher> *peeling_htab, 1496 loop_vec_info loop_vinfo) 1497 { 1498 struct _vect_peel_extended_info res; 1499 1500 res.peel_info.dr_info = NULL; 1501 1502 if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo))) 1503 { 1504 res.inside_cost = INT_MAX; 1505 res.outside_cost = INT_MAX; 1506 peeling_htab->traverse <_vect_peel_extended_info *, 1507 vect_peeling_hash_get_lowest_cost> (&res); 1508 } 1509 else 1510 { 1511 res.peel_info.count = 0; 1512 peeling_htab->traverse <_vect_peel_extended_info *, 1513 vect_peeling_hash_get_most_frequent> (&res); 1514 res.inside_cost = 0; 1515 res.outside_cost = 0; 1516 } 1517 1518 return res; 1519 } 1520 1521 /* Return true if the new peeling NPEEL is supported. */ 1522 1523 static bool 1524 vect_peeling_supportable (loop_vec_info loop_vinfo, dr_vec_info *dr0_info, 1525 unsigned npeel) 1526 { 1527 unsigned i; 1528 struct data_reference *dr = NULL; 1529 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo); 1530 enum dr_alignment_support supportable_dr_alignment; 1531 1532 /* Ensure that all data refs can be vectorized after the peel. */ 1533 FOR_EACH_VEC_ELT (datarefs, i, dr) 1534 { 1535 int save_misalignment; 1536 1537 if (dr == dr0_info->dr) 1538 continue; 1539 1540 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr); 1541 stmt_vec_info stmt_info = dr_info->stmt; 1542 /* For interleaving, only the alignment of the first access 1543 matters. 
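         (E.g. for a group accessing a[4*i] .. a[4*i+3] the vectorized
         loads or stores all start at the address of a[4*i], so it is
         enough to look at that leading access here.)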
*/ 1544 if (STMT_VINFO_GROUPED_ACCESS (stmt_info) 1545 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info) 1546 continue; 1547 1548 /* Strided accesses perform only component accesses, alignment is 1549 irrelevant for them. */ 1550 if (STMT_VINFO_STRIDED_P (stmt_info) 1551 && !STMT_VINFO_GROUPED_ACCESS (stmt_info)) 1552 continue; 1553 1554 save_misalignment = DR_MISALIGNMENT (dr_info); 1555 vect_update_misalignment_for_peel (dr_info, dr0_info, npeel); 1556 supportable_dr_alignment 1557 = vect_supportable_dr_alignment (dr_info, false); 1558 SET_DR_MISALIGNMENT (dr_info, save_misalignment); 1559 1560 if (!supportable_dr_alignment) 1561 return false; 1562 } 1563 1564 return true; 1565 } 1566 1567 /* Function vect_enhance_data_refs_alignment 1568 1569 This pass will use loop versioning and loop peeling in order to enhance 1570 the alignment of data references in the loop. 1571 1572 FOR NOW: we assume that whatever versioning/peeling takes place, only the 1573 original loop is to be vectorized. Any other loops that are created by 1574 the transformations performed in this pass - are not supposed to be 1575 vectorized. This restriction will be relaxed. 1576 1577 This pass will require a cost model to guide it whether to apply peeling 1578 or versioning or a combination of the two. For example, the scheme that 1579 intel uses when given a loop with several memory accesses, is as follows: 1580 choose one memory access ('p') which alignment you want to force by doing 1581 peeling. Then, either (1) generate a loop in which 'p' is aligned and all 1582 other accesses are not necessarily aligned, or (2) use loop versioning to 1583 generate one loop in which all accesses are aligned, and another loop in 1584 which only 'p' is necessarily aligned. 1585 1586 ("Automatic Intra-Register Vectorization for the Intel Architecture", 1587 Aart J.C. Bik, Milind Girkar, Paul M. Grey and Ximmin Tian, International 1588 Journal of Parallel Programming, Vol. 30, No. 2, April 2002.) 1589 1590 Devising a cost model is the most critical aspect of this work. It will 1591 guide us on which access to peel for, whether to use loop versioning, how 1592 many versions to create, etc. The cost model will probably consist of 1593 generic considerations as well as target specific considerations (on 1594 powerpc for example, misaligned stores are more painful than misaligned 1595 loads). 1596 1597 Here are the general steps involved in alignment enhancements: 1598 1599 -- original loop, before alignment analysis: 1600 for (i=0; i<N; i++){ 1601 x = q[i]; # DR_MISALIGNMENT(q) = unknown 1602 p[i] = y; # DR_MISALIGNMENT(p) = unknown 1603 } 1604 1605 -- After vect_compute_data_refs_alignment: 1606 for (i=0; i<N; i++){ 1607 x = q[i]; # DR_MISALIGNMENT(q) = 3 1608 p[i] = y; # DR_MISALIGNMENT(p) = unknown 1609 } 1610 1611 -- Possibility 1: we do loop versioning: 1612 if (p is aligned) { 1613 for (i=0; i<N; i++){ # loop 1A 1614 x = q[i]; # DR_MISALIGNMENT(q) = 3 1615 p[i] = y; # DR_MISALIGNMENT(p) = 0 1616 } 1617 } 1618 else { 1619 for (i=0; i<N; i++){ # loop 1B 1620 x = q[i]; # DR_MISALIGNMENT(q) = 3 1621 p[i] = y; # DR_MISALIGNMENT(p) = unaligned 1622 } 1623 } 1624 1625 -- Possibility 2: we do loop peeling: 1626 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized). 
1627 x = q[i]; 1628 p[i] = y; 1629 } 1630 for (i = 3; i < N; i++){ # loop 2A 1631 x = q[i]; # DR_MISALIGNMENT(q) = 0 1632 p[i] = y; # DR_MISALIGNMENT(p) = unknown 1633 } 1634 1635 -- Possibility 3: combination of loop peeling and versioning: 1636 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized). 1637 x = q[i]; 1638 p[i] = y; 1639 } 1640 if (p is aligned) { 1641 for (i = 3; i<N; i++){ # loop 3A 1642 x = q[i]; # DR_MISALIGNMENT(q) = 0 1643 p[i] = y; # DR_MISALIGNMENT(p) = 0 1644 } 1645 } 1646 else { 1647 for (i = 3; i<N; i++){ # loop 3B 1648 x = q[i]; # DR_MISALIGNMENT(q) = 0 1649 p[i] = y; # DR_MISALIGNMENT(p) = unaligned 1650 } 1651 } 1652 1653 These loops are later passed to loop_transform to be vectorized. The 1654 vectorizer will use the alignment information to guide the transformation 1655 (whether to generate regular loads/stores, or with special handling for 1656 misalignment). */ 1657 1658 opt_result 1659 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo) 1660 { 1661 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo); 1662 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); 1663 enum dr_alignment_support supportable_dr_alignment; 1664 dr_vec_info *first_store = NULL; 1665 dr_vec_info *dr0_info = NULL; 1666 struct data_reference *dr; 1667 unsigned int i, j; 1668 bool do_peeling = false; 1669 bool do_versioning = false; 1670 unsigned int npeel = 0; 1671 bool one_misalignment_known = false; 1672 bool one_misalignment_unknown = false; 1673 bool one_dr_unsupportable = false; 1674 dr_vec_info *unsupportable_dr_info = NULL; 1675 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo); 1676 unsigned possible_npeel_number = 1; 1677 tree vectype; 1678 unsigned int mis, same_align_drs_max = 0; 1679 hash_table<peel_info_hasher> peeling_htab (1); 1680 1681 DUMP_VECT_SCOPE ("vect_enhance_data_refs_alignment"); 1682 1683 /* Reset data so we can safely be called multiple times. */ 1684 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0); 1685 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = 0; 1686 1687 /* While cost model enhancements are expected in the future, the high level 1688 view of the code at this time is as follows: 1689 1690 A) If there is a misaligned access then see if peeling to align 1691 this access can make all data references satisfy 1692 vect_supportable_dr_alignment. If so, update data structures 1693 as needed and return true. 1694 1695 B) If peeling wasn't possible and there is a data reference with an 1696 unknown misalignment that does not satisfy vect_supportable_dr_alignment 1697 then see if loop versioning checks can be used to make all data 1698 references satisfy vect_supportable_dr_alignment. If so, update 1699 data structures as needed and return true. 1700 1701 C) If neither peeling nor versioning were successful then return false if 1702 any data reference does not satisfy vect_supportable_dr_alignment. 1703 1704 D) Return true (all data references satisfy vect_supportable_dr_alignment). 1705 1706 Note, Possibility 3 above (which is peeling and versioning together) is not 1707 being done at this time. */ 1708 1709 /* (1) Peeling to force alignment. */ 1710 1711 /* (1.1) Decide whether to perform peeling, and how many iterations to peel: 1712 Considerations: 1713 + How many accesses will become aligned due to the peeling 1714 - How many accesses will become unaligned due to the peeling, 1715 and the cost of misaligned accesses. 1716 - The cost of peeling (the extra runtime checks, the increase 1717 in code size). 
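
     As a worked example (numbers made up), an int access that is
     misaligned by 4 bytes against a 16-byte target alignment becomes
     aligned after peeling (16 - 4) / 4 = 3 scalar iterations; whether
     that is profitable depends on what it does to all the other
     accesses, which is what the code below estimates.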
*/ 1718 1719 FOR_EACH_VEC_ELT (datarefs, i, dr) 1720 { 1721 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr); 1722 stmt_vec_info stmt_info = dr_info->stmt; 1723 1724 if (!STMT_VINFO_RELEVANT_P (stmt_info)) 1725 continue; 1726 1727 /* For interleaving, only the alignment of the first access 1728 matters. */ 1729 if (STMT_VINFO_GROUPED_ACCESS (stmt_info) 1730 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info) 1731 continue; 1732 1733 /* For scatter-gather or invariant accesses there is nothing 1734 to enhance. */ 1735 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info) 1736 || integer_zerop (DR_STEP (dr))) 1737 continue; 1738 1739 /* Strided accesses perform only component accesses, alignment is 1740 irrelevant for them. */ 1741 if (STMT_VINFO_STRIDED_P (stmt_info) 1742 && !STMT_VINFO_GROUPED_ACCESS (stmt_info)) 1743 continue; 1744 1745 supportable_dr_alignment = vect_supportable_dr_alignment (dr_info, true); 1746 do_peeling = vector_alignment_reachable_p (dr_info); 1747 if (do_peeling) 1748 { 1749 if (known_alignment_for_access_p (dr_info)) 1750 { 1751 unsigned int npeel_tmp = 0; 1752 bool negative = tree_int_cst_compare (DR_STEP (dr), 1753 size_zero_node) < 0; 1754 1755 vectype = STMT_VINFO_VECTYPE (stmt_info); 1756 /* If known_alignment_for_access_p then we have set 1757 DR_MISALIGNMENT which is only done if we know it at compiler 1758 time, so it is safe to assume target alignment is constant. 1759 */ 1760 unsigned int target_align = 1761 DR_TARGET_ALIGNMENT (dr_info).to_constant (); 1762 unsigned int dr_size = vect_get_scalar_dr_size (dr_info); 1763 mis = (negative 1764 ? DR_MISALIGNMENT (dr_info) 1765 : -DR_MISALIGNMENT (dr_info)); 1766 if (DR_MISALIGNMENT (dr_info) != 0) 1767 npeel_tmp = (mis & (target_align - 1)) / dr_size; 1768 1769 /* For multiple types, it is possible that the bigger type access 1770 will have more than one peeling option. E.g., a loop with two 1771 types: one of size (vector size / 4), and the other one of 1772 size (vector size / 8). Vectorization factor will 8. If both 1773 accesses are misaligned by 3, the first one needs one scalar 1774 iteration to be aligned, and the second one needs 5. But the 1775 first one will be aligned also by peeling 5 scalar 1776 iterations, and in that case both accesses will be aligned. 1777 Hence, except for the immediate peeling amount, we also want 1778 to try to add full vector size, while we don't exceed 1779 vectorization factor. 1780 We do this automatically for cost model, since we calculate 1781 cost for every peeling option. */ 1782 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo))) 1783 { 1784 poly_uint64 nscalars = (STMT_SLP_TYPE (stmt_info) 1785 ? vf * DR_GROUP_SIZE (stmt_info) : vf); 1786 possible_npeel_number 1787 = vect_get_num_vectors (nscalars, vectype); 1788 1789 /* NPEEL_TMP is 0 when there is no misalignment, but also 1790 allow peeling NELEMENTS. */ 1791 if (DR_MISALIGNMENT (dr_info) == 0) 1792 possible_npeel_number++; 1793 } 1794 1795 /* Save info about DR in the hash table. Also include peeling 1796 amounts according to the explanation above. */ 1797 for (j = 0; j < possible_npeel_number; j++) 1798 { 1799 vect_peeling_hash_insert (&peeling_htab, loop_vinfo, 1800 dr_info, npeel_tmp); 1801 npeel_tmp += target_align / dr_size; 1802 } 1803 1804 one_misalignment_known = true; 1805 } 1806 else 1807 { 1808 /* If we don't know any misalignment values, we prefer 1809 peeling for data-ref that has the maximum number of data-refs 1810 with the same alignment, unless the target prefers to align 1811 stores over load. 
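         For example (purely illustrative): if three loads are known to
         share the same base, offset and step while a lone store does not,
         peeling for one of those loads aligns all three at once, so that
         load is preferred as DR0 unless the store/load cost comparison
         below overrides the choice.  Ties are broken in favour of the
         reference whose misalignment computation is invariant in the
         outermost possible loop.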
*/ 1812 unsigned same_align_drs 1813 = STMT_VINFO_SAME_ALIGN_REFS (stmt_info).length (); 1814 if (!dr0_info 1815 || same_align_drs_max < same_align_drs) 1816 { 1817 same_align_drs_max = same_align_drs; 1818 dr0_info = dr_info; 1819 } 1820 /* For data-refs with the same number of related 1821 accesses prefer the one where the misalign 1822 computation will be invariant in the outermost loop. */ 1823 else if (same_align_drs_max == same_align_drs) 1824 { 1825 struct loop *ivloop0, *ivloop; 1826 ivloop0 = outermost_invariant_loop_for_expr 1827 (loop, DR_BASE_ADDRESS (dr0_info->dr)); 1828 ivloop = outermost_invariant_loop_for_expr 1829 (loop, DR_BASE_ADDRESS (dr)); 1830 if ((ivloop && !ivloop0) 1831 || (ivloop && ivloop0 1832 && flow_loop_nested_p (ivloop, ivloop0))) 1833 dr0_info = dr_info; 1834 } 1835 1836 one_misalignment_unknown = true; 1837 1838 /* Check for data refs with unsupportable alignment that 1839 can be peeled. */ 1840 if (!supportable_dr_alignment) 1841 { 1842 one_dr_unsupportable = true; 1843 unsupportable_dr_info = dr_info; 1844 } 1845 1846 if (!first_store && DR_IS_WRITE (dr)) 1847 first_store = dr_info; 1848 } 1849 } 1850 else 1851 { 1852 if (!aligned_access_p (dr_info)) 1853 { 1854 if (dump_enabled_p ()) 1855 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 1856 "vector alignment may not be reachable\n"); 1857 break; 1858 } 1859 } 1860 } 1861 1862 /* Check if we can possibly peel the loop. */ 1863 if (!vect_can_advance_ivs_p (loop_vinfo) 1864 || !slpeel_can_duplicate_loop_p (loop, single_exit (loop)) 1865 || loop->inner) 1866 do_peeling = false; 1867 1868 struct _vect_peel_extended_info peel_for_known_alignment; 1869 struct _vect_peel_extended_info peel_for_unknown_alignment; 1870 struct _vect_peel_extended_info best_peel; 1871 1872 peel_for_unknown_alignment.inside_cost = INT_MAX; 1873 peel_for_unknown_alignment.outside_cost = INT_MAX; 1874 peel_for_unknown_alignment.peel_info.count = 0; 1875 1876 if (do_peeling 1877 && one_misalignment_unknown) 1878 { 1879 /* Check if the target requires to prefer stores over loads, i.e., if 1880 misaligned stores are more expensive than misaligned loads (taking 1881 drs with same alignment into account). 
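         As an illustrative sketch of the comparison that follows
         (hypothetical costs): if peeling for the first store yields
         inside/outside costs of 10/2 while peeling for the current DR0
         (a load) yields 12/2, peeling for the store is cheaper, so DR0 is
         switched to the first store; the outside costs only break ties in
         the inside costs.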
*/ 1882 unsigned int load_inside_cost = 0; 1883 unsigned int load_outside_cost = 0; 1884 unsigned int store_inside_cost = 0; 1885 unsigned int store_outside_cost = 0; 1886 unsigned int estimated_npeels = vect_vf_for_cost (loop_vinfo) / 2; 1887 1888 stmt_vector_for_cost dummy; 1889 dummy.create (2); 1890 vect_get_peeling_costs_all_drs (loop_vinfo, dr0_info, 1891 &load_inside_cost, 1892 &load_outside_cost, 1893 &dummy, &dummy, estimated_npeels, true); 1894 dummy.release (); 1895 1896 if (first_store) 1897 { 1898 dummy.create (2); 1899 vect_get_peeling_costs_all_drs (loop_vinfo, first_store, 1900 &store_inside_cost, 1901 &store_outside_cost, 1902 &dummy, &dummy, 1903 estimated_npeels, true); 1904 dummy.release (); 1905 } 1906 else 1907 { 1908 store_inside_cost = INT_MAX; 1909 store_outside_cost = INT_MAX; 1910 } 1911 1912 if (load_inside_cost > store_inside_cost 1913 || (load_inside_cost == store_inside_cost 1914 && load_outside_cost > store_outside_cost)) 1915 { 1916 dr0_info = first_store; 1917 peel_for_unknown_alignment.inside_cost = store_inside_cost; 1918 peel_for_unknown_alignment.outside_cost = store_outside_cost; 1919 } 1920 else 1921 { 1922 peel_for_unknown_alignment.inside_cost = load_inside_cost; 1923 peel_for_unknown_alignment.outside_cost = load_outside_cost; 1924 } 1925 1926 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec; 1927 prologue_cost_vec.create (2); 1928 epilogue_cost_vec.create (2); 1929 1930 int dummy2; 1931 peel_for_unknown_alignment.outside_cost += vect_get_known_peeling_cost 1932 (loop_vinfo, estimated_npeels, &dummy2, 1933 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), 1934 &prologue_cost_vec, &epilogue_cost_vec); 1935 1936 prologue_cost_vec.release (); 1937 epilogue_cost_vec.release (); 1938 1939 peel_for_unknown_alignment.peel_info.count = 1 1940 + STMT_VINFO_SAME_ALIGN_REFS (dr0_info->stmt).length (); 1941 } 1942 1943 peel_for_unknown_alignment.peel_info.npeel = 0; 1944 peel_for_unknown_alignment.peel_info.dr_info = dr0_info; 1945 1946 best_peel = peel_for_unknown_alignment; 1947 1948 peel_for_known_alignment.inside_cost = INT_MAX; 1949 peel_for_known_alignment.outside_cost = INT_MAX; 1950 peel_for_known_alignment.peel_info.count = 0; 1951 peel_for_known_alignment.peel_info.dr_info = NULL; 1952 1953 if (do_peeling && one_misalignment_known) 1954 { 1955 /* Peeling is possible, but there is no data access that is not supported 1956 unless aligned. So we try to choose the best possible peeling from 1957 the hash table. */ 1958 peel_for_known_alignment = vect_peeling_hash_choose_best_peeling 1959 (&peeling_htab, loop_vinfo); 1960 } 1961 1962 /* Compare costs of peeling for known and unknown alignment. */ 1963 if (peel_for_known_alignment.peel_info.dr_info != NULL 1964 && peel_for_unknown_alignment.inside_cost 1965 >= peel_for_known_alignment.inside_cost) 1966 { 1967 best_peel = peel_for_known_alignment; 1968 1969 /* If the best peeling for known alignment has NPEEL == 0, perform no 1970 peeling at all except if there is an unsupportable dr that we can 1971 align. */ 1972 if (best_peel.peel_info.npeel == 0 && !one_dr_unsupportable) 1973 do_peeling = false; 1974 } 1975 1976 /* If there is an unsupportable data ref, prefer this over all choices so far 1977 since we'd have to discard a chosen peeling except when it accidentally 1978 aligned the unsupportable data ref. */ 1979 if (one_dr_unsupportable) 1980 dr0_info = unsupportable_dr_info; 1981 else if (do_peeling) 1982 { 1983 /* Calculate the penalty for no peeling, i.e. leaving everything as-is. 
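         That is, compute the inside/outside costs with npeel == 0 and all
         misalignments left as they are; the peeling chosen above is kept
         only if its inside cost beats this no-peeling baseline.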
1984 TODO: Use nopeel_outside_cost or get rid of it? */ 1985 unsigned nopeel_inside_cost = 0; 1986 unsigned nopeel_outside_cost = 0; 1987 1988 stmt_vector_for_cost dummy; 1989 dummy.create (2); 1990 vect_get_peeling_costs_all_drs (loop_vinfo, NULL, &nopeel_inside_cost, 1991 &nopeel_outside_cost, &dummy, &dummy, 1992 0, false); 1993 dummy.release (); 1994 1995 /* Add epilogue costs. As we do not peel for alignment here, no prologue 1996 costs will be recorded. */ 1997 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec; 1998 prologue_cost_vec.create (2); 1999 epilogue_cost_vec.create (2); 2000 2001 int dummy2; 2002 nopeel_outside_cost += vect_get_known_peeling_cost 2003 (loop_vinfo, 0, &dummy2, 2004 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), 2005 &prologue_cost_vec, &epilogue_cost_vec); 2006 2007 prologue_cost_vec.release (); 2008 epilogue_cost_vec.release (); 2009 2010 npeel = best_peel.peel_info.npeel; 2011 dr0_info = best_peel.peel_info.dr_info; 2012 2013 /* If no peeling is not more expensive than the best peeling we 2014 have so far, don't perform any peeling. */ 2015 if (nopeel_inside_cost <= best_peel.inside_cost) 2016 do_peeling = false; 2017 } 2018 2019 if (do_peeling) 2020 { 2021 stmt_vec_info stmt_info = dr0_info->stmt; 2022 vectype = STMT_VINFO_VECTYPE (stmt_info); 2023 2024 if (known_alignment_for_access_p (dr0_info)) 2025 { 2026 bool negative = tree_int_cst_compare (DR_STEP (dr0_info->dr), 2027 size_zero_node) < 0; 2028 if (!npeel) 2029 { 2030 /* Since it's known at compile time, compute the number of 2031 iterations in the peeled loop (the peeling factor) for use in 2032 updating DR_MISALIGNMENT values. The peeling factor is the 2033 vectorization factor minus the misalignment as an element 2034 count. */ 2035 mis = (negative 2036 ? DR_MISALIGNMENT (dr0_info) 2037 : -DR_MISALIGNMENT (dr0_info)); 2038 /* If known_alignment_for_access_p then we have set 2039 DR_MISALIGNMENT which is only done if we know it at compiler 2040 time, so it is safe to assume target alignment is constant. 2041 */ 2042 unsigned int target_align = 2043 DR_TARGET_ALIGNMENT (dr0_info).to_constant (); 2044 npeel = ((mis & (target_align - 1)) 2045 / vect_get_scalar_dr_size (dr0_info)); 2046 } 2047 2048 /* For interleaved data access every iteration accesses all the 2049 members of the group, therefore we divide the number of iterations 2050 by the group size. */ 2051 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)) 2052 npeel /= DR_GROUP_SIZE (stmt_info); 2053 2054 if (dump_enabled_p ()) 2055 dump_printf_loc (MSG_NOTE, vect_location, 2056 "Try peeling by %d\n", npeel); 2057 } 2058 2059 /* Ensure that all datarefs can be vectorized after the peel. */ 2060 if (!vect_peeling_supportable (loop_vinfo, dr0_info, npeel)) 2061 do_peeling = false; 2062 2063 /* Check if all datarefs are supportable and log. */ 2064 if (do_peeling && known_alignment_for_access_p (dr0_info) && npeel == 0) 2065 { 2066 opt_result stat = vect_verify_datarefs_alignment (loop_vinfo); 2067 if (!stat) 2068 do_peeling = false; 2069 else 2070 return stat; 2071 } 2072 2073 /* Cost model #1 - honor --param vect-max-peeling-for-alignment. 
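     For instance (hypothetical values): with a 16-byte target alignment
     and 4-byte elements, a peel count that is not known at compile time
     (npeel == 0) may require up to 16 / 4 - 1 == 3 peeled iterations;
     with --param vect-max-peeling-for-alignment=2 peeling is then
     abandoned.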
*/ 2074 if (do_peeling) 2075 { 2076 unsigned max_allowed_peel 2077 = PARAM_VALUE (PARAM_VECT_MAX_PEELING_FOR_ALIGNMENT); 2078 if (max_allowed_peel != (unsigned)-1) 2079 { 2080 unsigned max_peel = npeel; 2081 if (max_peel == 0) 2082 { 2083 poly_uint64 target_align = DR_TARGET_ALIGNMENT (dr0_info); 2084 unsigned HOST_WIDE_INT target_align_c; 2085 if (target_align.is_constant (&target_align_c)) 2086 max_peel = 2087 target_align_c / vect_get_scalar_dr_size (dr0_info) - 1; 2088 else 2089 { 2090 do_peeling = false; 2091 if (dump_enabled_p ()) 2092 dump_printf_loc (MSG_NOTE, vect_location, 2093 "Disable peeling, max peels set and vector" 2094 " alignment unknown\n"); 2095 } 2096 } 2097 if (max_peel > max_allowed_peel) 2098 { 2099 do_peeling = false; 2100 if (dump_enabled_p ()) 2101 dump_printf_loc (MSG_NOTE, vect_location, 2102 "Disable peeling, max peels reached: %d\n", max_peel); 2103 } 2104 } 2105 } 2106 2107 /* Cost model #2 - if peeling may result in a remaining loop not 2108 iterating enough to be vectorized then do not peel. Since this 2109 is a cost heuristic rather than a correctness decision, use the 2110 most likely runtime value for variable vectorization factors. */ 2111 if (do_peeling 2112 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)) 2113 { 2114 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo); 2115 unsigned int max_peel = npeel == 0 ? assumed_vf - 1 : npeel; 2116 if ((unsigned HOST_WIDE_INT) LOOP_VINFO_INT_NITERS (loop_vinfo) 2117 < assumed_vf + max_peel) 2118 do_peeling = false; 2119 } 2120 2121 if (do_peeling) 2122 { 2123 /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i. 2124 If the misalignment of DR_i is identical to that of dr0 then set 2125 DR_MISALIGNMENT (DR_i) to zero. If the misalignment of DR_i and 2126 dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i) 2127 by the peeling factor times the element size of DR_i (MOD the 2128 vectorization factor times the size). Otherwise, the 2129 misalignment of DR_i must be set to unknown. */ 2130 FOR_EACH_VEC_ELT (datarefs, i, dr) 2131 if (dr != dr0_info->dr) 2132 { 2133 /* Strided accesses perform only component accesses, alignment 2134 is irrelevant for them. */ 2135 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr); 2136 stmt_info = dr_info->stmt; 2137 if (STMT_VINFO_STRIDED_P (stmt_info) 2138 && !STMT_VINFO_GROUPED_ACCESS (stmt_info)) 2139 continue; 2140 2141 vect_update_misalignment_for_peel (dr_info, dr0_info, npeel); 2142 } 2143 2144 LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0_info; 2145 if (npeel) 2146 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel; 2147 else 2148 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) 2149 = DR_MISALIGNMENT (dr0_info); 2150 SET_DR_MISALIGNMENT (dr0_info, 0); 2151 if (dump_enabled_p ()) 2152 { 2153 dump_printf_loc (MSG_NOTE, vect_location, 2154 "Alignment of access forced using peeling.\n"); 2155 dump_printf_loc (MSG_NOTE, vect_location, 2156 "Peeling for alignment will be applied.\n"); 2157 } 2158 2159 /* The inside-loop cost will be accounted for in vectorizable_load 2160 and vectorizable_store correctly with adjusted alignments. 2161 Drop the body_cst_vec on the floor here. */ 2162 opt_result stat = vect_verify_datarefs_alignment (loop_vinfo); 2163 gcc_assert (stat); 2164 return stat; 2165 } 2166 } 2167 2168 /* (2) Versioning to force alignment. 
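   Conceptually (an illustrative sketch only, not the exact code the
   transform emits): each statement recorded in
   LOOP_VINFO_MAY_MISALIGN_STMTS contributes its address to a runtime
   guard of the form

     if ((((uintptr_t) p1 | (uintptr_t) p2) & LOOP_VINFO_PTR_MASK) == 0)
       ... vectorized loop, guarded refs treated as aligned ...
     else
       ... scalar fallback loop ...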
*/ 2169 2170 /* Try versioning if: 2171 1) optimize loop for speed 2172 2) there is at least one unsupported misaligned data ref with an unknown 2173 misalignment, and 2174 3) all misaligned data refs with a known misalignment are supported, and 2175 4) the number of runtime alignment checks is within reason. */ 2176 2177 do_versioning = 2178 optimize_loop_nest_for_speed_p (loop) 2179 && (!loop->inner); /* FORNOW */ 2180 2181 if (do_versioning) 2182 { 2183 FOR_EACH_VEC_ELT (datarefs, i, dr) 2184 { 2185 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr); 2186 stmt_vec_info stmt_info = dr_info->stmt; 2187 2188 /* For interleaving, only the alignment of the first access 2189 matters. */ 2190 if (aligned_access_p (dr_info) 2191 || (STMT_VINFO_GROUPED_ACCESS (stmt_info) 2192 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info)) 2193 continue; 2194 2195 if (STMT_VINFO_STRIDED_P (stmt_info)) 2196 { 2197 /* Strided loads perform only component accesses, alignment is 2198 irrelevant for them. */ 2199 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info)) 2200 continue; 2201 do_versioning = false; 2202 break; 2203 } 2204 2205 supportable_dr_alignment 2206 = vect_supportable_dr_alignment (dr_info, false); 2207 2208 if (!supportable_dr_alignment) 2209 { 2210 int mask; 2211 tree vectype; 2212 2213 if (known_alignment_for_access_p (dr_info) 2214 || LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length () 2215 >= (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS)) 2216 { 2217 do_versioning = false; 2218 break; 2219 } 2220 2221 vectype = STMT_VINFO_VECTYPE (stmt_info); 2222 gcc_assert (vectype); 2223 2224 /* At present we don't support versioning for alignment 2225 with variable VF, since there's no guarantee that the 2226 VF is a power of two. We could relax this if we added 2227 a way of enforcing a power-of-two size. */ 2228 unsigned HOST_WIDE_INT size; 2229 if (!GET_MODE_SIZE (TYPE_MODE (vectype)).is_constant (&size)) 2230 { 2231 do_versioning = false; 2232 break; 2233 } 2234 2235 /* Forcing alignment in the first iteration is no good if 2236 we don't keep it across iterations. For now, just disable 2237 versioning in this case. 2238 ?? We could actually unroll the loop to achieve the required 2239 overall step alignment, and forcing the alignment could be 2240 done by doing some iterations of the non-vectorized loop. */ 2241 if (!multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo) 2242 * DR_STEP_ALIGNMENT (dr), 2243 DR_TARGET_ALIGNMENT (dr_info))) 2244 { 2245 do_versioning = false; 2246 break; 2247 } 2248 2249 /* The rightmost bits of an aligned address must be zeros. 2250 Construct the mask needed for this test. For example, 2251 GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the 2252 mask must be 15 = 0xf. */ 2253 mask = size - 1; 2254 2255 /* FORNOW: use the same mask to test all potentially unaligned 2256 references in the loop. The vectorizer currently supports 2257 a single vector size, see the reference to 2258 GET_MODE_NUNITS (TYPE_MODE (vectype)) where the 2259 vectorization factor is computed. */ 2260 gcc_assert (!LOOP_VINFO_PTR_MASK (loop_vinfo) 2261 || LOOP_VINFO_PTR_MASK (loop_vinfo) == mask); 2262 LOOP_VINFO_PTR_MASK (loop_vinfo) = mask; 2263 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (stmt_info); 2264 } 2265 } 2266 2267 /* Versioning requires at least one misaligned data reference. 
*/ 2268 if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)) 2269 do_versioning = false; 2270 else if (!do_versioning) 2271 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0); 2272 } 2273 2274 if (do_versioning) 2275 { 2276 vec<stmt_vec_info> may_misalign_stmts 2277 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo); 2278 stmt_vec_info stmt_info; 2279 2280 /* It can now be assumed that the data references in the statements 2281 in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version 2282 of the loop being vectorized. */ 2283 FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt_info) 2284 { 2285 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info); 2286 SET_DR_MISALIGNMENT (dr_info, 0); 2287 if (dump_enabled_p ()) 2288 dump_printf_loc (MSG_NOTE, vect_location, 2289 "Alignment of access forced using versioning.\n"); 2290 } 2291 2292 if (dump_enabled_p ()) 2293 dump_printf_loc (MSG_NOTE, vect_location, 2294 "Versioning for alignment will be applied.\n"); 2295 2296 /* Peeling and versioning can't be done together at this time. */ 2297 gcc_assert (! (do_peeling && do_versioning)); 2298 2299 opt_result stat = vect_verify_datarefs_alignment (loop_vinfo); 2300 gcc_assert (stat); 2301 return stat; 2302 } 2303 2304 /* This point is reached if neither peeling nor versioning is being done. */ 2305 gcc_assert (! (do_peeling || do_versioning)); 2306 2307 opt_result stat = vect_verify_datarefs_alignment (loop_vinfo); 2308 return stat; 2309 } 2310 2311 2312 /* Function vect_find_same_alignment_drs. 2313 2314 Update group and alignment relations in VINFO according to the chosen 2315 vectorization factor. */ 2316 2317 static void 2318 vect_find_same_alignment_drs (vec_info *vinfo, data_dependence_relation *ddr) 2319 { 2320 struct data_reference *dra = DDR_A (ddr); 2321 struct data_reference *drb = DDR_B (ddr); 2322 dr_vec_info *dr_info_a = vinfo->lookup_dr (dra); 2323 dr_vec_info *dr_info_b = vinfo->lookup_dr (drb); 2324 stmt_vec_info stmtinfo_a = dr_info_a->stmt; 2325 stmt_vec_info stmtinfo_b = dr_info_b->stmt; 2326 2327 if (DDR_ARE_DEPENDENT (ddr) == chrec_known) 2328 return; 2329 2330 if (dra == drb) 2331 return; 2332 2333 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a) 2334 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b)) 2335 return; 2336 2337 if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0) 2338 || !operand_equal_p (DR_OFFSET (dra), DR_OFFSET (drb), 0) 2339 || !operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0)) 2340 return; 2341 2342 /* Two references with distance zero have the same alignment. */ 2343 poly_offset_int diff = (wi::to_poly_offset (DR_INIT (dra)) 2344 - wi::to_poly_offset (DR_INIT (drb))); 2345 if (maybe_ne (diff, 0)) 2346 { 2347 /* Get the wider of the two alignments. */ 2348 poly_uint64 align_a = 2349 exact_div (vect_calculate_target_alignment (dr_info_a), 2350 BITS_PER_UNIT); 2351 poly_uint64 align_b = 2352 exact_div (vect_calculate_target_alignment (dr_info_b), 2353 BITS_PER_UNIT); 2354 unsigned HOST_WIDE_INT align_a_c, align_b_c; 2355 if (!align_a.is_constant (&align_a_c) 2356 || !align_b.is_constant (&align_b_c)) 2357 return; 2358 2359 unsigned HOST_WIDE_INT max_align = MAX (align_a_c, align_b_c); 2360 2361 /* Require the gap to be a multiple of the larger vector alignment. 
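     For example (hypothetical): two 4-byte accesses a[i] and a[i + 4]
     differ by 16 bytes in DR_INIT; with a 16-byte vector alignment that
     gap is a multiple of the alignment, so both references always share
     the same misalignment and can be recorded as SAME_ALIGN_REFS.
     A gap of 8 bytes would not qualify.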
*/ 2362 if (!multiple_p (diff, max_align)) 2363 return; 2364 } 2365 2366 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_a).safe_push (drb); 2367 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_b).safe_push (dra); 2368 if (dump_enabled_p ()) 2369 dump_printf_loc (MSG_NOTE, vect_location, 2370 "accesses have the same alignment: %T and %T\n", 2371 DR_REF (dra), DR_REF (drb)); 2372 } 2373 2374 2375 /* Function vect_analyze_data_refs_alignment 2376 2377 Analyze the alignment of the data-references in the loop. 2378 Return FALSE if a data reference is found that cannot be vectorized. */ 2379 2380 opt_result 2381 vect_analyze_data_refs_alignment (loop_vec_info vinfo) 2382 { 2383 DUMP_VECT_SCOPE ("vect_analyze_data_refs_alignment"); 2384 2385 /* Mark groups of data references with same alignment using 2386 data dependence information. */ 2387 vec<ddr_p> ddrs = vinfo->shared->ddrs; 2388 struct data_dependence_relation *ddr; 2389 unsigned int i; 2390 2391 FOR_EACH_VEC_ELT (ddrs, i, ddr) 2392 vect_find_same_alignment_drs (vinfo, ddr); 2393 2394 vec<data_reference_p> datarefs = vinfo->shared->datarefs; 2395 struct data_reference *dr; 2396 2397 vect_record_base_alignments (vinfo); 2398 FOR_EACH_VEC_ELT (datarefs, i, dr) 2399 { 2400 dr_vec_info *dr_info = vinfo->lookup_dr (dr); 2401 if (STMT_VINFO_VECTORIZABLE (dr_info->stmt)) 2402 vect_compute_data_ref_alignment (dr_info); 2403 } 2404 2405 return opt_result::success (); 2406 } 2407 2408 2409 /* Analyze alignment of DRs of stmts in NODE. */ 2410 2411 static bool 2412 vect_slp_analyze_and_verify_node_alignment (slp_tree node) 2413 { 2414 /* We vectorize from the first scalar stmt in the node unless 2415 the node is permuted in which case we start from the first 2416 element in the group. */ 2417 stmt_vec_info first_stmt_info = SLP_TREE_SCALAR_STMTS (node)[0]; 2418 dr_vec_info *first_dr_info = STMT_VINFO_DR_INFO (first_stmt_info); 2419 if (SLP_TREE_LOAD_PERMUTATION (node).exists ()) 2420 first_stmt_info = DR_GROUP_FIRST_ELEMENT (first_stmt_info); 2421 2422 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (first_stmt_info); 2423 vect_compute_data_ref_alignment (dr_info); 2424 /* For creating the data-ref pointer we need alignment of the 2425 first element anyway. */ 2426 if (dr_info != first_dr_info) 2427 vect_compute_data_ref_alignment (first_dr_info); 2428 if (! verify_data_ref_alignment (dr_info)) 2429 { 2430 if (dump_enabled_p ()) 2431 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 2432 "not vectorized: bad data alignment in basic " 2433 "block.\n"); 2434 return false; 2435 } 2436 2437 return true; 2438 } 2439 2440 /* Function vect_slp_analyze_instance_alignment 2441 2442 Analyze the alignment of the data-references in the SLP instance. 2443 Return FALSE if a data reference is found that cannot be vectorized. */ 2444 2445 bool 2446 vect_slp_analyze_and_verify_instance_alignment (slp_instance instance) 2447 { 2448 DUMP_VECT_SCOPE ("vect_slp_analyze_and_verify_instance_alignment"); 2449 2450 slp_tree node; 2451 unsigned i; 2452 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, node) 2453 if (! vect_slp_analyze_and_verify_node_alignment (node)) 2454 return false; 2455 2456 node = SLP_INSTANCE_TREE (instance); 2457 if (STMT_VINFO_DATA_REF (SLP_TREE_SCALAR_STMTS (node)[0]) 2458 && ! vect_slp_analyze_and_verify_node_alignment 2459 (SLP_INSTANCE_TREE (instance))) 2460 return false; 2461 2462 return true; 2463 } 2464 2465 2466 /* Analyze groups of accesses: check that DR_INFO belongs to a group of 2467 accesses of legal size, step, etc. 
Detect gaps, single element 2468 interleaving, and other special cases. Set grouped access info. 2469 Collect groups of strided stores for further use in SLP analysis. 2470 Worker for vect_analyze_group_access. */ 2471 2472 static bool 2473 vect_analyze_group_access_1 (dr_vec_info *dr_info) 2474 { 2475 data_reference *dr = dr_info->dr; 2476 tree step = DR_STEP (dr); 2477 tree scalar_type = TREE_TYPE (DR_REF (dr)); 2478 HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type)); 2479 stmt_vec_info stmt_info = dr_info->stmt; 2480 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); 2481 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info); 2482 HOST_WIDE_INT dr_step = -1; 2483 HOST_WIDE_INT groupsize, last_accessed_element = 1; 2484 bool slp_impossible = false; 2485 2486 /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the 2487 size of the interleaving group (including gaps). */ 2488 if (tree_fits_shwi_p (step)) 2489 { 2490 dr_step = tree_to_shwi (step); 2491 /* Check that STEP is a multiple of type size. Otherwise there is 2492 a non-element-sized gap at the end of the group which we 2493 cannot represent in DR_GROUP_GAP or DR_GROUP_SIZE. 2494 ??? As we can handle non-constant step fine here we should 2495 simply remove uses of DR_GROUP_GAP between the last and first 2496 element and instead rely on DR_STEP. DR_GROUP_SIZE then would 2497 simply not include that gap. */ 2498 if ((dr_step % type_size) != 0) 2499 { 2500 if (dump_enabled_p ()) 2501 dump_printf_loc (MSG_NOTE, vect_location, 2502 "Step %T is not a multiple of the element size" 2503 " for %T\n", 2504 step, DR_REF (dr)); 2505 return false; 2506 } 2507 groupsize = absu_hwi (dr_step) / type_size; 2508 } 2509 else 2510 groupsize = 0; 2511 2512 /* Not consecutive access is possible only if it is a part of interleaving. */ 2513 if (!DR_GROUP_FIRST_ELEMENT (stmt_info)) 2514 { 2515 /* Check if it this DR is a part of interleaving, and is a single 2516 element of the group that is accessed in the loop. */ 2517 2518 /* Gaps are supported only for loads. STEP must be a multiple of the type 2519 size. */ 2520 if (DR_IS_READ (dr) 2521 && (dr_step % type_size) == 0 2522 && groupsize > 0) 2523 { 2524 DR_GROUP_FIRST_ELEMENT (stmt_info) = stmt_info; 2525 DR_GROUP_SIZE (stmt_info) = groupsize; 2526 DR_GROUP_GAP (stmt_info) = groupsize - 1; 2527 if (dump_enabled_p ()) 2528 dump_printf_loc (MSG_NOTE, vect_location, 2529 "Detected single element interleaving %T" 2530 " step %T\n", 2531 DR_REF (dr), step); 2532 2533 return true; 2534 } 2535 2536 if (dump_enabled_p ()) 2537 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 2538 "not consecutive access %G", stmt_info->stmt); 2539 2540 if (bb_vinfo) 2541 { 2542 /* Mark the statement as unvectorizable. */ 2543 STMT_VINFO_VECTORIZABLE (stmt_info) = false; 2544 return true; 2545 } 2546 2547 if (dump_enabled_p ()) 2548 dump_printf_loc (MSG_NOTE, vect_location, "using strided accesses\n"); 2549 STMT_VINFO_STRIDED_P (stmt_info) = true; 2550 return true; 2551 } 2552 2553 if (DR_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info) 2554 { 2555 /* First stmt in the interleaving chain. Check the chain. */ 2556 stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (stmt_info); 2557 struct data_reference *data_ref = dr; 2558 unsigned int count = 1; 2559 tree prev_init = DR_INIT (data_ref); 2560 HOST_WIDE_INT diff, gaps = 0; 2561 2562 /* By construction, all group members have INTEGER_CST DR_INITs. */ 2563 while (next) 2564 { 2565 /* We never have the same DR multiple times. 
*/ 2566 gcc_assert (tree_int_cst_compare (DR_INIT (data_ref), 2567 DR_INIT (STMT_VINFO_DATA_REF (next))) != 0); 2568 2569 data_ref = STMT_VINFO_DATA_REF (next); 2570 2571 /* All group members have the same STEP by construction. */ 2572 gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0)); 2573 2574 /* Check that the distance between two accesses is equal to the type 2575 size. Otherwise, we have gaps. */ 2576 diff = (TREE_INT_CST_LOW (DR_INIT (data_ref)) 2577 - TREE_INT_CST_LOW (prev_init)) / type_size; 2578 if (diff != 1) 2579 { 2580 /* FORNOW: SLP of accesses with gaps is not supported. */ 2581 slp_impossible = true; 2582 if (DR_IS_WRITE (data_ref)) 2583 { 2584 if (dump_enabled_p ()) 2585 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 2586 "interleaved store with gaps\n"); 2587 return false; 2588 } 2589 2590 gaps += diff - 1; 2591 } 2592 2593 last_accessed_element += diff; 2594 2595 /* Store the gap from the previous member of the group. If there is no 2596 gap in the access, DR_GROUP_GAP is always 1. */ 2597 DR_GROUP_GAP (next) = diff; 2598 2599 prev_init = DR_INIT (data_ref); 2600 next = DR_GROUP_NEXT_ELEMENT (next); 2601 /* Count the number of data-refs in the chain. */ 2602 count++; 2603 } 2604 2605 if (groupsize == 0) 2606 groupsize = count + gaps; 2607 2608 /* This could be UINT_MAX but as we are generating code in a very 2609 inefficient way we have to cap earlier. See PR78699 for example. */ 2610 if (groupsize > 4096) 2611 { 2612 if (dump_enabled_p ()) 2613 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 2614 "group is too large\n"); 2615 return false; 2616 } 2617 2618 /* Check that the size of the interleaving is equal to count for stores, 2619 i.e., that there are no gaps. */ 2620 if (groupsize != count 2621 && !DR_IS_READ (dr)) 2622 { 2623 groupsize = count; 2624 STMT_VINFO_STRIDED_P (stmt_info) = true; 2625 } 2626 2627 /* If there is a gap after the last load in the group it is the 2628 difference between the groupsize and the last accessed 2629 element. 2630 When there is no gap, this difference should be 0. */ 2631 DR_GROUP_GAP (stmt_info) = groupsize - last_accessed_element; 2632 2633 DR_GROUP_SIZE (stmt_info) = groupsize; 2634 if (dump_enabled_p ()) 2635 { 2636 dump_printf_loc (MSG_NOTE, vect_location, 2637 "Detected interleaving "); 2638 if (DR_IS_READ (dr)) 2639 dump_printf (MSG_NOTE, "load "); 2640 else if (STMT_VINFO_STRIDED_P (stmt_info)) 2641 dump_printf (MSG_NOTE, "strided store "); 2642 else 2643 dump_printf (MSG_NOTE, "store "); 2644 dump_printf (MSG_NOTE, "of size %u\n", 2645 (unsigned)groupsize); 2646 dump_printf_loc (MSG_NOTE, vect_location, "\t%G", stmt_info->stmt); 2647 next = DR_GROUP_NEXT_ELEMENT (stmt_info); 2648 while (next) 2649 { 2650 if (DR_GROUP_GAP (next) != 1) 2651 dump_printf_loc (MSG_NOTE, vect_location, 2652 "\t<gap of %d elements>\n", 2653 DR_GROUP_GAP (next) - 1); 2654 dump_printf_loc (MSG_NOTE, vect_location, "\t%G", next->stmt); 2655 next = DR_GROUP_NEXT_ELEMENT (next); 2656 } 2657 if (DR_GROUP_GAP (stmt_info) != 0) 2658 dump_printf_loc (MSG_NOTE, vect_location, 2659 "\t<gap of %d elements>\n", 2660 DR_GROUP_GAP (stmt_info)); 2661 } 2662 2663 /* SLP: create an SLP data structure for every interleaving group of 2664 stores for further analysis in vect_analyse_slp. 
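     E.g. (illustrative): the pair of stores a[2*i] = x; a[2*i+1] = y;
     with 4-byte elements and step 8 forms a store group of size 2 with
     no gaps, and its first statement is what is pushed onto the
     GROUPED_STORES list here.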
*/ 2665 if (DR_IS_WRITE (dr) && !slp_impossible) 2666 { 2667 if (loop_vinfo) 2668 LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt_info); 2669 if (bb_vinfo) 2670 BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt_info); 2671 } 2672 } 2673 2674 return true; 2675 } 2676 2677 /* Analyze groups of accesses: check that DR_INFO belongs to a group of 2678 accesses of legal size, step, etc. Detect gaps, single element 2679 interleaving, and other special cases. Set grouped access info. 2680 Collect groups of strided stores for further use in SLP analysis. */ 2681 2682 static bool 2683 vect_analyze_group_access (dr_vec_info *dr_info) 2684 { 2685 if (!vect_analyze_group_access_1 (dr_info)) 2686 { 2687 /* Dissolve the group if present. */ 2688 stmt_vec_info stmt_info = DR_GROUP_FIRST_ELEMENT (dr_info->stmt); 2689 while (stmt_info) 2690 { 2691 stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (stmt_info); 2692 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL; 2693 DR_GROUP_NEXT_ELEMENT (stmt_info) = NULL; 2694 stmt_info = next; 2695 } 2696 return false; 2697 } 2698 return true; 2699 } 2700 2701 /* Analyze the access pattern of the data-reference DR_INFO. 2702 In case of non-consecutive accesses call vect_analyze_group_access() to 2703 analyze groups of accesses. */ 2704 2705 static bool 2706 vect_analyze_data_ref_access (dr_vec_info *dr_info) 2707 { 2708 data_reference *dr = dr_info->dr; 2709 tree step = DR_STEP (dr); 2710 tree scalar_type = TREE_TYPE (DR_REF (dr)); 2711 stmt_vec_info stmt_info = dr_info->stmt; 2712 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); 2713 struct loop *loop = NULL; 2714 2715 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info)) 2716 return true; 2717 2718 if (loop_vinfo) 2719 loop = LOOP_VINFO_LOOP (loop_vinfo); 2720 2721 if (loop_vinfo && !step) 2722 { 2723 if (dump_enabled_p ()) 2724 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 2725 "bad data-ref access in loop\n"); 2726 return false; 2727 } 2728 2729 /* Allow loads with zero step in inner-loop vectorization. */ 2730 if (loop_vinfo && integer_zerop (step)) 2731 { 2732 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL; 2733 if (!nested_in_vect_loop_p (loop, stmt_info)) 2734 return DR_IS_READ (dr); 2735 /* Allow references with zero step for outer loops marked 2736 with pragma omp simd only - it guarantees absence of 2737 loop-carried dependencies between inner loop iterations. */ 2738 if (loop->safelen < 2) 2739 { 2740 if (dump_enabled_p ()) 2741 dump_printf_loc (MSG_NOTE, vect_location, 2742 "zero step in inner loop of nest\n"); 2743 return false; 2744 } 2745 } 2746 2747 if (loop && nested_in_vect_loop_p (loop, stmt_info)) 2748 { 2749 /* Interleaved accesses are not yet supported within outer-loop 2750 vectorization for references in the inner-loop. */ 2751 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL; 2752 2753 /* For the rest of the analysis we use the outer-loop step. */ 2754 step = STMT_VINFO_DR_STEP (stmt_info); 2755 if (integer_zerop (step)) 2756 { 2757 if (dump_enabled_p ()) 2758 dump_printf_loc (MSG_NOTE, vect_location, 2759 "zero step in outer loop.\n"); 2760 return DR_IS_READ (dr); 2761 } 2762 } 2763 2764 /* Consecutive? */ 2765 if (TREE_CODE (step) == INTEGER_CST) 2766 { 2767 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step); 2768 if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type)) 2769 || (dr_step < 0 2770 && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step))) 2771 { 2772 /* Mark that it is not interleaving. 
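         E.g. a 4-byte access whose DR_STEP is 4 (or -4 when the loop
         walks the array backwards) is simply consecutive, so no group is
         formed and it is handled as an ordinary contiguous access.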
*/ 2773 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL; 2774 return true; 2775 } 2776 } 2777 2778 if (loop && nested_in_vect_loop_p (loop, stmt_info)) 2779 { 2780 if (dump_enabled_p ()) 2781 dump_printf_loc (MSG_NOTE, vect_location, 2782 "grouped access in outer loop.\n"); 2783 return false; 2784 } 2785 2786 2787 /* Assume this is a DR handled by non-constant strided load case. */ 2788 if (TREE_CODE (step) != INTEGER_CST) 2789 return (STMT_VINFO_STRIDED_P (stmt_info) 2790 && (!STMT_VINFO_GROUPED_ACCESS (stmt_info) 2791 || vect_analyze_group_access (dr_info))); 2792 2793 /* Not consecutive access - check if it's a part of interleaving group. */ 2794 return vect_analyze_group_access (dr_info); 2795 } 2796 2797 /* Compare two data-references DRA and DRB to group them into chunks 2798 suitable for grouping. */ 2799 2800 static int 2801 dr_group_sort_cmp (const void *dra_, const void *drb_) 2802 { 2803 data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_); 2804 data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_); 2805 int cmp; 2806 2807 /* Stabilize sort. */ 2808 if (dra == drb) 2809 return 0; 2810 2811 /* DRs in different loops never belong to the same group. */ 2812 loop_p loopa = gimple_bb (DR_STMT (dra))->loop_father; 2813 loop_p loopb = gimple_bb (DR_STMT (drb))->loop_father; 2814 if (loopa != loopb) 2815 return loopa->num < loopb->num ? -1 : 1; 2816 2817 /* Ordering of DRs according to base. */ 2818 cmp = data_ref_compare_tree (DR_BASE_ADDRESS (dra), 2819 DR_BASE_ADDRESS (drb)); 2820 if (cmp != 0) 2821 return cmp; 2822 2823 /* And according to DR_OFFSET. */ 2824 cmp = data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb)); 2825 if (cmp != 0) 2826 return cmp; 2827 2828 /* Put reads before writes. */ 2829 if (DR_IS_READ (dra) != DR_IS_READ (drb)) 2830 return DR_IS_READ (dra) ? -1 : 1; 2831 2832 /* Then sort after access size. */ 2833 cmp = data_ref_compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))), 2834 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)))); 2835 if (cmp != 0) 2836 return cmp; 2837 2838 /* And after step. */ 2839 cmp = data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb)); 2840 if (cmp != 0) 2841 return cmp; 2842 2843 /* Then sort after DR_INIT. In case of identical DRs sort after stmt UID. */ 2844 cmp = data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb)); 2845 if (cmp == 0) 2846 return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1; 2847 return cmp; 2848 } 2849 2850 /* If OP is the result of a conversion, return the unconverted value, 2851 otherwise return null. */ 2852 2853 static tree 2854 strip_conversion (tree op) 2855 { 2856 if (TREE_CODE (op) != SSA_NAME) 2857 return NULL_TREE; 2858 gimple *stmt = SSA_NAME_DEF_STMT (op); 2859 if (!is_gimple_assign (stmt) 2860 || !CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (stmt))) 2861 return NULL_TREE; 2862 return gimple_assign_rhs1 (stmt); 2863 } 2864 2865 /* Return true if vectorizable_* routines can handle statements STMT1_INFO 2866 and STMT2_INFO being in a single group. */ 2867 2868 static bool 2869 can_group_stmts_p (stmt_vec_info stmt1_info, stmt_vec_info stmt2_info) 2870 { 2871 if (gimple_assign_single_p (stmt1_info->stmt)) 2872 return gimple_assign_single_p (stmt2_info->stmt); 2873 2874 gcall *call1 = dyn_cast <gcall *> (stmt1_info->stmt); 2875 if (call1 && gimple_call_internal_p (call1)) 2876 { 2877 /* Check for two masked loads or two masked stores. 
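         For instance (illustrative GIMPLE, names made up): the calls
           _1 = .MASK_LOAD (ptr1_10, 32B, mask_5);
           _2 = .MASK_LOAD (ptr2_11, 32B, mask_5);
         may be grouped because they are the same internal function with
         equal masks; the comparison below also looks through a mask
         conversion such as the ones created by build_mask_conversion.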
*/ 2878 gcall *call2 = dyn_cast <gcall *> (stmt2_info->stmt); 2879 if (!call2 || !gimple_call_internal_p (call2)) 2880 return false; 2881 internal_fn ifn = gimple_call_internal_fn (call1); 2882 if (ifn != IFN_MASK_LOAD && ifn != IFN_MASK_STORE) 2883 return false; 2884 if (ifn != gimple_call_internal_fn (call2)) 2885 return false; 2886 2887 /* Check that the masks are the same. Cope with casts of masks, 2888 like those created by build_mask_conversion. */ 2889 tree mask1 = gimple_call_arg (call1, 2); 2890 tree mask2 = gimple_call_arg (call2, 2); 2891 if (!operand_equal_p (mask1, mask2, 0)) 2892 { 2893 mask1 = strip_conversion (mask1); 2894 if (!mask1) 2895 return false; 2896 mask2 = strip_conversion (mask2); 2897 if (!mask2) 2898 return false; 2899 if (!operand_equal_p (mask1, mask2, 0)) 2900 return false; 2901 } 2902 return true; 2903 } 2904 2905 return false; 2906 } 2907 2908 /* Function vect_analyze_data_ref_accesses. 2909 2910 Analyze the access pattern of all the data references in the loop. 2911 2912 FORNOW: the only access pattern that is considered vectorizable is a 2913 simple step 1 (consecutive) access. 2914 2915 FORNOW: handle only arrays and pointer accesses. */ 2916 2917 opt_result 2918 vect_analyze_data_ref_accesses (vec_info *vinfo) 2919 { 2920 unsigned int i; 2921 vec<data_reference_p> datarefs = vinfo->shared->datarefs; 2922 struct data_reference *dr; 2923 2924 DUMP_VECT_SCOPE ("vect_analyze_data_ref_accesses"); 2925 2926 if (datarefs.is_empty ()) 2927 return opt_result::success (); 2928 2929 /* Sort the array of datarefs to make building the interleaving chains 2930 linear. Don't modify the original vector's order, it is needed for 2931 determining what dependencies are reversed. */ 2932 vec<data_reference_p> datarefs_copy = datarefs.copy (); 2933 datarefs_copy.qsort (dr_group_sort_cmp); 2934 hash_set<stmt_vec_info> to_fixup; 2935 2936 /* Build the interleaving chains. */ 2937 for (i = 0; i < datarefs_copy.length () - 1;) 2938 { 2939 data_reference_p dra = datarefs_copy[i]; 2940 dr_vec_info *dr_info_a = vinfo->lookup_dr (dra); 2941 stmt_vec_info stmtinfo_a = dr_info_a->stmt; 2942 stmt_vec_info lastinfo = NULL; 2943 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a) 2944 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)) 2945 { 2946 ++i; 2947 continue; 2948 } 2949 for (i = i + 1; i < datarefs_copy.length (); ++i) 2950 { 2951 data_reference_p drb = datarefs_copy[i]; 2952 dr_vec_info *dr_info_b = vinfo->lookup_dr (drb); 2953 stmt_vec_info stmtinfo_b = dr_info_b->stmt; 2954 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_b) 2955 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b)) 2956 break; 2957 2958 /* ??? Imperfect sorting (non-compatible types, non-modulo 2959 accesses, same accesses) can lead to a group to be artificially 2960 split here as we don't just skip over those. If it really 2961 matters we can push those to a worklist and re-iterate 2962 over them. The we can just skip ahead to the next DR here. */ 2963 2964 /* DRs in a different loop should not be put into the same 2965 interleaving group. */ 2966 if (gimple_bb (DR_STMT (dra))->loop_father 2967 != gimple_bb (DR_STMT (drb))->loop_father) 2968 break; 2969 2970 /* Check that the data-refs have same first location (except init) 2971 and they are both either store or load (not load and store, 2972 not masked loads or stores). 
*/ 2973 if (DR_IS_READ (dra) != DR_IS_READ (drb) 2974 || data_ref_compare_tree (DR_BASE_ADDRESS (dra), 2975 DR_BASE_ADDRESS (drb)) != 0 2976 || data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb)) != 0 2977 || !can_group_stmts_p (stmtinfo_a, stmtinfo_b)) 2978 break; 2979 2980 /* Check that the data-refs have the same constant size. */ 2981 tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))); 2982 tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))); 2983 if (!tree_fits_uhwi_p (sza) 2984 || !tree_fits_uhwi_p (szb) 2985 || !tree_int_cst_equal (sza, szb)) 2986 break; 2987 2988 /* Check that the data-refs have the same step. */ 2989 if (data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb)) != 0) 2990 break; 2991 2992 /* Check the types are compatible. 2993 ??? We don't distinguish this during sorting. */ 2994 if (!types_compatible_p (TREE_TYPE (DR_REF (dra)), 2995 TREE_TYPE (DR_REF (drb)))) 2996 break; 2997 2998 /* Check that the DR_INITs are compile-time constants. */ 2999 if (TREE_CODE (DR_INIT (dra)) != INTEGER_CST 3000 || TREE_CODE (DR_INIT (drb)) != INTEGER_CST) 3001 break; 3002 3003 /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb). */ 3004 HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra)); 3005 HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb)); 3006 HOST_WIDE_INT init_prev 3007 = TREE_INT_CST_LOW (DR_INIT (datarefs_copy[i-1])); 3008 gcc_assert (init_a <= init_b 3009 && init_a <= init_prev 3010 && init_prev <= init_b); 3011 3012 /* Do not place the same access in the interleaving chain twice. */ 3013 if (init_b == init_prev) 3014 { 3015 gcc_assert (gimple_uid (DR_STMT (datarefs_copy[i-1])) 3016 < gimple_uid (DR_STMT (drb))); 3017 /* Simply link in duplicates and fix up the chain below. */ 3018 } 3019 else 3020 { 3021 /* If init_b == init_a + the size of the type * k, we have an 3022 interleaving, and DRA is accessed before DRB. */ 3023 HOST_WIDE_INT type_size_a = tree_to_uhwi (sza); 3024 if (type_size_a == 0 3025 || (init_b - init_a) % type_size_a != 0) 3026 break; 3027 3028 /* If we have a store, the accesses are adjacent. This splits 3029 groups into chunks we support (we don't support vectorization 3030 of stores with gaps). */ 3031 if (!DR_IS_READ (dra) && init_b - init_prev != type_size_a) 3032 break; 3033 3034 /* If the step (if not zero or non-constant) is greater than the 3035 difference between data-refs' inits this splits groups into 3036 suitable sizes. */ 3037 if (tree_fits_shwi_p (DR_STEP (dra))) 3038 { 3039 HOST_WIDE_INT step = tree_to_shwi (DR_STEP (dra)); 3040 if (step != 0 && step <= (init_b - init_a)) 3041 break; 3042 } 3043 } 3044 3045 if (dump_enabled_p ()) 3046 dump_printf_loc (MSG_NOTE, vect_location, 3047 DR_IS_READ (dra) 3048 ? "Detected interleaving load %T and %T\n" 3049 : "Detected interleaving store %T and %T\n", 3050 DR_REF (dra), DR_REF (drb)); 3051 3052 /* Link the found element into the group list. */ 3053 if (!DR_GROUP_FIRST_ELEMENT (stmtinfo_a)) 3054 { 3055 DR_GROUP_FIRST_ELEMENT (stmtinfo_a) = stmtinfo_a; 3056 lastinfo = stmtinfo_a; 3057 } 3058 DR_GROUP_FIRST_ELEMENT (stmtinfo_b) = stmtinfo_a; 3059 DR_GROUP_NEXT_ELEMENT (lastinfo) = stmtinfo_b; 3060 lastinfo = stmtinfo_b; 3061 3062 if (init_b == init_prev 3063 && !to_fixup.add (DR_GROUP_FIRST_ELEMENT (stmtinfo_a)) 3064 && dump_enabled_p ()) 3065 dump_printf_loc (MSG_NOTE, vect_location, 3066 "Queuing group with duplicate access for fixup\n"); 3067 } 3068 } 3069 3070 /* Fixup groups with duplicate entries by splitting it. 
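   E.g. (illustrative): if the sorted chain is a[0] -> a[0]' -> a[4],
   the duplicate access to a[0] and every member following it in
   statement order are moved to a new group a[0]' -> a[4], which is
   queued again in case it still contains duplicates.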
*/ 3071 while (1) 3072 { 3073 hash_set<stmt_vec_info>::iterator it = to_fixup.begin (); 3074 if (!(it != to_fixup.end ())) 3075 break; 3076 stmt_vec_info grp = *it; 3077 to_fixup.remove (grp); 3078 3079 /* Find the earliest duplicate group member. */ 3080 unsigned first_duplicate = -1u; 3081 stmt_vec_info next, g = grp; 3082 while ((next = DR_GROUP_NEXT_ELEMENT (g))) 3083 { 3084 if (tree_int_cst_equal (DR_INIT (STMT_VINFO_DR_INFO (next)->dr), 3085 DR_INIT (STMT_VINFO_DR_INFO (g)->dr)) 3086 && gimple_uid (STMT_VINFO_STMT (next)) < first_duplicate) 3087 first_duplicate = gimple_uid (STMT_VINFO_STMT (next)); 3088 g = next; 3089 } 3090 if (first_duplicate == -1U) 3091 continue; 3092 3093 /* Then move all stmts after the first duplicate to a new group. 3094 Note this is a heuristic but one with the property that *it 3095 is fixed up completely. */ 3096 g = grp; 3097 stmt_vec_info newgroup = NULL, ng = grp; 3098 while ((next = DR_GROUP_NEXT_ELEMENT (g))) 3099 { 3100 if (gimple_uid (STMT_VINFO_STMT (next)) >= first_duplicate) 3101 { 3102 DR_GROUP_NEXT_ELEMENT (g) = DR_GROUP_NEXT_ELEMENT (next); 3103 if (!newgroup) 3104 newgroup = next; 3105 else 3106 DR_GROUP_NEXT_ELEMENT (ng) = next; 3107 ng = next; 3108 DR_GROUP_FIRST_ELEMENT (ng) = newgroup; 3109 } 3110 else 3111 g = DR_GROUP_NEXT_ELEMENT (g); 3112 } 3113 DR_GROUP_NEXT_ELEMENT (ng) = NULL; 3114 3115 /* Fixup the new group which still may contain duplicates. */ 3116 to_fixup.add (newgroup); 3117 } 3118 3119 FOR_EACH_VEC_ELT (datarefs_copy, i, dr) 3120 { 3121 dr_vec_info *dr_info = vinfo->lookup_dr (dr); 3122 if (STMT_VINFO_VECTORIZABLE (dr_info->stmt) 3123 && !vect_analyze_data_ref_access (dr_info)) 3124 { 3125 if (dump_enabled_p ()) 3126 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 3127 "not vectorized: complicated access pattern.\n"); 3128 3129 if (is_a <bb_vec_info> (vinfo)) 3130 { 3131 /* Mark the statement as not vectorizable. */ 3132 STMT_VINFO_VECTORIZABLE (dr_info->stmt) = false; 3133 continue; 3134 } 3135 else 3136 { 3137 datarefs_copy.release (); 3138 return opt_result::failure_at (dr_info->stmt->stmt, 3139 "not vectorized:" 3140 " complicated access pattern.\n"); 3141 } 3142 } 3143 } 3144 3145 datarefs_copy.release (); 3146 return opt_result::success (); 3147 } 3148 3149 /* Function vect_vfa_segment_size. 3150 3151 Input: 3152 DR_INFO: The data reference. 3153 LENGTH_FACTOR: segment length to consider. 3154 3155 Return a value suitable for the dr_with_seg_len::seg_len field. 3156 This is the "distance travelled" by the pointer from the first 3157 iteration in the segment to the last. Note that it does not include 3158 the size of the access; in effect it only describes the first byte. */ 3159 3160 static tree 3161 vect_vfa_segment_size (dr_vec_info *dr_info, tree length_factor) 3162 { 3163 length_factor = size_binop (MINUS_EXPR, 3164 fold_convert (sizetype, length_factor), 3165 size_one_node); 3166 return size_binop (MULT_EXPR, fold_convert (sizetype, DR_STEP (dr_info->dr)), 3167 length_factor); 3168 } 3169 3170 /* Return a value that, when added to abs (vect_vfa_segment_size (DR_INFO)), 3171 gives the worst-case number of bytes covered by the segment. 
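   For example (hypothetical): a plain 4-byte access with step 4 and a
   segment covering VF == 4 iterations has seg_len == (4 - 1) * 4 == 12
   and access size 4, i.e. at worst 16 bytes are covered; being the first
   element of a gap-free group of size 2 would double the access-size
   part.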
*/ 3172 3173 static unsigned HOST_WIDE_INT 3174 vect_vfa_access_size (dr_vec_info *dr_info) 3175 { 3176 stmt_vec_info stmt_vinfo = dr_info->stmt; 3177 tree ref_type = TREE_TYPE (DR_REF (dr_info->dr)); 3178 unsigned HOST_WIDE_INT ref_size = tree_to_uhwi (TYPE_SIZE_UNIT (ref_type)); 3179 unsigned HOST_WIDE_INT access_size = ref_size; 3180 if (DR_GROUP_FIRST_ELEMENT (stmt_vinfo)) 3181 { 3182 gcc_assert (DR_GROUP_FIRST_ELEMENT (stmt_vinfo) == stmt_vinfo); 3183 access_size *= DR_GROUP_SIZE (stmt_vinfo) - DR_GROUP_GAP (stmt_vinfo); 3184 } 3185 if (STMT_VINFO_VEC_STMT (stmt_vinfo) 3186 && (vect_supportable_dr_alignment (dr_info, false) 3187 == dr_explicit_realign_optimized)) 3188 { 3189 /* We might access a full vector's worth. */ 3190 tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo); 3191 access_size += tree_to_uhwi (TYPE_SIZE_UNIT (vectype)) - ref_size; 3192 } 3193 return access_size; 3194 } 3195 3196 /* Get the minimum alignment for all the scalar accesses that DR_INFO 3197 describes. */ 3198 3199 static unsigned int 3200 vect_vfa_align (dr_vec_info *dr_info) 3201 { 3202 return TYPE_ALIGN_UNIT (TREE_TYPE (DR_REF (dr_info->dr))); 3203 } 3204 3205 /* Function vect_no_alias_p. 3206 3207 Given data references A and B with equal base and offset, see whether 3208 the alias relation can be decided at compilation time. Return 1 if 3209 it can and the references alias, 0 if it can and the references do 3210 not alias, and -1 if we cannot decide at compile time. SEGMENT_LENGTH_A, 3211 SEGMENT_LENGTH_B, ACCESS_SIZE_A and ACCESS_SIZE_B are the equivalent 3212 of dr_with_seg_len::{seg_len,access_size} for A and B. */ 3213 3214 static int 3215 vect_compile_time_alias (dr_vec_info *a, dr_vec_info *b, 3216 tree segment_length_a, tree segment_length_b, 3217 unsigned HOST_WIDE_INT access_size_a, 3218 unsigned HOST_WIDE_INT access_size_b) 3219 { 3220 poly_offset_int offset_a = wi::to_poly_offset (DR_INIT (a->dr)); 3221 poly_offset_int offset_b = wi::to_poly_offset (DR_INIT (b->dr)); 3222 poly_uint64 const_length_a; 3223 poly_uint64 const_length_b; 3224 3225 /* For negative step, we need to adjust address range by TYPE_SIZE_UNIT 3226 bytes, e.g., int a[3] -> a[1] range is [a+4, a+16) instead of 3227 [a, a+12) */ 3228 if (tree_int_cst_compare (DR_STEP (a->dr), size_zero_node) < 0) 3229 { 3230 const_length_a = (-wi::to_poly_wide (segment_length_a)).force_uhwi (); 3231 offset_a -= const_length_a; 3232 } 3233 else 3234 const_length_a = tree_to_poly_uint64 (segment_length_a); 3235 if (tree_int_cst_compare (DR_STEP (b->dr), size_zero_node) < 0) 3236 { 3237 const_length_b = (-wi::to_poly_wide (segment_length_b)).force_uhwi (); 3238 offset_b -= const_length_b; 3239 } 3240 else 3241 const_length_b = tree_to_poly_uint64 (segment_length_b); 3242 3243 const_length_a += access_size_a; 3244 const_length_b += access_size_b; 3245 3246 if (ranges_known_overlap_p (offset_a, const_length_a, 3247 offset_b, const_length_b)) 3248 return 1; 3249 3250 if (!ranges_maybe_overlap_p (offset_a, const_length_a, 3251 offset_b, const_length_b)) 3252 return 0; 3253 3254 return -1; 3255 } 3256 3257 /* Return true if the minimum nonzero dependence distance for loop LOOP_DEPTH 3258 in DDR is >= VF. */ 3259 3260 static bool 3261 dependence_distance_ge_vf (data_dependence_relation *ddr, 3262 unsigned int loop_depth, poly_uint64 vf) 3263 { 3264 if (DDR_ARE_DEPENDENT (ddr) != NULL_TREE 3265 || DDR_NUM_DIST_VECTS (ddr) == 0) 3266 return false; 3267 3268 /* If the dependence is exact, we should have limited the VF instead. 
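     Otherwise every recorded distance is checked below; e.g.
     (illustrative) with distances 0 and 8 at LOOP_DEPTH and VF == 4,
     each nonzero distance is >= VF, so the accesses of one vector
     iteration cannot overlap and no runtime alias check is needed for
     this DDR.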
*/ 3269 gcc_checking_assert (DDR_COULD_BE_INDEPENDENT_P (ddr)); 3270 3271 unsigned int i; 3272 lambda_vector dist_v; 3273 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v) 3274 { 3275 HOST_WIDE_INT dist = dist_v[loop_depth]; 3276 if (dist != 0 3277 && !(dist > 0 && DDR_REVERSED_P (ddr)) 3278 && maybe_lt ((unsigned HOST_WIDE_INT) abs_hwi (dist), vf)) 3279 return false; 3280 } 3281 3282 if (dump_enabled_p ()) 3283 dump_printf_loc (MSG_NOTE, vect_location, 3284 "dependence distance between %T and %T is >= VF\n", 3285 DR_REF (DDR_A (ddr)), DR_REF (DDR_B (ddr))); 3286 3287 return true; 3288 } 3289 3290 /* Dump LOWER_BOUND using flags DUMP_KIND. Dumps are known to be enabled. */ 3291 3292 static void 3293 dump_lower_bound (dump_flags_t dump_kind, const vec_lower_bound &lower_bound) 3294 { 3295 dump_printf (dump_kind, "%s (%T) >= ", 3296 lower_bound.unsigned_p ? "unsigned" : "abs", 3297 lower_bound.expr); 3298 dump_dec (dump_kind, lower_bound.min_value); 3299 } 3300 3301 /* Record that the vectorized loop requires the vec_lower_bound described 3302 by EXPR, UNSIGNED_P and MIN_VALUE. */ 3303 3304 static void 3305 vect_check_lower_bound (loop_vec_info loop_vinfo, tree expr, bool unsigned_p, 3306 poly_uint64 min_value) 3307 { 3308 vec<vec_lower_bound> lower_bounds = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo); 3309 for (unsigned int i = 0; i < lower_bounds.length (); ++i) 3310 if (operand_equal_p (lower_bounds[i].expr, expr, 0)) 3311 { 3312 unsigned_p &= lower_bounds[i].unsigned_p; 3313 min_value = upper_bound (lower_bounds[i].min_value, min_value); 3314 if (lower_bounds[i].unsigned_p != unsigned_p 3315 || maybe_lt (lower_bounds[i].min_value, min_value)) 3316 { 3317 lower_bounds[i].unsigned_p = unsigned_p; 3318 lower_bounds[i].min_value = min_value; 3319 if (dump_enabled_p ()) 3320 { 3321 dump_printf_loc (MSG_NOTE, vect_location, 3322 "updating run-time check to "); 3323 dump_lower_bound (MSG_NOTE, lower_bounds[i]); 3324 dump_printf (MSG_NOTE, "\n"); 3325 } 3326 } 3327 return; 3328 } 3329 3330 vec_lower_bound lower_bound (expr, unsigned_p, min_value); 3331 if (dump_enabled_p ()) 3332 { 3333 dump_printf_loc (MSG_NOTE, vect_location, "need a run-time check that "); 3334 dump_lower_bound (MSG_NOTE, lower_bound); 3335 dump_printf (MSG_NOTE, "\n"); 3336 } 3337 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).safe_push (lower_bound); 3338 } 3339 3340 /* Return true if it's unlikely that the step of the vectorized form of DR_INFO 3341 will span fewer than GAP bytes. */ 3342 3343 static bool 3344 vect_small_gap_p (loop_vec_info loop_vinfo, dr_vec_info *dr_info, 3345 poly_int64 gap) 3346 { 3347 stmt_vec_info stmt_info = dr_info->stmt; 3348 HOST_WIDE_INT count 3349 = estimated_poly_value (LOOP_VINFO_VECT_FACTOR (loop_vinfo)); 3350 if (DR_GROUP_FIRST_ELEMENT (stmt_info)) 3351 count *= DR_GROUP_SIZE (DR_GROUP_FIRST_ELEMENT (stmt_info)); 3352 return (estimated_poly_value (gap) 3353 <= count * vect_get_scalar_dr_size (dr_info)); 3354 } 3355 3356 /* Return true if we know that there is no alias between DR_INFO_A and 3357 DR_INFO_B when abs (DR_STEP (DR_INFO_A->dr)) >= N for some N. 3358 When returning true, set *LOWER_BOUND_OUT to this N. */ 3359 3360 static bool 3361 vectorizable_with_step_bound_p (dr_vec_info *dr_info_a, dr_vec_info *dr_info_b, 3362 poly_uint64 *lower_bound_out) 3363 { 3364 /* Check that there is a constant gap of known sign between DR_A 3365 and DR_B. 
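     For example (hypothetical): two accesses with the same variable step
     whose DR_INITs differ by 40 bytes and whose access size is 4 bytes
     give a lower bound of 40 + 4 == 44, i.e. they cannot alias whenever
     abs (DR_STEP) >= 44 bytes.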
*/ 3366 data_reference *dr_a = dr_info_a->dr; 3367 data_reference *dr_b = dr_info_b->dr; 3368 poly_int64 init_a, init_b; 3369 if (!operand_equal_p (DR_BASE_ADDRESS (dr_a), DR_BASE_ADDRESS (dr_b), 0) 3370 || !operand_equal_p (DR_OFFSET (dr_a), DR_OFFSET (dr_b), 0) 3371 || !operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0) 3372 || !poly_int_tree_p (DR_INIT (dr_a), &init_a) 3373 || !poly_int_tree_p (DR_INIT (dr_b), &init_b) 3374 || !ordered_p (init_a, init_b)) 3375 return false; 3376 3377 /* Sort DR_A and DR_B by the address they access. */ 3378 if (maybe_lt (init_b, init_a)) 3379 { 3380 std::swap (init_a, init_b); 3381 std::swap (dr_info_a, dr_info_b); 3382 std::swap (dr_a, dr_b); 3383 } 3384 3385 /* If the two accesses could be dependent within a scalar iteration, 3386 make sure that we'd retain their order. */ 3387 if (maybe_gt (init_a + vect_get_scalar_dr_size (dr_info_a), init_b) 3388 && !vect_preserves_scalar_order_p (dr_info_a, dr_info_b)) 3389 return false; 3390 3391 /* There is no alias if abs (DR_STEP) is greater than or equal to 3392 the bytes spanned by the combination of the two accesses. */ 3393 *lower_bound_out = init_b + vect_get_scalar_dr_size (dr_info_b) - init_a; 3394 return true; 3395 } 3396 3397 /* Function vect_prune_runtime_alias_test_list. 3398 3399 Prune a list of ddrs to be tested at run-time by versioning for alias. 3400 Merge several alias checks into one if possible. 3401 Return FALSE if resulting list of ddrs is longer then allowed by 3402 PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE. */ 3403 3404 opt_result 3405 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo) 3406 { 3407 typedef pair_hash <tree_operand_hash, tree_operand_hash> tree_pair_hash; 3408 hash_set <tree_pair_hash> compared_objects; 3409 3410 vec<ddr_p> may_alias_ddrs = LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo); 3411 vec<dr_with_seg_len_pair_t> &comp_alias_ddrs 3412 = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo); 3413 vec<vec_object_pair> &check_unequal_addrs 3414 = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo); 3415 poly_uint64 vect_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo); 3416 tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo); 3417 3418 ddr_p ddr; 3419 unsigned int i; 3420 tree length_factor; 3421 3422 DUMP_VECT_SCOPE ("vect_prune_runtime_alias_test_list"); 3423 3424 /* Step values are irrelevant for aliasing if the number of vector 3425 iterations is equal to the number of scalar iterations (which can 3426 happen for fully-SLP loops). */ 3427 bool ignore_step_p = known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U); 3428 3429 if (!ignore_step_p) 3430 { 3431 /* Convert the checks for nonzero steps into bound tests. */ 3432 tree value; 3433 FOR_EACH_VEC_ELT (LOOP_VINFO_CHECK_NONZERO (loop_vinfo), i, value) 3434 vect_check_lower_bound (loop_vinfo, value, true, 1); 3435 } 3436 3437 if (may_alias_ddrs.is_empty ()) 3438 return opt_result::success (); 3439 3440 comp_alias_ddrs.create (may_alias_ddrs.length ()); 3441 3442 unsigned int loop_depth 3443 = index_in_loop_nest (LOOP_VINFO_LOOP (loop_vinfo)->num, 3444 LOOP_VINFO_LOOP_NEST (loop_vinfo)); 3445 3446 /* First, we collect all data ref pairs for aliasing checks. */ 3447 FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr) 3448 { 3449 int comp_res; 3450 poly_uint64 lower_bound; 3451 tree segment_length_a, segment_length_b; 3452 unsigned HOST_WIDE_INT access_size_a, access_size_b; 3453 unsigned int align_a, align_b; 3454 3455 /* Ignore the alias if the VF we chose ended up being no greater 3456 than the dependence distance. 
*/ 3457 if (dependence_distance_ge_vf (ddr, loop_depth, vect_factor)) 3458 continue; 3459 3460 if (DDR_OBJECT_A (ddr)) 3461 { 3462 vec_object_pair new_pair (DDR_OBJECT_A (ddr), DDR_OBJECT_B (ddr)); 3463 if (!compared_objects.add (new_pair)) 3464 { 3465 if (dump_enabled_p ()) 3466 dump_printf_loc (MSG_NOTE, vect_location, 3467 "checking that %T and %T" 3468 " have different addresses\n", 3469 new_pair.first, new_pair.second); 3470 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).safe_push (new_pair); 3471 } 3472 continue; 3473 } 3474 3475 dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (DDR_A (ddr)); 3476 stmt_vec_info stmt_info_a = dr_info_a->stmt; 3477 3478 dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (DDR_B (ddr)); 3479 stmt_vec_info stmt_info_b = dr_info_b->stmt; 3480 3481 /* Skip the pair if inter-iteration dependencies are irrelevant 3482 and intra-iteration dependencies are guaranteed to be honored. */ 3483 if (ignore_step_p 3484 && (vect_preserves_scalar_order_p (dr_info_a, dr_info_b) 3485 || vectorizable_with_step_bound_p (dr_info_a, dr_info_b, 3486 &lower_bound))) 3487 { 3488 if (dump_enabled_p ()) 3489 dump_printf_loc (MSG_NOTE, vect_location, 3490 "no need for alias check between " 3491 "%T and %T when VF is 1\n", 3492 DR_REF (dr_info_a->dr), DR_REF (dr_info_b->dr)); 3493 continue; 3494 } 3495 3496 /* See whether we can handle the alias using a bounds check on 3497 the step, and whether that's likely to be the best approach. 3498 (It might not be, for example, if the minimum step is much larger 3499 than the number of bytes handled by one vector iteration.) */ 3500 if (!ignore_step_p 3501 && TREE_CODE (DR_STEP (dr_info_a->dr)) != INTEGER_CST 3502 && vectorizable_with_step_bound_p (dr_info_a, dr_info_b, 3503 &lower_bound) 3504 && (vect_small_gap_p (loop_vinfo, dr_info_a, lower_bound) 3505 || vect_small_gap_p (loop_vinfo, dr_info_b, lower_bound))) 3506 { 3507 bool unsigned_p = dr_known_forward_stride_p (dr_info_a->dr); 3508 if (dump_enabled_p ()) 3509 { 3510 dump_printf_loc (MSG_NOTE, vect_location, "no alias between " 3511 "%T and %T when the step %T is outside ", 3512 DR_REF (dr_info_a->dr), 3513 DR_REF (dr_info_b->dr), 3514 DR_STEP (dr_info_a->dr)); 3515 if (unsigned_p) 3516 dump_printf (MSG_NOTE, "[0"); 3517 else 3518 { 3519 dump_printf (MSG_NOTE, "("); 3520 dump_dec (MSG_NOTE, poly_int64 (-lower_bound)); 3521 } 3522 dump_printf (MSG_NOTE, ", "); 3523 dump_dec (MSG_NOTE, lower_bound); 3524 dump_printf (MSG_NOTE, ")\n"); 3525 } 3526 vect_check_lower_bound (loop_vinfo, DR_STEP (dr_info_a->dr), 3527 unsigned_p, lower_bound); 3528 continue; 3529 } 3530 3531 stmt_vec_info dr_group_first_a = DR_GROUP_FIRST_ELEMENT (stmt_info_a); 3532 if (dr_group_first_a) 3533 { 3534 stmt_info_a = dr_group_first_a; 3535 dr_info_a = STMT_VINFO_DR_INFO (stmt_info_a); 3536 } 3537 3538 stmt_vec_info dr_group_first_b = DR_GROUP_FIRST_ELEMENT (stmt_info_b); 3539 if (dr_group_first_b) 3540 { 3541 stmt_info_b = dr_group_first_b; 3542 dr_info_b = STMT_VINFO_DR_INFO (stmt_info_b); 3543 } 3544 3545 if (ignore_step_p) 3546 { 3547 segment_length_a = size_zero_node; 3548 segment_length_b = size_zero_node; 3549 } 3550 else 3551 { 3552 if (!operand_equal_p (DR_STEP (dr_info_a->dr), 3553 DR_STEP (dr_info_b->dr), 0)) 3554 length_factor = scalar_loop_iters; 3555 else 3556 length_factor = size_int (vect_factor); 3557 segment_length_a = vect_vfa_segment_size (dr_info_a, length_factor); 3558 segment_length_b = vect_vfa_segment_size (dr_info_b, length_factor); 3559 } 3560 access_size_a = vect_vfa_access_size (dr_info_a); 3561 
access_size_b = vect_vfa_access_size (dr_info_b); 3562 align_a = vect_vfa_align (dr_info_a); 3563 align_b = vect_vfa_align (dr_info_b); 3564 3565 comp_res = data_ref_compare_tree (DR_BASE_ADDRESS (dr_info_a->dr), 3566 DR_BASE_ADDRESS (dr_info_b->dr)); 3567 if (comp_res == 0) 3568 comp_res = data_ref_compare_tree (DR_OFFSET (dr_info_a->dr), 3569 DR_OFFSET (dr_info_b->dr)); 3570 3571 /* See whether the alias is known at compilation time. */ 3572 if (comp_res == 0 3573 && TREE_CODE (DR_STEP (dr_info_a->dr)) == INTEGER_CST 3574 && TREE_CODE (DR_STEP (dr_info_b->dr)) == INTEGER_CST 3575 && poly_int_tree_p (segment_length_a) 3576 && poly_int_tree_p (segment_length_b)) 3577 { 3578 int res = vect_compile_time_alias (dr_info_a, dr_info_b, 3579 segment_length_a, 3580 segment_length_b, 3581 access_size_a, 3582 access_size_b); 3583 if (res >= 0 && dump_enabled_p ()) 3584 { 3585 dump_printf_loc (MSG_NOTE, vect_location, 3586 "can tell at compile time that %T and %T", 3587 DR_REF (dr_info_a->dr), DR_REF (dr_info_b->dr)); 3588 if (res == 0) 3589 dump_printf (MSG_NOTE, " do not alias\n"); 3590 else 3591 dump_printf (MSG_NOTE, " alias\n"); 3592 } 3593 3594 if (res == 0) 3595 continue; 3596 3597 if (res == 1) 3598 return opt_result::failure_at (stmt_info_b->stmt, 3599 "not vectorized:" 3600 " compilation time alias: %G%G", 3601 stmt_info_a->stmt, 3602 stmt_info_b->stmt); 3603 } 3604 3605 dr_with_seg_len_pair_t dr_with_seg_len_pair 3606 (dr_with_seg_len (dr_info_a->dr, segment_length_a, 3607 access_size_a, align_a), 3608 dr_with_seg_len (dr_info_b->dr, segment_length_b, 3609 access_size_b, align_b)); 3610 3611 /* Canonicalize pairs by sorting the two DR members. */ 3612 if (comp_res > 0) 3613 std::swap (dr_with_seg_len_pair.first, dr_with_seg_len_pair.second); 3614 3615 comp_alias_ddrs.safe_push (dr_with_seg_len_pair); 3616 } 3617 3618 prune_runtime_alias_test_list (&comp_alias_ddrs, vect_factor); 3619 3620 unsigned int count = (comp_alias_ddrs.length () 3621 + check_unequal_addrs.length ()); 3622 3623 if (dump_enabled_p ()) 3624 dump_printf_loc (MSG_NOTE, vect_location, 3625 "improved number of alias checks from %d to %d\n", 3626 may_alias_ddrs.length (), count); 3627 if ((int) count > PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS)) 3628 return opt_result::failure_at 3629 (vect_location, 3630 "number of versioning for alias " 3631 "run-time tests exceeds %d " 3632 "(--param vect-max-version-for-alias-checks)\n", 3633 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS)); 3634 3635 return opt_result::success (); 3636 } 3637 3638 /* Check whether we can use an internal function for a gather load 3639 or scatter store. READ_P is true for loads and false for stores. 3640 MASKED_P is true if the load or store is conditional. MEMORY_TYPE is 3641 the type of the memory elements being loaded or stored. OFFSET_BITS 3642 is the number of bits in each scalar offset and OFFSET_SIGN is the 3643 sign of the offset. SCALE is the amount by which the offset should 3644 be multiplied *after* it has been converted to address width. 3645 3646 Return true if the function is supported, storing the function 3647 id in *IFN_OUT and the type of a vector element in *ELEMENT_TYPE_OUT. 
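     For example, a masked gather of 32-bit elements with a 32-bit signed
     offset is checked against IFN_MASK_GATHER_LOAD below; whether that
     succeeds is decided by internal_gather_scatter_fn_supported_p for the
     current target.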
*/ 3648 3649 bool 3650 vect_gather_scatter_fn_p (bool read_p, bool masked_p, tree vectype, 3651 tree memory_type, unsigned int offset_bits, 3652 signop offset_sign, int scale, 3653 internal_fn *ifn_out, tree *element_type_out) 3654 { 3655 unsigned int memory_bits = tree_to_uhwi (TYPE_SIZE (memory_type)); 3656 unsigned int element_bits = tree_to_uhwi (TYPE_SIZE (TREE_TYPE (vectype))); 3657 if (offset_bits > element_bits) 3658 /* Internal functions require the offset to be the same width as 3659 the vector elements. We can extend narrower offsets, but it isn't 3660 safe to truncate wider offsets. */ 3661 return false; 3662 3663 if (element_bits != memory_bits) 3664 /* For now the vector elements must be the same width as the 3665 memory elements. */ 3666 return false; 3667 3668 /* Work out which function we need. */ 3669 internal_fn ifn; 3670 if (read_p) 3671 ifn = masked_p ? IFN_MASK_GATHER_LOAD : IFN_GATHER_LOAD; 3672 else 3673 ifn = masked_p ? IFN_MASK_SCATTER_STORE : IFN_SCATTER_STORE; 3674 3675 /* Test whether the target supports this combination. */ 3676 if (!internal_gather_scatter_fn_supported_p (ifn, vectype, memory_type, 3677 offset_sign, scale)) 3678 return false; 3679 3680 *ifn_out = ifn; 3681 *element_type_out = TREE_TYPE (vectype); 3682 return true; 3683 } 3684 3685 /* STMT_INFO is a call to an internal gather load or scatter store function. 3686 Describe the operation in INFO. */ 3687 3688 static void 3689 vect_describe_gather_scatter_call (stmt_vec_info stmt_info, 3690 gather_scatter_info *info) 3691 { 3692 gcall *call = as_a <gcall *> (stmt_info->stmt); 3693 tree vectype = STMT_VINFO_VECTYPE (stmt_info); 3694 data_reference *dr = STMT_VINFO_DATA_REF (stmt_info); 3695 3696 info->ifn = gimple_call_internal_fn (call); 3697 info->decl = NULL_TREE; 3698 info->base = gimple_call_arg (call, 0); 3699 info->offset = gimple_call_arg (call, 1); 3700 info->offset_dt = vect_unknown_def_type; 3701 info->offset_vectype = NULL_TREE; 3702 info->scale = TREE_INT_CST_LOW (gimple_call_arg (call, 2)); 3703 info->element_type = TREE_TYPE (vectype); 3704 info->memory_type = TREE_TYPE (DR_REF (dr)); 3705 } 3706 3707 /* Return true if a non-affine read or write in STMT_INFO is suitable for a 3708 gather load or scatter store. Describe the operation in *INFO if so. */ 3709 3710 bool 3711 vect_check_gather_scatter (stmt_vec_info stmt_info, loop_vec_info loop_vinfo, 3712 gather_scatter_info *info) 3713 { 3714 HOST_WIDE_INT scale = 1; 3715 poly_int64 pbitpos, pbitsize; 3716 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); 3717 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info); 3718 tree offtype = NULL_TREE; 3719 tree decl = NULL_TREE, base, off; 3720 tree vectype = STMT_VINFO_VECTYPE (stmt_info); 3721 tree memory_type = TREE_TYPE (DR_REF (dr)); 3722 machine_mode pmode; 3723 int punsignedp, reversep, pvolatilep = 0; 3724 internal_fn ifn; 3725 tree element_type; 3726 bool masked_p = false; 3727 3728 /* See whether this is already a call to a gather/scatter internal function. 3729 If not, see whether it's a masked load or store. */ 3730 gcall *call = dyn_cast <gcall *> (stmt_info->stmt); 3731 if (call && gimple_call_internal_p (call)) 3732 { 3733 ifn = gimple_call_internal_fn (call); 3734 if (internal_gather_scatter_fn_p (ifn)) 3735 { 3736 vect_describe_gather_scatter_call (stmt_info, info); 3737 return true; 3738 } 3739 masked_p = (ifn == IFN_MASK_LOAD || ifn == IFN_MASK_STORE); 3740 } 3741 3742 /* True if we should aim to use internal functions rather than 3743 built-in functions. 
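     (i.e. the IFN_*GATHER_LOAD/IFN_*SCATTER_STORE internal functions rather
     than the target's builtin_gather/builtin_scatter hooks).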
*/
3744 bool use_ifn_p = (DR_IS_READ (dr)
3745 ? supports_vec_gather_load_p ()
3746 : supports_vec_scatter_store_p ());
3747
3748 base = DR_REF (dr);
3749 /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
3750 see if we can use the def stmt of the address. */
3751 if (masked_p
3752 && TREE_CODE (base) == MEM_REF
3753 && TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME
3754 && integer_zerop (TREE_OPERAND (base, 1))
3755 && !expr_invariant_in_loop_p (loop, TREE_OPERAND (base, 0)))
3756 {
3757 gimple *def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (base, 0));
3758 if (is_gimple_assign (def_stmt)
3759 && gimple_assign_rhs_code (def_stmt) == ADDR_EXPR)
3760 base = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
3761 }
3762
3763 /* The gather and scatter builtins need an address of the form
3764 loop_invariant + vector * {1, 2, 4, 8}
3765 or
3766 loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
3767 Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
3768 of loop invariants/SSA_NAMEs defined in the loop, with casts,
3769 multiplications and additions in it. To get a vector, we need
3770 a single SSA_NAME that will be defined in the loop and will
3771 contain everything that is not loop invariant and that can be
3772 vectorized. The following code attempts to find such a preexisting
3773 SSA_NAME OFF and put the loop invariants into a tree BASE
3774 that can be gimplified before the loop. */
3775 base = get_inner_reference (base, &pbitsize, &pbitpos, &off, &pmode,
3776 &punsignedp, &reversep, &pvolatilep);
3777 if (reversep)
3778 return false;
3779
3780 poly_int64 pbytepos = exact_div (pbitpos, BITS_PER_UNIT);
3781
3782 if (TREE_CODE (base) == MEM_REF)
3783 {
3784 if (!integer_zerop (TREE_OPERAND (base, 1)))
3785 {
3786 if (off == NULL_TREE)
3787 off = wide_int_to_tree (sizetype, mem_ref_offset (base));
3788 else
3789 off = size_binop (PLUS_EXPR, off,
3790 fold_convert (sizetype, TREE_OPERAND (base, 1)));
3791 }
3792 base = TREE_OPERAND (base, 0);
3793 }
3794 else
3795 base = build_fold_addr_expr (base);
3796
3797 if (off == NULL_TREE)
3798 off = size_zero_node;
3799
3800 /* If base is not loop invariant, then either off is 0, in which case we
3801 start with just the constant offset in the loop invariant BASE and
3802 continue with base as OFF, or we give up.
3803 We could handle that case by gimplifying the addition of base + off
3804 into some SSA_NAME and use that as off, but for now punt. */
3805 if (!expr_invariant_in_loop_p (loop, base))
3806 {
3807 if (!integer_zerop (off))
3808 return false;
3809 off = base;
3810 base = size_int (pbytepos);
3811 }
3812 /* Otherwise put base + constant offset into the loop invariant BASE
3813 and continue with OFF. */
3814 else
3815 {
3816 base = fold_convert (sizetype, base);
3817 base = size_binop (PLUS_EXPR, base, size_int (pbytepos));
3818 }
3819
3820 /* OFF at this point may be either an SSA_NAME or some tree expression
3821 from get_inner_reference. Try to peel off loop invariants from it
3822 into BASE as long as possible.
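     For example, if OFF were (sizetype) n_1 * 4 + inv_2 with inv_2 loop
     invariant, the loop below would fold inv_2 into BASE, treat the
     multiplication by 4 as the SCALE (when the target supports that scale)
     and leave the loop-variant SSA_NAME n_1 as OFF.  (n_1 and inv_2 are
     purely illustrative names.)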
*/ 3823 STRIP_NOPS (off); 3824 while (offtype == NULL_TREE) 3825 { 3826 enum tree_code code; 3827 tree op0, op1, add = NULL_TREE; 3828 3829 if (TREE_CODE (off) == SSA_NAME) 3830 { 3831 gimple *def_stmt = SSA_NAME_DEF_STMT (off); 3832 3833 if (expr_invariant_in_loop_p (loop, off)) 3834 return false; 3835 3836 if (gimple_code (def_stmt) != GIMPLE_ASSIGN) 3837 break; 3838 3839 op0 = gimple_assign_rhs1 (def_stmt); 3840 code = gimple_assign_rhs_code (def_stmt); 3841 op1 = gimple_assign_rhs2 (def_stmt); 3842 } 3843 else 3844 { 3845 if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS) 3846 return false; 3847 code = TREE_CODE (off); 3848 extract_ops_from_tree (off, &code, &op0, &op1); 3849 } 3850 switch (code) 3851 { 3852 case POINTER_PLUS_EXPR: 3853 case PLUS_EXPR: 3854 if (expr_invariant_in_loop_p (loop, op0)) 3855 { 3856 add = op0; 3857 off = op1; 3858 do_add: 3859 add = fold_convert (sizetype, add); 3860 if (scale != 1) 3861 add = size_binop (MULT_EXPR, add, size_int (scale)); 3862 base = size_binop (PLUS_EXPR, base, add); 3863 continue; 3864 } 3865 if (expr_invariant_in_loop_p (loop, op1)) 3866 { 3867 add = op1; 3868 off = op0; 3869 goto do_add; 3870 } 3871 break; 3872 case MINUS_EXPR: 3873 if (expr_invariant_in_loop_p (loop, op1)) 3874 { 3875 add = fold_convert (sizetype, op1); 3876 add = size_binop (MINUS_EXPR, size_zero_node, add); 3877 off = op0; 3878 goto do_add; 3879 } 3880 break; 3881 case MULT_EXPR: 3882 if (scale == 1 && tree_fits_shwi_p (op1)) 3883 { 3884 int new_scale = tree_to_shwi (op1); 3885 /* Only treat this as a scaling operation if the target 3886 supports it. */ 3887 if (use_ifn_p 3888 && !vect_gather_scatter_fn_p (DR_IS_READ (dr), masked_p, 3889 vectype, memory_type, 1, 3890 TYPE_SIGN (TREE_TYPE (op0)), 3891 new_scale, &ifn, 3892 &element_type)) 3893 break; 3894 scale = new_scale; 3895 off = op0; 3896 continue; 3897 } 3898 break; 3899 case SSA_NAME: 3900 off = op0; 3901 continue; 3902 CASE_CONVERT: 3903 if (!POINTER_TYPE_P (TREE_TYPE (op0)) 3904 && !INTEGRAL_TYPE_P (TREE_TYPE (op0))) 3905 break; 3906 if (TYPE_PRECISION (TREE_TYPE (op0)) 3907 == TYPE_PRECISION (TREE_TYPE (off))) 3908 { 3909 off = op0; 3910 continue; 3911 } 3912 3913 /* The internal functions need the offset to be the same width 3914 as the elements of VECTYPE. Don't include operations that 3915 cast the offset from that width to a different width. */ 3916 if (use_ifn_p 3917 && (int_size_in_bytes (TREE_TYPE (vectype)) 3918 == int_size_in_bytes (TREE_TYPE (off)))) 3919 break; 3920 3921 if (TYPE_PRECISION (TREE_TYPE (op0)) 3922 < TYPE_PRECISION (TREE_TYPE (off))) 3923 { 3924 off = op0; 3925 offtype = TREE_TYPE (off); 3926 STRIP_NOPS (off); 3927 continue; 3928 } 3929 break; 3930 default: 3931 break; 3932 } 3933 break; 3934 } 3935 3936 /* If at the end OFF still isn't a SSA_NAME or isn't 3937 defined in the loop, punt. 
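     (If OFF is loop invariant there is no per-iteration varying part left
     for a gather/scatter offset vector to describe, so presumably the
     access is better treated as an ordinary or invariant reference.)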
*/ 3938 if (TREE_CODE (off) != SSA_NAME 3939 || expr_invariant_in_loop_p (loop, off)) 3940 return false; 3941 3942 if (offtype == NULL_TREE) 3943 offtype = TREE_TYPE (off); 3944 3945 if (use_ifn_p) 3946 { 3947 if (!vect_gather_scatter_fn_p (DR_IS_READ (dr), masked_p, vectype, 3948 memory_type, TYPE_PRECISION (offtype), 3949 TYPE_SIGN (offtype), scale, &ifn, 3950 &element_type)) 3951 return false; 3952 } 3953 else 3954 { 3955 if (DR_IS_READ (dr)) 3956 { 3957 if (targetm.vectorize.builtin_gather) 3958 decl = targetm.vectorize.builtin_gather (vectype, offtype, scale); 3959 } 3960 else 3961 { 3962 if (targetm.vectorize.builtin_scatter) 3963 decl = targetm.vectorize.builtin_scatter (vectype, offtype, scale); 3964 } 3965 3966 if (!decl) 3967 return false; 3968 3969 ifn = IFN_LAST; 3970 element_type = TREE_TYPE (vectype); 3971 } 3972 3973 info->ifn = ifn; 3974 info->decl = decl; 3975 info->base = base; 3976 info->offset = off; 3977 info->offset_dt = vect_unknown_def_type; 3978 info->offset_vectype = NULL_TREE; 3979 info->scale = scale; 3980 info->element_type = element_type; 3981 info->memory_type = memory_type; 3982 return true; 3983 } 3984 3985 /* Find the data references in STMT, analyze them with respect to LOOP and 3986 append them to DATAREFS. Return false if datarefs in this stmt cannot 3987 be handled. */ 3988 3989 opt_result 3990 vect_find_stmt_data_reference (loop_p loop, gimple *stmt, 3991 vec<data_reference_p> *datarefs) 3992 { 3993 /* We can ignore clobbers for dataref analysis - they are removed during 3994 loop vectorization and BB vectorization checks dependences with a 3995 stmt walk. */ 3996 if (gimple_clobber_p (stmt)) 3997 return opt_result::success (); 3998 3999 if (gimple_has_volatile_ops (stmt)) 4000 return opt_result::failure_at (stmt, "not vectorized: volatile type: %G", 4001 stmt); 4002 4003 if (stmt_can_throw_internal (cfun, stmt)) 4004 return opt_result::failure_at (stmt, 4005 "not vectorized:" 4006 " statement can throw an exception: %G", 4007 stmt); 4008 4009 auto_vec<data_reference_p, 2> refs; 4010 opt_result res = find_data_references_in_stmt (loop, stmt, &refs); 4011 if (!res) 4012 return res; 4013 4014 if (refs.is_empty ()) 4015 return opt_result::success (); 4016 4017 if (refs.length () > 1) 4018 return opt_result::failure_at (stmt, 4019 "not vectorized:" 4020 " more than one data ref in stmt: %G", stmt); 4021 4022 if (gcall *call = dyn_cast <gcall *> (stmt)) 4023 if (!gimple_call_internal_p (call) 4024 || (gimple_call_internal_fn (call) != IFN_MASK_LOAD 4025 && gimple_call_internal_fn (call) != IFN_MASK_STORE)) 4026 return opt_result::failure_at (stmt, 4027 "not vectorized: dr in a call %G", stmt); 4028 4029 data_reference_p dr = refs.pop (); 4030 if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF 4031 && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1))) 4032 return opt_result::failure_at (stmt, 4033 "not vectorized:" 4034 " statement is bitfield access %G", stmt); 4035 4036 if (DR_BASE_ADDRESS (dr) 4037 && TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST) 4038 return opt_result::failure_at (stmt, 4039 "not vectorized:" 4040 " base addr of dr is a constant\n"); 4041 4042 /* Check whether this may be a SIMD lane access and adjust the 4043 DR to make it easier for us to handle it. 
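     The shape matched below is roughly an access whose DR_OFFSET multiplies
     the result of an IFN_GOMP_SIMD_LANE call (for this loop's simduid) by
     the element size, as created by OpenMP SIMD lowering; such a ref is
     rewritten into a DR with zero offset and a step equal to the element
     size, and marked as a simd-lane access.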
*/ 4044 if (loop 4045 && loop->simduid 4046 && (!DR_BASE_ADDRESS (dr) 4047 || !DR_OFFSET (dr) 4048 || !DR_INIT (dr) 4049 || !DR_STEP (dr))) 4050 { 4051 struct data_reference *newdr 4052 = create_data_ref (NULL, loop_containing_stmt (stmt), DR_REF (dr), stmt, 4053 DR_IS_READ (dr), DR_IS_CONDITIONAL_IN_STMT (dr)); 4054 if (DR_BASE_ADDRESS (newdr) 4055 && DR_OFFSET (newdr) 4056 && DR_INIT (newdr) 4057 && DR_STEP (newdr) 4058 && integer_zerop (DR_STEP (newdr))) 4059 { 4060 tree off = DR_OFFSET (newdr); 4061 STRIP_NOPS (off); 4062 if (TREE_CODE (DR_INIT (newdr)) == INTEGER_CST 4063 && TREE_CODE (off) == MULT_EXPR 4064 && tree_fits_uhwi_p (TREE_OPERAND (off, 1))) 4065 { 4066 tree step = TREE_OPERAND (off, 1); 4067 off = TREE_OPERAND (off, 0); 4068 STRIP_NOPS (off); 4069 if (CONVERT_EXPR_P (off) 4070 && (TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off, 0))) 4071 < TYPE_PRECISION (TREE_TYPE (off)))) 4072 off = TREE_OPERAND (off, 0); 4073 if (TREE_CODE (off) == SSA_NAME) 4074 { 4075 gimple *def = SSA_NAME_DEF_STMT (off); 4076 tree reft = TREE_TYPE (DR_REF (newdr)); 4077 if (is_gimple_call (def) 4078 && gimple_call_internal_p (def) 4079 && (gimple_call_internal_fn (def) == IFN_GOMP_SIMD_LANE)) 4080 { 4081 tree arg = gimple_call_arg (def, 0); 4082 gcc_assert (TREE_CODE (arg) == SSA_NAME); 4083 arg = SSA_NAME_VAR (arg); 4084 if (arg == loop->simduid 4085 /* For now. */ 4086 && tree_int_cst_equal (TYPE_SIZE_UNIT (reft), step)) 4087 { 4088 DR_OFFSET (newdr) = ssize_int (0); 4089 DR_STEP (newdr) = step; 4090 DR_OFFSET_ALIGNMENT (newdr) = BIGGEST_ALIGNMENT; 4091 DR_STEP_ALIGNMENT (newdr) 4092 = highest_pow2_factor (step); 4093 /* Mark as simd-lane access. */ 4094 newdr->aux = (void *)-1; 4095 free_data_ref (dr); 4096 datarefs->safe_push (newdr); 4097 return opt_result::success (); 4098 } 4099 } 4100 } 4101 } 4102 } 4103 free_data_ref (newdr); 4104 } 4105 4106 datarefs->safe_push (dr); 4107 return opt_result::success (); 4108 } 4109 4110 /* Function vect_analyze_data_refs. 4111 4112 Find all the data references in the loop or basic block. 4113 4114 The general structure of the analysis of data refs in the vectorizer is as 4115 follows: 4116 1- vect_analyze_data_refs(loop/bb): call 4117 compute_data_dependences_for_loop/bb to find and analyze all data-refs 4118 in the loop/bb and their dependences. 4119 2- vect_analyze_dependences(): apply dependence testing using ddrs. 4120 3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok. 4121 4- vect_analyze_drs_access(): check that ref_stmt.step is ok. 4122 4123 */ 4124 4125 opt_result 4126 vect_analyze_data_refs (vec_info *vinfo, poly_uint64 *min_vf) 4127 { 4128 struct loop *loop = NULL; 4129 unsigned int i; 4130 struct data_reference *dr; 4131 tree scalar_type; 4132 4133 DUMP_VECT_SCOPE ("vect_analyze_data_refs"); 4134 4135 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo)) 4136 loop = LOOP_VINFO_LOOP (loop_vinfo); 4137 4138 /* Go through the data-refs, check that the analysis succeeded. Update 4139 pointer from stmt_vec_info struct to DR and vectype. */ 4140 4141 vec<data_reference_p> datarefs = vinfo->shared->datarefs; 4142 FOR_EACH_VEC_ELT (datarefs, i, dr) 4143 { 4144 enum { SG_NONE, GATHER, SCATTER } gatherscatter = SG_NONE; 4145 poly_uint64 vf; 4146 4147 gcc_assert (DR_REF (dr)); 4148 stmt_vec_info stmt_info = vinfo->lookup_stmt (DR_STMT (dr)); 4149 gcc_assert (!stmt_info->dr_aux.dr); 4150 stmt_info->dr_aux.dr = dr; 4151 stmt_info->dr_aux.stmt = stmt_info; 4152 4153 /* Check that analysis of the data-ref succeeded. 
*/
4154 if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
4155 || !DR_STEP (dr))
4156 {
4157 bool maybe_gather
4158 = DR_IS_READ (dr)
4159 && !TREE_THIS_VOLATILE (DR_REF (dr))
4160 && (targetm.vectorize.builtin_gather != NULL
4161 || supports_vec_gather_load_p ());
4162 bool maybe_scatter
4163 = DR_IS_WRITE (dr)
4164 && !TREE_THIS_VOLATILE (DR_REF (dr))
4165 && (targetm.vectorize.builtin_scatter != NULL
4166 || supports_vec_scatter_store_p ());
4167
4168 /* If the target supports vector gather loads or scatter stores,
4169 see if they can be used. */
4170 if (is_a <loop_vec_info> (vinfo)
4171 && !nested_in_vect_loop_p (loop, stmt_info))
4172 {
4173 if (maybe_gather || maybe_scatter)
4174 {
4175 if (maybe_gather)
4176 gatherscatter = GATHER;
4177 else
4178 gatherscatter = SCATTER;
4179 }
4180 }
4181
4182 if (gatherscatter == SG_NONE)
4183 {
4184 if (dump_enabled_p ())
4185 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4186 "not vectorized: data ref analysis "
4187 "failed %G", stmt_info->stmt);
4188 if (is_a <bb_vec_info> (vinfo))
4189 {
4190 /* In BB vectorization the ref can still participate
4191 in dependence analysis, we just can't vectorize it. */
4192 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4193 continue;
4194 }
4195 return opt_result::failure_at (stmt_info->stmt,
4196 "not vectorized:"
4197 " data ref analysis failed: %G",
4198 stmt_info->stmt);
4199 }
4200 }
4201
4202 /* See if this was detected as a SIMD lane access. */
4203 if (dr->aux == (void *)-1)
4204 {
4205 if (nested_in_vect_loop_p (loop, stmt_info))
4206 return opt_result::failure_at (stmt_info->stmt,
4207 "not vectorized:"
4208 " data ref analysis failed: %G",
4209 stmt_info->stmt);
4210 STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) = true;
4211 }
4212
4213 tree base = get_base_address (DR_REF (dr));
4214 if (base && VAR_P (base) && DECL_NONALIASED (base))
4215 {
4216 if (dump_enabled_p ())
4217 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4218 "not vectorized: base object not addressable "
4219 "for stmt: %G", stmt_info->stmt);
4220 if (is_a <bb_vec_info> (vinfo))
4221 {
4222 /* In BB vectorization the ref can still participate
4223 in dependence analysis, we just can't vectorize it. */
4224 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4225 continue;
4226 }
4227 return opt_result::failure_at (stmt_info->stmt,
4228 "not vectorized: base object not"
4229 " addressable for stmt: %G",
4230 stmt_info->stmt);
4231 }
4232
4233 if (is_a <loop_vec_info> (vinfo)
4234 && DR_STEP (dr)
4235 && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
4236 {
4237 if (nested_in_vect_loop_p (loop, stmt_info))
4238 return opt_result::failure_at (stmt_info->stmt,
4239 "not vectorized:"
4240 " not suitable for strided load %G",
4241 stmt_info->stmt);
4242 STMT_VINFO_STRIDED_P (stmt_info) = true;
4243 }
4244
4245 /* Update DR field in stmt_vec_info struct. */
4246
4247 /* If the dataref is in an inner-loop of the loop that is considered
4248 for vectorization, we also want to analyze the access relative to
4249 the outer-loop (DR contains information only relative to the
4250 inner-most enclosing loop). We do that by building a reference to the
4251 first location accessed by the inner-loop, and analyzing it relative to
4252 the outer-loop. */
4253 if (loop && nested_in_vect_loop_p (loop, stmt_info))
4254 {
4255 /* Build a reference to the first location accessed by the
4256 inner loop: *(BASE + INIT + OFFSET).
By construction, 4257 this address must be invariant in the inner loop, so we 4258 can consider it as being used in the outer loop. */ 4259 tree base = unshare_expr (DR_BASE_ADDRESS (dr)); 4260 tree offset = unshare_expr (DR_OFFSET (dr)); 4261 tree init = unshare_expr (DR_INIT (dr)); 4262 tree init_offset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset), 4263 init, offset); 4264 tree init_addr = fold_build_pointer_plus (base, init_offset); 4265 tree init_ref = build_fold_indirect_ref (init_addr); 4266 4267 if (dump_enabled_p ()) 4268 dump_printf_loc (MSG_NOTE, vect_location, 4269 "analyze in outer loop: %T\n", init_ref); 4270 4271 opt_result res 4272 = dr_analyze_innermost (&STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info), 4273 init_ref, loop, stmt_info->stmt); 4274 if (!res) 4275 /* dr_analyze_innermost already explained the failure. */ 4276 return res; 4277 4278 if (dump_enabled_p ()) 4279 dump_printf_loc (MSG_NOTE, vect_location, 4280 "\touter base_address: %T\n" 4281 "\touter offset from base address: %T\n" 4282 "\touter constant offset from base address: %T\n" 4283 "\touter step: %T\n" 4284 "\touter base alignment: %d\n\n" 4285 "\touter base misalignment: %d\n" 4286 "\touter offset alignment: %d\n" 4287 "\touter step alignment: %d\n", 4288 STMT_VINFO_DR_BASE_ADDRESS (stmt_info), 4289 STMT_VINFO_DR_OFFSET (stmt_info), 4290 STMT_VINFO_DR_INIT (stmt_info), 4291 STMT_VINFO_DR_STEP (stmt_info), 4292 STMT_VINFO_DR_BASE_ALIGNMENT (stmt_info), 4293 STMT_VINFO_DR_BASE_MISALIGNMENT (stmt_info), 4294 STMT_VINFO_DR_OFFSET_ALIGNMENT (stmt_info), 4295 STMT_VINFO_DR_STEP_ALIGNMENT (stmt_info)); 4296 } 4297 4298 /* Set vectype for STMT. */ 4299 scalar_type = TREE_TYPE (DR_REF (dr)); 4300 STMT_VINFO_VECTYPE (stmt_info) 4301 = get_vectype_for_scalar_type (scalar_type); 4302 if (!STMT_VINFO_VECTYPE (stmt_info)) 4303 { 4304 if (dump_enabled_p ()) 4305 { 4306 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 4307 "not vectorized: no vectype for stmt: %G", 4308 stmt_info->stmt); 4309 dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: "); 4310 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS, 4311 scalar_type); 4312 dump_printf (MSG_MISSED_OPTIMIZATION, "\n"); 4313 } 4314 4315 if (is_a <bb_vec_info> (vinfo)) 4316 { 4317 /* No vector type is fine, the ref can still participate 4318 in dependence analysis, we just can't vectorize it. */ 4319 STMT_VINFO_VECTORIZABLE (stmt_info) = false; 4320 continue; 4321 } 4322 return opt_result::failure_at (stmt_info->stmt, 4323 "not vectorized:" 4324 " no vectype for stmt: %G" 4325 " scalar_type: %T\n", 4326 stmt_info->stmt, scalar_type); 4327 } 4328 else 4329 { 4330 if (dump_enabled_p ()) 4331 dump_printf_loc (MSG_NOTE, vect_location, 4332 "got vectype for stmt: %G%T\n", 4333 stmt_info->stmt, STMT_VINFO_VECTYPE (stmt_info)); 4334 } 4335 4336 /* Adjust the minimal vectorization factor according to the 4337 vector type. */ 4338 vf = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)); 4339 *min_vf = upper_bound (*min_vf, vf); 4340 4341 if (gatherscatter != SG_NONE) 4342 { 4343 gather_scatter_info gs_info; 4344 if (!vect_check_gather_scatter (stmt_info, 4345 as_a <loop_vec_info> (vinfo), 4346 &gs_info) 4347 || !get_vectype_for_scalar_type (TREE_TYPE (gs_info.offset))) 4348 return opt_result::failure_at 4349 (stmt_info->stmt, 4350 (gatherscatter == GATHER) ? 
4351 "not vectorized: not suitable for gather load %G" : 4352 "not vectorized: not suitable for scatter store %G", 4353 stmt_info->stmt); 4354 STMT_VINFO_GATHER_SCATTER_P (stmt_info) = gatherscatter; 4355 } 4356 } 4357 4358 /* We used to stop processing and prune the list here. Verify we no 4359 longer need to. */ 4360 gcc_assert (i == datarefs.length ()); 4361 4362 return opt_result::success (); 4363 } 4364 4365 4366 /* Function vect_get_new_vect_var. 4367 4368 Returns a name for a new variable. The current naming scheme appends the 4369 prefix "vect_" or "vect_p" (depending on the value of VAR_KIND) to 4370 the name of vectorizer generated variables, and appends that to NAME if 4371 provided. */ 4372 4373 tree 4374 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name) 4375 { 4376 const char *prefix; 4377 tree new_vect_var; 4378 4379 switch (var_kind) 4380 { 4381 case vect_simple_var: 4382 prefix = "vect"; 4383 break; 4384 case vect_scalar_var: 4385 prefix = "stmp"; 4386 break; 4387 case vect_mask_var: 4388 prefix = "mask"; 4389 break; 4390 case vect_pointer_var: 4391 prefix = "vectp"; 4392 break; 4393 default: 4394 gcc_unreachable (); 4395 } 4396 4397 if (name) 4398 { 4399 char* tmp = concat (prefix, "_", name, NULL); 4400 new_vect_var = create_tmp_reg (type, tmp); 4401 free (tmp); 4402 } 4403 else 4404 new_vect_var = create_tmp_reg (type, prefix); 4405 4406 return new_vect_var; 4407 } 4408 4409 /* Like vect_get_new_vect_var but return an SSA name. */ 4410 4411 tree 4412 vect_get_new_ssa_name (tree type, enum vect_var_kind var_kind, const char *name) 4413 { 4414 const char *prefix; 4415 tree new_vect_var; 4416 4417 switch (var_kind) 4418 { 4419 case vect_simple_var: 4420 prefix = "vect"; 4421 break; 4422 case vect_scalar_var: 4423 prefix = "stmp"; 4424 break; 4425 case vect_pointer_var: 4426 prefix = "vectp"; 4427 break; 4428 default: 4429 gcc_unreachable (); 4430 } 4431 4432 if (name) 4433 { 4434 char* tmp = concat (prefix, "_", name, NULL); 4435 new_vect_var = make_temp_ssa_name (type, NULL, tmp); 4436 free (tmp); 4437 } 4438 else 4439 new_vect_var = make_temp_ssa_name (type, NULL, prefix); 4440 4441 return new_vect_var; 4442 } 4443 4444 /* Duplicate ptr info and set alignment/misaligment on NAME from DR_INFO. */ 4445 4446 static void 4447 vect_duplicate_ssa_name_ptr_info (tree name, dr_vec_info *dr_info) 4448 { 4449 duplicate_ssa_name_ptr_info (name, DR_PTR_INFO (dr_info->dr)); 4450 int misalign = DR_MISALIGNMENT (dr_info); 4451 if (misalign == DR_MISALIGNMENT_UNKNOWN) 4452 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name)); 4453 else 4454 set_ptr_info_alignment (SSA_NAME_PTR_INFO (name), 4455 known_alignment (DR_TARGET_ALIGNMENT (dr_info)), 4456 misalign); 4457 } 4458 4459 /* Function vect_create_addr_base_for_vector_ref. 4460 4461 Create an expression that computes the address of the first memory location 4462 that will be accessed for a data reference. 4463 4464 Input: 4465 STMT_INFO: The statement containing the data reference. 4466 NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list. 4467 OFFSET: Optional. If supplied, it is be added to the initial address. 4468 LOOP: Specify relative to which loop-nest should the address be computed. 4469 For example, when the dataref is in an inner-loop nested in an 4470 outer-loop that is now being vectorized, LOOP can be either the 4471 outer-loop, or the inner-loop. 
The first memory location accessed 4472 by the following dataref ('in' points to short): 4473 4474 for (i=0; i<N; i++) 4475 for (j=0; j<M; j++) 4476 s += in[i+j] 4477 4478 is as follows: 4479 if LOOP=i_loop: &in (relative to i_loop) 4480 if LOOP=j_loop: &in+i*2B (relative to j_loop) 4481 BYTE_OFFSET: Optional, defaulted to NULL. If supplied, it is added to the 4482 initial address. Unlike OFFSET, which is number of elements to 4483 be added, BYTE_OFFSET is measured in bytes. 4484 4485 Output: 4486 1. Return an SSA_NAME whose value is the address of the memory location of 4487 the first vector of the data reference. 4488 2. If new_stmt_list is not NULL_TREE after return then the caller must insert 4489 these statement(s) which define the returned SSA_NAME. 4490 4491 FORNOW: We are only handling array accesses with step 1. */ 4492 4493 tree 4494 vect_create_addr_base_for_vector_ref (stmt_vec_info stmt_info, 4495 gimple_seq *new_stmt_list, 4496 tree offset, 4497 tree byte_offset) 4498 { 4499 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info); 4500 struct data_reference *dr = dr_info->dr; 4501 const char *base_name; 4502 tree addr_base; 4503 tree dest; 4504 gimple_seq seq = NULL; 4505 tree vect_ptr_type; 4506 tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr))); 4507 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); 4508 innermost_loop_behavior *drb = vect_dr_behavior (dr_info); 4509 4510 tree data_ref_base = unshare_expr (drb->base_address); 4511 tree base_offset = unshare_expr (drb->offset); 4512 tree init = unshare_expr (drb->init); 4513 4514 if (loop_vinfo) 4515 base_name = get_name (data_ref_base); 4516 else 4517 { 4518 base_offset = ssize_int (0); 4519 init = ssize_int (0); 4520 base_name = get_name (DR_REF (dr)); 4521 } 4522 4523 /* Create base_offset */ 4524 base_offset = size_binop (PLUS_EXPR, 4525 fold_convert (sizetype, base_offset), 4526 fold_convert (sizetype, init)); 4527 4528 if (offset) 4529 { 4530 offset = fold_build2 (MULT_EXPR, sizetype, 4531 fold_convert (sizetype, offset), step); 4532 base_offset = fold_build2 (PLUS_EXPR, sizetype, 4533 base_offset, offset); 4534 } 4535 if (byte_offset) 4536 { 4537 byte_offset = fold_convert (sizetype, byte_offset); 4538 base_offset = fold_build2 (PLUS_EXPR, sizetype, 4539 base_offset, byte_offset); 4540 } 4541 4542 /* base + base_offset */ 4543 if (loop_vinfo) 4544 addr_base = fold_build_pointer_plus (data_ref_base, base_offset); 4545 else 4546 { 4547 addr_base = build1 (ADDR_EXPR, 4548 build_pointer_type (TREE_TYPE (DR_REF (dr))), 4549 unshare_expr (DR_REF (dr))); 4550 } 4551 4552 vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info)); 4553 dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name); 4554 addr_base = force_gimple_operand (addr_base, &seq, true, dest); 4555 gimple_seq_add_seq (new_stmt_list, seq); 4556 4557 if (DR_PTR_INFO (dr) 4558 && TREE_CODE (addr_base) == SSA_NAME 4559 && !SSA_NAME_PTR_INFO (addr_base)) 4560 { 4561 vect_duplicate_ssa_name_ptr_info (addr_base, dr_info); 4562 if (offset || byte_offset) 4563 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr_base)); 4564 } 4565 4566 if (dump_enabled_p ()) 4567 dump_printf_loc (MSG_NOTE, vect_location, "created %T\n", addr_base); 4568 4569 return addr_base; 4570 } 4571 4572 4573 /* Function vect_create_data_ref_ptr. 
4574
4575 Create a new pointer-to-AGGR_TYPE variable (ap) that points to the first
4576 location accessed in the loop by STMT_INFO, along with the def-use update
4577 chain to appropriately advance the pointer through the loop iterations.
4578 Also set aliasing information for the pointer. This pointer is used by
4579 the callers to this function to create a memory reference expression for
4580 vector load/store access.
4581
4582 Input:
4583 1. STMT_INFO: a stmt that references memory. Expected to be of the form
4584 GIMPLE_ASSIGN <name, data-ref> or
4585 GIMPLE_ASSIGN <data-ref, name>.
4586 2. AGGR_TYPE: the type of the reference, which should be either a vector
4587 or an array.
4588 3. AT_LOOP: the loop where the vector memref is to be created.
4589 4. OFFSET (optional): an offset to be added to the initial address accessed
4590 by the data-ref in STMT_INFO.
4591 5. BSI: location where the new stmts are to be placed if there is no loop.
4592 6. ONLY_INIT: indicate whether ap is to be updated in the loop, or remain
4593 pointing to the initial address.
4594 7. BYTE_OFFSET (optional, defaults to NULL): a byte offset to be added
4595 to the initial address accessed by the data-ref in STMT_INFO. This is
4596 similar to OFFSET, but OFFSET is counted in elements, while BYTE_OFFSET
4597 is counted in bytes.
4598 8. IV_STEP (optional, defaults to NULL): the amount that should be added
4599 to the IV during each iteration of the loop. NULL says to move
4600 by one copy of AGGR_TYPE up or down, depending on the step of the
4601 data reference.
4602
4603 Output:
4604 1. Declare a new ptr to vector_type, and have it point to the base of the
4605 data reference (the initial address accessed by the data reference).
4606 For example, for a vector of type V8HI, the following code is generated:
4607
4608 v8hi *ap;
4609 ap = (v8hi *)initial_address;
4610
4611 if OFFSET is not supplied:
4612 initial_address = &a[init];
4613 if OFFSET is supplied:
4614 initial_address = &a[init + OFFSET];
4615 if BYTE_OFFSET is supplied:
4616 initial_address = &a[init] + BYTE_OFFSET;
4617
4618 Return the initial_address in INITIAL_ADDRESS.
4619
4620 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also
4621 update the pointer in each iteration of the loop.
4622
4623 Return the increment stmt that updates the pointer in PTR_INCR.
4624
4625 3. Return the pointer.
*/ 4626 4627 tree 4628 vect_create_data_ref_ptr (stmt_vec_info stmt_info, tree aggr_type, 4629 struct loop *at_loop, tree offset, 4630 tree *initial_address, gimple_stmt_iterator *gsi, 4631 gimple **ptr_incr, bool only_init, 4632 tree byte_offset, tree iv_step) 4633 { 4634 const char *base_name; 4635 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); 4636 struct loop *loop = NULL; 4637 bool nested_in_vect_loop = false; 4638 struct loop *containing_loop = NULL; 4639 tree aggr_ptr_type; 4640 tree aggr_ptr; 4641 tree new_temp; 4642 gimple_seq new_stmt_list = NULL; 4643 edge pe = NULL; 4644 basic_block new_bb; 4645 tree aggr_ptr_init; 4646 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info); 4647 struct data_reference *dr = dr_info->dr; 4648 tree aptr; 4649 gimple_stmt_iterator incr_gsi; 4650 bool insert_after; 4651 tree indx_before_incr, indx_after_incr; 4652 gimple *incr; 4653 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info); 4654 4655 gcc_assert (iv_step != NULL_TREE 4656 || TREE_CODE (aggr_type) == ARRAY_TYPE 4657 || TREE_CODE (aggr_type) == VECTOR_TYPE); 4658 4659 if (loop_vinfo) 4660 { 4661 loop = LOOP_VINFO_LOOP (loop_vinfo); 4662 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt_info); 4663 containing_loop = (gimple_bb (stmt_info->stmt))->loop_father; 4664 pe = loop_preheader_edge (loop); 4665 } 4666 else 4667 { 4668 gcc_assert (bb_vinfo); 4669 only_init = true; 4670 *ptr_incr = NULL; 4671 } 4672 4673 /* Create an expression for the first address accessed by this load 4674 in LOOP. */ 4675 base_name = get_name (DR_BASE_ADDRESS (dr)); 4676 4677 if (dump_enabled_p ()) 4678 { 4679 tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr)); 4680 dump_printf_loc (MSG_NOTE, vect_location, 4681 "create %s-pointer variable to type: %T", 4682 get_tree_code_name (TREE_CODE (aggr_type)), 4683 aggr_type); 4684 if (TREE_CODE (dr_base_type) == ARRAY_TYPE) 4685 dump_printf (MSG_NOTE, " vectorizing an array ref: "); 4686 else if (TREE_CODE (dr_base_type) == VECTOR_TYPE) 4687 dump_printf (MSG_NOTE, " vectorizing a vector ref: "); 4688 else if (TREE_CODE (dr_base_type) == RECORD_TYPE) 4689 dump_printf (MSG_NOTE, " vectorizing a record based array ref: "); 4690 else 4691 dump_printf (MSG_NOTE, " vectorizing a pointer ref: "); 4692 dump_printf (MSG_NOTE, "%T\n", DR_BASE_OBJECT (dr)); 4693 } 4694 4695 /* (1) Create the new aggregate-pointer variable. 4696 Vector and array types inherit the alias set of their component 4697 type by default so we need to use a ref-all pointer if the data 4698 reference does not conflict with the created aggregated data 4699 reference because it is not addressable. */ 4700 bool need_ref_all = false; 4701 if (!alias_sets_conflict_p (get_alias_set (aggr_type), 4702 get_alias_set (DR_REF (dr)))) 4703 need_ref_all = true; 4704 /* Likewise for any of the data references in the stmt group. 
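     (The single pointer created here is reused for every member of the
     group, so each member's alias set must be compatible with it as well.)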
*/ 4705 else if (DR_GROUP_SIZE (stmt_info) > 1) 4706 { 4707 stmt_vec_info sinfo = DR_GROUP_FIRST_ELEMENT (stmt_info); 4708 do 4709 { 4710 struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo); 4711 if (!alias_sets_conflict_p (get_alias_set (aggr_type), 4712 get_alias_set (DR_REF (sdr)))) 4713 { 4714 need_ref_all = true; 4715 break; 4716 } 4717 sinfo = DR_GROUP_NEXT_ELEMENT (sinfo); 4718 } 4719 while (sinfo); 4720 } 4721 aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode, 4722 need_ref_all); 4723 aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name); 4724 4725 4726 /* Note: If the dataref is in an inner-loop nested in LOOP, and we are 4727 vectorizing LOOP (i.e., outer-loop vectorization), we need to create two 4728 def-use update cycles for the pointer: one relative to the outer-loop 4729 (LOOP), which is what steps (3) and (4) below do. The other is relative 4730 to the inner-loop (which is the inner-most loop containing the dataref), 4731 and this is done be step (5) below. 4732 4733 When vectorizing inner-most loops, the vectorized loop (LOOP) is also the 4734 inner-most loop, and so steps (3),(4) work the same, and step (5) is 4735 redundant. Steps (3),(4) create the following: 4736 4737 vp0 = &base_addr; 4738 LOOP: vp1 = phi(vp0,vp2) 4739 ... 4740 ... 4741 vp2 = vp1 + step 4742 goto LOOP 4743 4744 If there is an inner-loop nested in loop, then step (5) will also be 4745 applied, and an additional update in the inner-loop will be created: 4746 4747 vp0 = &base_addr; 4748 LOOP: vp1 = phi(vp0,vp2) 4749 ... 4750 inner: vp3 = phi(vp1,vp4) 4751 vp4 = vp3 + inner_step 4752 if () goto inner 4753 ... 4754 vp2 = vp1 + step 4755 if () goto LOOP */ 4756 4757 /* (2) Calculate the initial address of the aggregate-pointer, and set 4758 the aggregate-pointer to point to it before the loop. */ 4759 4760 /* Create: (&(base[init_val+offset]+byte_offset) in the loop preheader. */ 4761 4762 new_temp = vect_create_addr_base_for_vector_ref (stmt_info, &new_stmt_list, 4763 offset, byte_offset); 4764 if (new_stmt_list) 4765 { 4766 if (pe) 4767 { 4768 new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list); 4769 gcc_assert (!new_bb); 4770 } 4771 else 4772 gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT); 4773 } 4774 4775 *initial_address = new_temp; 4776 aggr_ptr_init = new_temp; 4777 4778 /* (3) Handle the updating of the aggregate-pointer inside the loop. 4779 This is needed when ONLY_INIT is false, and also when AT_LOOP is the 4780 inner-loop nested in LOOP (during outer-loop vectorization). */ 4781 4782 /* No update in loop is required. */ 4783 if (only_init && (!loop_vinfo || at_loop == loop)) 4784 aptr = aggr_ptr_init; 4785 else 4786 { 4787 /* Accesses to invariant addresses should be handled specially 4788 by the caller. */ 4789 tree step = vect_dr_behavior (dr_info)->step; 4790 gcc_assert (!integer_zerop (step)); 4791 4792 if (iv_step == NULL_TREE) 4793 { 4794 /* The step of the aggregate pointer is the type size, 4795 negated for downward accesses. 
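     For a 16-byte vector type, say, this makes IV_STEP 16, or -16 when the
     scalar step is negative.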
*/ 4796 iv_step = TYPE_SIZE_UNIT (aggr_type); 4797 if (tree_int_cst_sgn (step) == -1) 4798 iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step); 4799 } 4800 4801 standard_iv_increment_position (loop, &incr_gsi, &insert_after); 4802 4803 create_iv (aggr_ptr_init, 4804 fold_convert (aggr_ptr_type, iv_step), 4805 aggr_ptr, loop, &incr_gsi, insert_after, 4806 &indx_before_incr, &indx_after_incr); 4807 incr = gsi_stmt (incr_gsi); 4808 loop_vinfo->add_stmt (incr); 4809 4810 /* Copy the points-to information if it exists. */ 4811 if (DR_PTR_INFO (dr)) 4812 { 4813 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr_info); 4814 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr_info); 4815 } 4816 if (ptr_incr) 4817 *ptr_incr = incr; 4818 4819 aptr = indx_before_incr; 4820 } 4821 4822 if (!nested_in_vect_loop || only_init) 4823 return aptr; 4824 4825 4826 /* (4) Handle the updating of the aggregate-pointer inside the inner-loop 4827 nested in LOOP, if exists. */ 4828 4829 gcc_assert (nested_in_vect_loop); 4830 if (!only_init) 4831 { 4832 standard_iv_increment_position (containing_loop, &incr_gsi, 4833 &insert_after); 4834 create_iv (aptr, fold_convert (aggr_ptr_type, DR_STEP (dr)), aggr_ptr, 4835 containing_loop, &incr_gsi, insert_after, &indx_before_incr, 4836 &indx_after_incr); 4837 incr = gsi_stmt (incr_gsi); 4838 loop_vinfo->add_stmt (incr); 4839 4840 /* Copy the points-to information if it exists. */ 4841 if (DR_PTR_INFO (dr)) 4842 { 4843 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr_info); 4844 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr_info); 4845 } 4846 if (ptr_incr) 4847 *ptr_incr = incr; 4848 4849 return indx_before_incr; 4850 } 4851 else 4852 gcc_unreachable (); 4853 } 4854 4855 4856 /* Function bump_vector_ptr 4857 4858 Increment a pointer (to a vector type) by vector-size. If requested, 4859 i.e. if PTR-INCR is given, then also connect the new increment stmt 4860 to the existing def-use update-chain of the pointer, by modifying 4861 the PTR_INCR as illustrated below: 4862 4863 The pointer def-use update-chain before this function: 4864 DATAREF_PTR = phi (p_0, p_2) 4865 .... 4866 PTR_INCR: p_2 = DATAREF_PTR + step 4867 4868 The pointer def-use update-chain after this function: 4869 DATAREF_PTR = phi (p_0, p_2) 4870 .... 4871 NEW_DATAREF_PTR = DATAREF_PTR + BUMP 4872 .... 4873 PTR_INCR: p_2 = NEW_DATAREF_PTR + step 4874 4875 Input: 4876 DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated 4877 in the loop. 4878 PTR_INCR - optional. The stmt that updates the pointer in each iteration of 4879 the loop. The increment amount across iterations is expected 4880 to be vector_size. 4881 BSI - location where the new update stmt is to be placed. 4882 STMT_INFO - the original scalar memory-access stmt that is being vectorized. 4883 BUMP - optional. The offset by which to bump the pointer. If not given, 4884 the offset is assumed to be vector_size. 4885 4886 Output: Return NEW_DATAREF_PTR as illustrated above. 
4887 4888 */ 4889 4890 tree 4891 bump_vector_ptr (tree dataref_ptr, gimple *ptr_incr, gimple_stmt_iterator *gsi, 4892 stmt_vec_info stmt_info, tree bump) 4893 { 4894 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info); 4895 tree vectype = STMT_VINFO_VECTYPE (stmt_info); 4896 tree update = TYPE_SIZE_UNIT (vectype); 4897 gassign *incr_stmt; 4898 ssa_op_iter iter; 4899 use_operand_p use_p; 4900 tree new_dataref_ptr; 4901 4902 if (bump) 4903 update = bump; 4904 4905 if (TREE_CODE (dataref_ptr) == SSA_NAME) 4906 new_dataref_ptr = copy_ssa_name (dataref_ptr); 4907 else 4908 new_dataref_ptr = make_ssa_name (TREE_TYPE (dataref_ptr)); 4909 incr_stmt = gimple_build_assign (new_dataref_ptr, POINTER_PLUS_EXPR, 4910 dataref_ptr, update); 4911 vect_finish_stmt_generation (stmt_info, incr_stmt, gsi); 4912 4913 /* Copy the points-to information if it exists. */ 4914 if (DR_PTR_INFO (dr)) 4915 { 4916 duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr)); 4917 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr)); 4918 } 4919 4920 if (!ptr_incr) 4921 return new_dataref_ptr; 4922 4923 /* Update the vector-pointer's cross-iteration increment. */ 4924 FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE) 4925 { 4926 tree use = USE_FROM_PTR (use_p); 4927 4928 if (use == dataref_ptr) 4929 SET_USE (use_p, new_dataref_ptr); 4930 else 4931 gcc_assert (operand_equal_p (use, update, 0)); 4932 } 4933 4934 return new_dataref_ptr; 4935 } 4936 4937 4938 /* Copy memory reference info such as base/clique from the SRC reference 4939 to the DEST MEM_REF. */ 4940 4941 void 4942 vect_copy_ref_info (tree dest, tree src) 4943 { 4944 if (TREE_CODE (dest) != MEM_REF) 4945 return; 4946 4947 tree src_base = src; 4948 while (handled_component_p (src_base)) 4949 src_base = TREE_OPERAND (src_base, 0); 4950 if (TREE_CODE (src_base) != MEM_REF 4951 && TREE_CODE (src_base) != TARGET_MEM_REF) 4952 return; 4953 4954 MR_DEPENDENCE_CLIQUE (dest) = MR_DEPENDENCE_CLIQUE (src_base); 4955 MR_DEPENDENCE_BASE (dest) = MR_DEPENDENCE_BASE (src_base); 4956 } 4957 4958 4959 /* Function vect_create_destination_var. 4960 4961 Create a new temporary of type VECTYPE. */ 4962 4963 tree 4964 vect_create_destination_var (tree scalar_dest, tree vectype) 4965 { 4966 tree vec_dest; 4967 const char *name; 4968 char *new_name; 4969 tree type; 4970 enum vect_var_kind kind; 4971 4972 kind = vectype 4973 ? VECTOR_BOOLEAN_TYPE_P (vectype) 4974 ? vect_mask_var 4975 : vect_simple_var 4976 : vect_scalar_var; 4977 type = vectype ? vectype : TREE_TYPE (scalar_dest); 4978 4979 gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME); 4980 4981 name = get_name (scalar_dest); 4982 if (name) 4983 new_name = xasprintf ("%s_%u", name, SSA_NAME_VERSION (scalar_dest)); 4984 else 4985 new_name = xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest)); 4986 vec_dest = vect_get_new_vect_var (type, kind, new_name); 4987 free (new_name); 4988 4989 return vec_dest; 4990 } 4991 4992 /* Function vect_grouped_store_supported. 4993 4994 Returns TRUE if interleave high and interleave low permutations 4995 are supported, and FALSE otherwise. */ 4996 4997 bool 4998 vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count) 4999 { 5000 machine_mode mode = TYPE_MODE (vectype); 5001 5002 /* vect_permute_store_chain requires the group size to be equal to 3 or 5003 be a power of two. 
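     Group sizes of 2, 3, 4, 8, ... are therefore accepted, while e.g. a
     group of 6 is rejected here.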
*/
5004 if (count != 3 && exact_log2 (count) == -1)
5005 {
5006 if (dump_enabled_p ())
5007 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5008 "the size of the group of accesses"
5009 " is not a power of 2 nor equal to 3\n");
5010 return false;
5011 }
5012
5013 /* Check that the permutation is supported. */
5014 if (VECTOR_MODE_P (mode))
5015 {
5016 unsigned int i;
5017 if (count == 3)
5018 {
5019 unsigned int j0 = 0, j1 = 0, j2 = 0;
5020 unsigned int i, j;
5021
5022 unsigned int nelt;
5023 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5024 {
5025 if (dump_enabled_p ())
5026 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5027 "cannot handle groups of 3 stores for"
5028 " variable-length vectors\n");
5029 return false;
5030 }
5031
5032 vec_perm_builder sel (nelt, nelt, 1);
5033 sel.quick_grow (nelt);
5034 vec_perm_indices indices;
5035 for (j = 0; j < 3; j++)
5036 {
5037 int nelt0 = ((3 - j) * nelt) % 3;
5038 int nelt1 = ((3 - j) * nelt + 1) % 3;
5039 int nelt2 = ((3 - j) * nelt + 2) % 3;
5040 for (i = 0; i < nelt; i++)
5041 {
5042 if (3 * i + nelt0 < nelt)
5043 sel[3 * i + nelt0] = j0++;
5044 if (3 * i + nelt1 < nelt)
5045 sel[3 * i + nelt1] = nelt + j1++;
5046 if (3 * i + nelt2 < nelt)
5047 sel[3 * i + nelt2] = 0;
5048 }
5049 indices.new_vector (sel, 2, nelt);
5050 if (!can_vec_perm_const_p (mode, indices))
5051 {
5052 if (dump_enabled_p ())
5053 dump_printf (MSG_MISSED_OPTIMIZATION,
5054 "permutation op not supported by target.\n");
5055 return false;
5056 }
5057
5058 for (i = 0; i < nelt; i++)
5059 {
5060 if (3 * i + nelt0 < nelt)
5061 sel[3 * i + nelt0] = 3 * i + nelt0;
5062 if (3 * i + nelt1 < nelt)
5063 sel[3 * i + nelt1] = 3 * i + nelt1;
5064 if (3 * i + nelt2 < nelt)
5065 sel[3 * i + nelt2] = nelt + j2++;
5066 }
5067 indices.new_vector (sel, 2, nelt);
5068 if (!can_vec_perm_const_p (mode, indices))
5069 {
5070 if (dump_enabled_p ())
5071 dump_printf (MSG_MISSED_OPTIMIZATION,
5072 "permutation op not supported by target.\n");
5073 return false;
5074 }
5075 }
5076 return true;
5077 }
5078 else
5079 {
5080 /* If length is not equal to 3 then only a power of 2 is supported. */
5081 gcc_assert (pow2p_hwi (count));
5082 poly_uint64 nelt = GET_MODE_NUNITS (mode);
5083
5084 /* The encoding has 2 interleaved stepped patterns. */
5085 vec_perm_builder sel (nelt, 2, 3);
5086 sel.quick_grow (6);
5087 for (i = 0; i < 3; i++)
5088 {
5089 sel[i * 2] = i;
5090 sel[i * 2 + 1] = i + nelt;
5091 }
5092 vec_perm_indices indices (sel, 2, nelt);
5093 if (can_vec_perm_const_p (mode, indices))
5094 {
5095 for (i = 0; i < 6; i++)
5096 sel[i] += exact_div (nelt, 2);
5097 indices.new_vector (sel, 2, nelt);
5098 if (can_vec_perm_const_p (mode, indices))
5099 return true;
5100 }
5101 }
5102 }
5103
5104 if (dump_enabled_p ())
5105 dump_printf (MSG_MISSED_OPTIMIZATION,
5106 "permutation op not supported by target.\n");
5107 return false;
5108 }
5109
5110
5111 /* Return TRUE if vec_{mask_}store_lanes is available for COUNT vectors of
5112 type VECTYPE. MASKED_P says whether the masked form is needed. */
5113
5114 bool
5115 vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
5116 bool masked_p)
5117 {
5118 if (masked_p)
5119 return vect_lanes_optab_supported_p ("vec_mask_store_lanes",
5120 vec_mask_store_lanes_optab,
5121 vectype, count);
5122 else
5123 return vect_lanes_optab_supported_p ("vec_store_lanes",
5124 vec_store_lanes_optab,
5125 vectype, count);
5126 }
5127
5128
5129 /* Function vect_permute_store_chain.
5130
5131 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
5132 a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
5133 the data correctly for the stores. Return the final references for stores
5134 in RESULT_CHAIN.
5135
5136 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5137 The input is 4 vectors each containing 8 elements. We assign a number to
5138 each element, the input sequence is:
5139
5140 1st vec: 0 1 2 3 4 5 6 7
5141 2nd vec: 8 9 10 11 12 13 14 15
5142 3rd vec: 16 17 18 19 20 21 22 23
5143 4th vec: 24 25 26 27 28 29 30 31
5144
5145 The output sequence should be:
5146
5147 1st vec: 0 8 16 24 1 9 17 25
5148 2nd vec: 2 10 18 26 3 11 19 27
5149 3rd vec: 4 12 20 28 5 13 21 29
5150 4th vec: 6 14 22 30 7 15 23 31
5151
5152 i.e., we interleave the contents of the four vectors in their order.
5153
5154 We use interleave_high/low instructions to create such output. The input of
5155 each interleave_high/low operation is two vectors:
5156 1st vec 2nd vec
5157 0 1 2 3 4 5 6 7
5158 the even elements of the result vector are obtained left-to-right from the
5159 high/low elements of the first vector. The odd elements of the result are
5160 obtained left-to-right from the high/low elements of the second vector.
5161 The output of interleave_high will be: 0 4 1 5
5162 and of interleave_low: 2 6 3 7
5163
5164
5165 The permutation is done in log LENGTH stages. In each stage interleave_high
5166 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
5167 where the first argument is taken from the first half of DR_CHAIN and the
5168 second argument from its second half.
5169 In our example,
5170
5171 I1: interleave_high (1st vec, 3rd vec)
5172 I2: interleave_low (1st vec, 3rd vec)
5173 I3: interleave_high (2nd vec, 4th vec)
5174 I4: interleave_low (2nd vec, 4th vec)
5175
5176 The output for the first stage is:
5177
5178 I1: 0 16 1 17 2 18 3 19
5179 I2: 4 20 5 21 6 22 7 23
5180 I3: 8 24 9 25 10 26 11 27
5181 I4: 12 28 13 29 14 30 15 31
5182
5183 The output of the second stage, i.e. the final result is:
5184
5185 I1: 0 8 16 24 1 9 17 25
5186 I2: 2 10 18 26 3 11 19 27
5187 I3: 4 12 20 28 5 13 21 29
5188 I4: 6 14 22 30 7 15 23 31. */
5189
5190 void
5191 vect_permute_store_chain (vec<tree> dr_chain,
5192 unsigned int length,
5193 stmt_vec_info stmt_info,
5194 gimple_stmt_iterator *gsi,
5195 vec<tree> *result_chain)
5196 {
5197 tree vect1, vect2, high, low;
5198 gimple *perm_stmt;
5199 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5200 tree perm_mask_low, perm_mask_high;
5201 tree data_ref;
5202 tree perm3_mask_low, perm3_mask_high;
5203 unsigned int i, j, n, log_length = exact_log2 (length);
5204
5205 result_chain->quick_grow (length);
5206 memcpy (result_chain->address (), dr_chain.address (),
5207 length * sizeof (tree));
5208
5209 if (length == 3)
5210 {
5211 /* vect_grouped_store_supported ensures that this is constant.
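     (Groups of 3 were rejected there for variable-length vectors, so the
     to_constant call below is safe.)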
*/ 5212 unsigned int nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant (); 5213 unsigned int j0 = 0, j1 = 0, j2 = 0; 5214 5215 vec_perm_builder sel (nelt, nelt, 1); 5216 sel.quick_grow (nelt); 5217 vec_perm_indices indices; 5218 for (j = 0; j < 3; j++) 5219 { 5220 int nelt0 = ((3 - j) * nelt) % 3; 5221 int nelt1 = ((3 - j) * nelt + 1) % 3; 5222 int nelt2 = ((3 - j) * nelt + 2) % 3; 5223 5224 for (i = 0; i < nelt; i++) 5225 { 5226 if (3 * i + nelt0 < nelt) 5227 sel[3 * i + nelt0] = j0++; 5228 if (3 * i + nelt1 < nelt) 5229 sel[3 * i + nelt1] = nelt + j1++; 5230 if (3 * i + nelt2 < nelt) 5231 sel[3 * i + nelt2] = 0; 5232 } 5233 indices.new_vector (sel, 2, nelt); 5234 perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices); 5235 5236 for (i = 0; i < nelt; i++) 5237 { 5238 if (3 * i + nelt0 < nelt) 5239 sel[3 * i + nelt0] = 3 * i + nelt0; 5240 if (3 * i + nelt1 < nelt) 5241 sel[3 * i + nelt1] = 3 * i + nelt1; 5242 if (3 * i + nelt2 < nelt) 5243 sel[3 * i + nelt2] = nelt + j2++; 5244 } 5245 indices.new_vector (sel, 2, nelt); 5246 perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices); 5247 5248 vect1 = dr_chain[0]; 5249 vect2 = dr_chain[1]; 5250 5251 /* Create interleaving stmt: 5252 low = VEC_PERM_EXPR <vect1, vect2, 5253 {j, nelt, *, j + 1, nelt + j + 1, *, 5254 j + 2, nelt + j + 2, *, ...}> */ 5255 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low"); 5256 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1, 5257 vect2, perm3_mask_low); 5258 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi); 5259 5260 vect1 = data_ref; 5261 vect2 = dr_chain[2]; 5262 /* Create interleaving stmt: 5263 low = VEC_PERM_EXPR <vect1, vect2, 5264 {0, 1, nelt + j, 3, 4, nelt + j + 1, 5265 6, 7, nelt + j + 2, ...}> */ 5266 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high"); 5267 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1, 5268 vect2, perm3_mask_high); 5269 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi); 5270 (*result_chain)[j] = data_ref; 5271 } 5272 } 5273 else 5274 { 5275 /* If length is not equal to 3 then only power of 2 is supported. */ 5276 gcc_assert (pow2p_hwi (length)); 5277 5278 /* The encoding has 2 interleaved stepped patterns. 
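	 For V8HI, for instance (illustration only), the encoding below
	 expands to the interleave-high selector { 0, 8, 1, 9, 2, 10, 3, 11 }
	 and, after the nelt/2 adjustment further down, to the interleave-low
	 selector { 4, 12, 5, 13, 6, 14, 7, 15 }; the 2-pattern, 3-element
	 encoding itself also covers variable-length vectors.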
*/ 5279 poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype); 5280 vec_perm_builder sel (nelt, 2, 3); 5281 sel.quick_grow (6); 5282 for (i = 0; i < 3; i++) 5283 { 5284 sel[i * 2] = i; 5285 sel[i * 2 + 1] = i + nelt; 5286 } 5287 vec_perm_indices indices (sel, 2, nelt); 5288 perm_mask_high = vect_gen_perm_mask_checked (vectype, indices); 5289 5290 for (i = 0; i < 6; i++) 5291 sel[i] += exact_div (nelt, 2); 5292 indices.new_vector (sel, 2, nelt); 5293 perm_mask_low = vect_gen_perm_mask_checked (vectype, indices); 5294 5295 for (i = 0, n = log_length; i < n; i++) 5296 { 5297 for (j = 0; j < length/2; j++) 5298 { 5299 vect1 = dr_chain[j]; 5300 vect2 = dr_chain[j+length/2]; 5301 5302 /* Create interleaving stmt: 5303 high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1, 5304 ...}> */ 5305 high = make_temp_ssa_name (vectype, NULL, "vect_inter_high"); 5306 perm_stmt = gimple_build_assign (high, VEC_PERM_EXPR, vect1, 5307 vect2, perm_mask_high); 5308 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi); 5309 (*result_chain)[2*j] = high; 5310 5311 /* Create interleaving stmt: 5312 low = VEC_PERM_EXPR <vect1, vect2, 5313 {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1, 5314 ...}> */ 5315 low = make_temp_ssa_name (vectype, NULL, "vect_inter_low"); 5316 perm_stmt = gimple_build_assign (low, VEC_PERM_EXPR, vect1, 5317 vect2, perm_mask_low); 5318 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi); 5319 (*result_chain)[2*j+1] = low; 5320 } 5321 memcpy (dr_chain.address (), result_chain->address (), 5322 length * sizeof (tree)); 5323 } 5324 } 5325 } 5326 5327 /* Function vect_setup_realignment 5328 5329 This function is called when vectorizing an unaligned load using 5330 the dr_explicit_realign[_optimized] scheme. 5331 This function generates the following code at the loop prolog: 5332 5333 p = initial_addr; 5334 x msq_init = *(floor(p)); # prolog load 5335 realignment_token = call target_builtin; 5336 loop: 5337 x msq = phi (msq_init, ---) 5338 5339 The stmts marked with x are generated only for the case of 5340 dr_explicit_realign_optimized. 5341 5342 The code above sets up a new (vector) pointer, pointing to the first 5343 location accessed by STMT_INFO, and a "floor-aligned" load using that 5344 pointer. It also generates code to compute the "realignment-token" 5345 (if the relevant target hook was defined), and creates a phi-node at the 5346 loop-header bb whose arguments are the result of the prolog-load (created 5347 by this function) and the result of a load that takes place in the loop 5348 (to be created by the caller to this function). 5349 5350 For the case of dr_explicit_realign_optimized: 5351 The caller to this function uses the phi-result (msq) to create the 5352 realignment code inside the loop, and sets up the missing phi argument, 5353 as follows: 5354 loop: 5355 msq = phi (msq_init, lsq) 5356 lsq = *(floor(p')); # load in loop 5357 result = realign_load (msq, lsq, realignment_token); 5358 5359 For the case of dr_explicit_realign: 5360 loop: 5361 msq = *(floor(p)); # load in loop 5362 p' = p + (VS-1); 5363 lsq = *(floor(p')); # load in loop 5364 result = realign_load (msq, lsq, realignment_token); 5365 5366 Input: 5367 STMT_INFO - (scalar) load stmt to be vectorized. This load accesses 5368 a memory location that may be unaligned. 5369 BSI - place where new code is to be inserted. 5370 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes 5371 is used. 
5372 5373 Output: 5374 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load 5375 target hook, if defined. 5376 Return value - the result of the loop-header phi node. */ 5377 5378 tree 5379 vect_setup_realignment (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, 5380 tree *realignment_token, 5381 enum dr_alignment_support alignment_support_scheme, 5382 tree init_addr, 5383 struct loop **at_loop) 5384 { 5385 tree vectype = STMT_VINFO_VECTYPE (stmt_info); 5386 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); 5387 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info); 5388 struct data_reference *dr = dr_info->dr; 5389 struct loop *loop = NULL; 5390 edge pe = NULL; 5391 tree scalar_dest = gimple_assign_lhs (stmt_info->stmt); 5392 tree vec_dest; 5393 gimple *inc; 5394 tree ptr; 5395 tree data_ref; 5396 basic_block new_bb; 5397 tree msq_init = NULL_TREE; 5398 tree new_temp; 5399 gphi *phi_stmt; 5400 tree msq = NULL_TREE; 5401 gimple_seq stmts = NULL; 5402 bool compute_in_loop = false; 5403 bool nested_in_vect_loop = false; 5404 struct loop *containing_loop = (gimple_bb (stmt_info->stmt))->loop_father; 5405 struct loop *loop_for_initial_load = NULL; 5406 5407 if (loop_vinfo) 5408 { 5409 loop = LOOP_VINFO_LOOP (loop_vinfo); 5410 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt_info); 5411 } 5412 5413 gcc_assert (alignment_support_scheme == dr_explicit_realign 5414 || alignment_support_scheme == dr_explicit_realign_optimized); 5415 5416 /* We need to generate three things: 5417 1. the misalignment computation 5418 2. the extra vector load (for the optimized realignment scheme). 5419 3. the phi node for the two vectors from which the realignment is 5420 done (for the optimized realignment scheme). */ 5421 5422 /* 1. Determine where to generate the misalignment computation. 5423 5424 If INIT_ADDR is NULL_TREE, this indicates that the misalignment 5425 calculation will be generated by this function, outside the loop (in the 5426 preheader). Otherwise, INIT_ADDR had already been computed for us by the 5427 caller, inside the loop. 5428 5429 Background: If the misalignment remains fixed throughout the iterations of 5430 the loop, then both realignment schemes are applicable, and also the 5431 misalignment computation can be done outside LOOP. This is because we are 5432 vectorizing LOOP, and so the memory accesses in LOOP advance in steps that 5433 are a multiple of VS (the Vector Size), and therefore the misalignment in 5434 different vectorized LOOP iterations is always the same. 5435 The problem arises only if the memory access is in an inner-loop nested 5436 inside LOOP, which is now being vectorized using outer-loop vectorization. 5437 This is the only case when the misalignment of the memory access may not 5438 remain fixed throughout the iterations of the inner-loop (as explained in 5439 detail in vect_supportable_dr_alignment). In this case, not only is the 5440 optimized realignment scheme not applicable, but also the misalignment 5441 computation (and generation of the realignment token that is passed to 5442 REALIGN_LOAD) have to be done inside the loop. 5443 5444 In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode 5445 or not, which in turn determines if the misalignment is computed inside 5446 the inner-loop, or outside LOOP. */ 5447 5448 if (init_addr != NULL_TREE || !loop_vinfo) 5449 { 5450 compute_in_loop = true; 5451 gcc_assert (alignment_support_scheme == dr_explicit_realign); 5452 } 5453 5454 5455 /* 2. 
Determine where to generate the extra vector load. 5456 5457 For the optimized realignment scheme, instead of generating two vector 5458 loads in each iteration, we generate a single extra vector load in the 5459 preheader of the loop, and in each iteration reuse the result of the 5460 vector load from the previous iteration. In case the memory access is in 5461 an inner-loop nested inside LOOP, which is now being vectorized using 5462 outer-loop vectorization, we need to determine whether this initial vector 5463 load should be generated at the preheader of the inner-loop, or can be 5464 generated at the preheader of LOOP. If the memory access has no evolution 5465 in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has 5466 to be generated inside LOOP (in the preheader of the inner-loop). */ 5467 5468 if (nested_in_vect_loop) 5469 { 5470 tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info); 5471 bool invariant_in_outerloop = 5472 (tree_int_cst_compare (outerloop_step, size_zero_node) == 0); 5473 loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner); 5474 } 5475 else 5476 loop_for_initial_load = loop; 5477 if (at_loop) 5478 *at_loop = loop_for_initial_load; 5479 5480 if (loop_for_initial_load) 5481 pe = loop_preheader_edge (loop_for_initial_load); 5482 5483 /* 3. For the case of the optimized realignment, create the first vector 5484 load at the loop preheader. */ 5485 5486 if (alignment_support_scheme == dr_explicit_realign_optimized) 5487 { 5488 /* Create msq_init = *(floor(p1)) in the loop preheader */ 5489 gassign *new_stmt; 5490 5491 gcc_assert (!compute_in_loop); 5492 vec_dest = vect_create_destination_var (scalar_dest, vectype); 5493 ptr = vect_create_data_ref_ptr (stmt_info, vectype, 5494 loop_for_initial_load, NULL_TREE, 5495 &init_addr, NULL, &inc, true); 5496 if (TREE_CODE (ptr) == SSA_NAME) 5497 new_temp = copy_ssa_name (ptr); 5498 else 5499 new_temp = make_ssa_name (TREE_TYPE (ptr)); 5500 poly_uint64 align = DR_TARGET_ALIGNMENT (dr_info); 5501 tree type = TREE_TYPE (ptr); 5502 new_stmt = gimple_build_assign 5503 (new_temp, BIT_AND_EXPR, ptr, 5504 fold_build2 (MINUS_EXPR, type, 5505 build_int_cst (type, 0), 5506 build_int_cst (type, align))); 5507 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt); 5508 gcc_assert (!new_bb); 5509 data_ref 5510 = build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp, 5511 build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0)); 5512 vect_copy_ref_info (data_ref, DR_REF (dr)); 5513 new_stmt = gimple_build_assign (vec_dest, data_ref); 5514 new_temp = make_ssa_name (vec_dest, new_stmt); 5515 gimple_assign_set_lhs (new_stmt, new_temp); 5516 if (pe) 5517 { 5518 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt); 5519 gcc_assert (!new_bb); 5520 } 5521 else 5522 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT); 5523 5524 msq_init = gimple_assign_lhs (new_stmt); 5525 } 5526 5527 /* 4. Create realignment token using a target builtin, if available. 5528 It is done either inside the containing loop, or before LOOP (as 5529 determined above). */ 5530 5531 if (targetm.vectorize.builtin_mask_for_load) 5532 { 5533 gcall *new_stmt; 5534 tree builtin_decl; 5535 5536 /* Compute INIT_ADDR - the initial addressed accessed by this memref. */ 5537 if (!init_addr) 5538 { 5539 /* Generate the INIT_ADDR computation outside LOOP. 
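	     The address of the first scalar access is built by
	     vect_create_addr_base_for_vector_ref below; the statements it
	     needs are collected in STMTS and inserted on the preheader edge
	     of LOOP (or directly before GSI when there is no enclosing
	     loop).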
*/ 5540 init_addr = vect_create_addr_base_for_vector_ref (stmt_info, &stmts, 5541 NULL_TREE); 5542 if (loop) 5543 { 5544 pe = loop_preheader_edge (loop); 5545 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts); 5546 gcc_assert (!new_bb); 5547 } 5548 else 5549 gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT); 5550 } 5551 5552 builtin_decl = targetm.vectorize.builtin_mask_for_load (); 5553 new_stmt = gimple_build_call (builtin_decl, 1, init_addr); 5554 vec_dest = 5555 vect_create_destination_var (scalar_dest, 5556 gimple_call_return_type (new_stmt)); 5557 new_temp = make_ssa_name (vec_dest, new_stmt); 5558 gimple_call_set_lhs (new_stmt, new_temp); 5559 5560 if (compute_in_loop) 5561 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT); 5562 else 5563 { 5564 /* Generate the misalignment computation outside LOOP. */ 5565 pe = loop_preheader_edge (loop); 5566 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt); 5567 gcc_assert (!new_bb); 5568 } 5569 5570 *realignment_token = gimple_call_lhs (new_stmt); 5571 5572 /* The result of the CALL_EXPR to this builtin is determined from 5573 the value of the parameter and no global variables are touched 5574 which makes the builtin a "const" function. Requiring the 5575 builtin to have the "const" attribute makes it unnecessary 5576 to call mark_call_clobbered. */ 5577 gcc_assert (TREE_READONLY (builtin_decl)); 5578 } 5579 5580 if (alignment_support_scheme == dr_explicit_realign) 5581 return msq; 5582 5583 gcc_assert (!compute_in_loop); 5584 gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized); 5585 5586 5587 /* 5. Create msq = phi <msq_init, lsq> in loop */ 5588 5589 pe = loop_preheader_edge (containing_loop); 5590 vec_dest = vect_create_destination_var (scalar_dest, vectype); 5591 msq = make_ssa_name (vec_dest); 5592 phi_stmt = create_phi_node (msq, containing_loop->header); 5593 add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION); 5594 5595 return msq; 5596 } 5597 5598 5599 /* Function vect_grouped_load_supported. 5600 5601 COUNT is the size of the load group (the number of statements plus the 5602 number of gaps). SINGLE_ELEMENT_P is true if there is actually 5603 only one statement, with a gap of COUNT - 1. 5604 5605 Returns true if a suitable permute exists. */ 5606 5607 bool 5608 vect_grouped_load_supported (tree vectype, bool single_element_p, 5609 unsigned HOST_WIDE_INT count) 5610 { 5611 machine_mode mode = TYPE_MODE (vectype); 5612 5613 /* If this is single-element interleaving with an element distance 5614 that leaves unused vector loads around punt - we at least create 5615 very sub-optimal code in that case (and blow up memory, 5616 see PR65518). */ 5617 if (single_element_p && maybe_gt (count, TYPE_VECTOR_SUBPARTS (vectype))) 5618 { 5619 if (dump_enabled_p ()) 5620 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 5621 "single-element interleaving not supported " 5622 "for not adjacent vector loads\n"); 5623 return false; 5624 } 5625 5626 /* vect_permute_load_chain requires the group size to be equal to 3 or 5627 be a power of two. */ 5628 if (count != 3 && exact_log2 (count) == -1) 5629 { 5630 if (dump_enabled_p ()) 5631 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 5632 "the size of the group of accesses" 5633 " is not a power of 2 or not equal to 3\n"); 5634 return false; 5635 } 5636 5637 /* Check that the permutation is supported. 
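     For the power-of-2 case this amounts to asking the target for the two
     constant permutations used by vect_permute_load_chain; e.g. for V8HI
     these are the extract-even selector { 0, 2, 4, 6, 8, 10, 12, 14 } and
     the extract-odd selector { 1, 3, 5, 7, 9, 11, 13, 15 }.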
*/ 5638 if (VECTOR_MODE_P (mode)) 5639 { 5640 unsigned int i, j; 5641 if (count == 3) 5642 { 5643 unsigned int nelt; 5644 if (!GET_MODE_NUNITS (mode).is_constant (&nelt)) 5645 { 5646 if (dump_enabled_p ()) 5647 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 5648 "cannot handle groups of 3 loads for" 5649 " variable-length vectors\n"); 5650 return false; 5651 } 5652 5653 vec_perm_builder sel (nelt, nelt, 1); 5654 sel.quick_grow (nelt); 5655 vec_perm_indices indices; 5656 unsigned int k; 5657 for (k = 0; k < 3; k++) 5658 { 5659 for (i = 0; i < nelt; i++) 5660 if (3 * i + k < 2 * nelt) 5661 sel[i] = 3 * i + k; 5662 else 5663 sel[i] = 0; 5664 indices.new_vector (sel, 2, nelt); 5665 if (!can_vec_perm_const_p (mode, indices)) 5666 { 5667 if (dump_enabled_p ()) 5668 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 5669 "shuffle of 3 loads is not supported by" 5670 " target\n"); 5671 return false; 5672 } 5673 for (i = 0, j = 0; i < nelt; i++) 5674 if (3 * i + k < 2 * nelt) 5675 sel[i] = i; 5676 else 5677 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++); 5678 indices.new_vector (sel, 2, nelt); 5679 if (!can_vec_perm_const_p (mode, indices)) 5680 { 5681 if (dump_enabled_p ()) 5682 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 5683 "shuffle of 3 loads is not supported by" 5684 " target\n"); 5685 return false; 5686 } 5687 } 5688 return true; 5689 } 5690 else 5691 { 5692 /* If length is not equal to 3 then only power of 2 is supported. */ 5693 gcc_assert (pow2p_hwi (count)); 5694 poly_uint64 nelt = GET_MODE_NUNITS (mode); 5695 5696 /* The encoding has a single stepped pattern. */ 5697 vec_perm_builder sel (nelt, 1, 3); 5698 sel.quick_grow (3); 5699 for (i = 0; i < 3; i++) 5700 sel[i] = i * 2; 5701 vec_perm_indices indices (sel, 2, nelt); 5702 if (can_vec_perm_const_p (mode, indices)) 5703 { 5704 for (i = 0; i < 3; i++) 5705 sel[i] = i * 2 + 1; 5706 indices.new_vector (sel, 2, nelt); 5707 if (can_vec_perm_const_p (mode, indices)) 5708 return true; 5709 } 5710 } 5711 } 5712 5713 if (dump_enabled_p ()) 5714 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 5715 "extract even/odd not supported by target\n"); 5716 return false; 5717 } 5718 5719 /* Return TRUE if vec_{masked_}load_lanes is available for COUNT vectors of 5720 type VECTYPE. MASKED_P says whether the masked form is needed. */ 5721 5722 bool 5723 vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count, 5724 bool masked_p) 5725 { 5726 if (masked_p) 5727 return vect_lanes_optab_supported_p ("vec_mask_load_lanes", 5728 vec_mask_load_lanes_optab, 5729 vectype, count); 5730 else 5731 return vect_lanes_optab_supported_p ("vec_load_lanes", 5732 vec_load_lanes_optab, 5733 vectype, count); 5734 } 5735 5736 /* Function vect_permute_load_chain. 5737 5738 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be 5739 a power of 2 or equal to 3, generate extract_even/odd stmts to reorder 5740 the input data correctly. Return the final references for loads in 5741 RESULT_CHAIN. 5742 5743 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8. 5744 The input is 4 vectors each containing 8 elements. 
We assign a number to each 5745 element, the input sequence is: 5746 5747 1st vec: 0 1 2 3 4 5 6 7 5748 2nd vec: 8 9 10 11 12 13 14 15 5749 3rd vec: 16 17 18 19 20 21 22 23 5750 4th vec: 24 25 26 27 28 29 30 31 5751 5752 The output sequence should be: 5753 5754 1st vec: 0 4 8 12 16 20 24 28 5755 2nd vec: 1 5 9 13 17 21 25 29 5756 3rd vec: 2 6 10 14 18 22 26 30 5757 4th vec: 3 7 11 15 19 23 27 31 5758 5759 i.e., the first output vector should contain the first elements of each 5760 interleaving group, etc. 5761 5762 We use extract_even/odd instructions to create such output. The input of 5763 each extract_even/odd operation is two vectors 5764 1st vec 2nd vec 5765 0 1 2 3 4 5 6 7 5766 5767 and the output is the vector of extracted even/odd elements. The output of 5768 extract_even will be: 0 2 4 6 5769 and of extract_odd: 1 3 5 7 5770 5771 5772 The permutation is done in log LENGTH stages. In each stage extract_even 5773 and extract_odd stmts are created for each pair of vectors in DR_CHAIN in 5774 their order. In our example, 5775 5776 E1: extract_even (1st vec, 2nd vec) 5777 E2: extract_odd (1st vec, 2nd vec) 5778 E3: extract_even (3rd vec, 4th vec) 5779 E4: extract_odd (3rd vec, 4th vec) 5780 5781 The output for the first stage will be: 5782 5783 E1: 0 2 4 6 8 10 12 14 5784 E2: 1 3 5 7 9 11 13 15 5785 E3: 16 18 20 22 24 26 28 30 5786 E4: 17 19 21 23 25 27 29 31 5787 5788 In order to proceed and create the correct sequence for the next stage (or 5789 for the correct output, if the second stage is the last one, as in our 5790 example), we first put the output of extract_even operation and then the 5791 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN). 5792 The input for the second stage is: 5793 5794 1st vec (E1): 0 2 4 6 8 10 12 14 5795 2nd vec (E3): 16 18 20 22 24 26 28 30 5796 3rd vec (E2): 1 3 5 7 9 11 13 15 5797 4th vec (E4): 17 19 21 23 25 27 29 31 5798 5799 The output of the second stage: 5800 5801 E1: 0 4 8 12 16 20 24 28 5802 E2: 2 6 10 14 18 22 26 30 5803 E3: 1 5 9 13 17 21 25 29 5804 E4: 3 7 11 15 19 23 27 31 5805 5806 And RESULT_CHAIN after reordering: 5807 5808 1st vec (E1): 0 4 8 12 16 20 24 28 5809 2nd vec (E3): 1 5 9 13 17 21 25 29 5810 3rd vec (E2): 2 6 10 14 18 22 26 30 5811 4th vec (E4): 3 7 11 15 19 23 27 31. */ 5812 5813 static void 5814 vect_permute_load_chain (vec<tree> dr_chain, 5815 unsigned int length, 5816 stmt_vec_info stmt_info, 5817 gimple_stmt_iterator *gsi, 5818 vec<tree> *result_chain) 5819 { 5820 tree data_ref, first_vect, second_vect; 5821 tree perm_mask_even, perm_mask_odd; 5822 tree perm3_mask_low, perm3_mask_high; 5823 gimple *perm_stmt; 5824 tree vectype = STMT_VINFO_VECTYPE (stmt_info); 5825 unsigned int i, j, log_length = exact_log2 (length); 5826 5827 result_chain->quick_grow (length); 5828 memcpy (result_chain->address (), dr_chain.address (), 5829 length * sizeof (tree)); 5830 5831 if (length == 3) 5832 { 5833 /* vect_grouped_load_supported ensures that this is constant. 
*/ 5834 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant (); 5835 unsigned int k; 5836 5837 vec_perm_builder sel (nelt, nelt, 1); 5838 sel.quick_grow (nelt); 5839 vec_perm_indices indices; 5840 for (k = 0; k < 3; k++) 5841 { 5842 for (i = 0; i < nelt; i++) 5843 if (3 * i + k < 2 * nelt) 5844 sel[i] = 3 * i + k; 5845 else 5846 sel[i] = 0; 5847 indices.new_vector (sel, 2, nelt); 5848 perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices); 5849 5850 for (i = 0, j = 0; i < nelt; i++) 5851 if (3 * i + k < 2 * nelt) 5852 sel[i] = i; 5853 else 5854 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++); 5855 indices.new_vector (sel, 2, nelt); 5856 perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices); 5857 5858 first_vect = dr_chain[0]; 5859 second_vect = dr_chain[1]; 5860 5861 /* Create interleaving stmt (low part of): 5862 low = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k, 5863 ...}> */ 5864 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low"); 5865 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect, 5866 second_vect, perm3_mask_low); 5867 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi); 5868 5869 /* Create interleaving stmt (high part of): 5870 high = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k, 5871 ...}> */ 5872 first_vect = data_ref; 5873 second_vect = dr_chain[2]; 5874 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high"); 5875 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect, 5876 second_vect, perm3_mask_high); 5877 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi); 5878 (*result_chain)[k] = data_ref; 5879 } 5880 } 5881 else 5882 { 5883 /* If length is not equal to 3 then only power of 2 is supported. */ 5884 gcc_assert (pow2p_hwi (length)); 5885 5886 /* The encoding has a single stepped pattern. */ 5887 poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype); 5888 vec_perm_builder sel (nelt, 1, 3); 5889 sel.quick_grow (3); 5890 for (i = 0; i < 3; ++i) 5891 sel[i] = i * 2; 5892 vec_perm_indices indices (sel, 2, nelt); 5893 perm_mask_even = vect_gen_perm_mask_checked (vectype, indices); 5894 5895 for (i = 0; i < 3; ++i) 5896 sel[i] = i * 2 + 1; 5897 indices.new_vector (sel, 2, nelt); 5898 perm_mask_odd = vect_gen_perm_mask_checked (vectype, indices); 5899 5900 for (i = 0; i < log_length; i++) 5901 { 5902 for (j = 0; j < length; j += 2) 5903 { 5904 first_vect = dr_chain[j]; 5905 second_vect = dr_chain[j+1]; 5906 5907 /* data_ref = permute_even (first_data_ref, second_data_ref); */ 5908 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even"); 5909 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, 5910 first_vect, second_vect, 5911 perm_mask_even); 5912 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi); 5913 (*result_chain)[j/2] = data_ref; 5914 5915 /* data_ref = permute_odd (first_data_ref, second_data_ref); */ 5916 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd"); 5917 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, 5918 first_vect, second_vect, 5919 perm_mask_odd); 5920 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi); 5921 (*result_chain)[j/2+length/2] = data_ref; 5922 } 5923 memcpy (dr_chain.address (), result_chain->address (), 5924 length * sizeof (tree)); 5925 } 5926 } 5927 } 5928 5929 /* Function vect_shift_permute_load_chain. 5930 5931 Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate 5932 sequence of stmts to reorder the input data accordingly. 
   Return the final references for loads in RESULT_CHAIN.
   Return true if it succeeds, false otherwise.

   E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
   The input is 3 vectors each containing 8 elements.  We assign a
   number to each element, the input sequence is:

   1st vec:   0  1  2  3  4  5  6  7
   2nd vec:   8  9 10 11 12 13 14 15
   3rd vec:  16 17 18 19 20 21 22 23

   The output sequence should be:

   1st vec:  0 3 6  9 12 15 18 21
   2nd vec:  1 4 7 10 13 16 19 22
   3rd vec:  2 5 8 11 14 17 20 23

   We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.

   First we shuffle all 3 vectors to get correct elements order:

   1st vec:  ( 0  3  6) ( 1  4  7) ( 2  5)
   2nd vec:  ( 8 11 14) ( 9 12 15) (10 13)
   3rd vec:  (16 19 22) (17 20 23) (18 21)

   Next we unite and shift vector 3 times:

   1st step:
     shift right by 6 the concatenation of:
     "1st vec" and "2nd vec"
       ( 0  3  6) ( 1  4  7) |( 2  5) _ ( 8 11 14) ( 9 12 15)| (10 13)
     "2nd vec" and "3rd vec"
       ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
     "3rd vec" and "1st vec"
       (16 19 22) (17 20 23) |(18 21) _ ( 0  3  6) ( 1  4  7)| ( 2  5)
                             | New vectors                   |

     So that now new vectors are:

     1st vec:  ( 2  5) ( 8 11 14) ( 9 12 15)
     2nd vec:  (10 13) (16 19 22) (17 20 23)
     3rd vec:  (18 21) ( 0  3  6) ( 1  4  7)

   2nd step:
     shift right by 5 the concatenation of:
     "1st vec" and "3rd vec"
       ( 2  5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0  3  6)| ( 1  4  7)
     "2nd vec" and "1st vec"
       (10 13) (16 19 22) |(17 20 23) _ ( 2  5) ( 8 11 14)| ( 9 12 15)
     "3rd vec" and "2nd vec"
       (18 21) ( 0  3  6) |( 1  4  7) _ (10 13) (16 19 22)| (17 20 23)
                          | New vectors                   |

     So that now new vectors are:

     1st vec:  ( 9 12 15) (18 21) ( 0  3  6)
     2nd vec:  (17 20 23) ( 2  5) ( 8 11 14)
     3rd vec:  ( 1  4  7) (10 13) (16 19 22)   READY

   3rd step:
     shift right by 5 the concatenation of:
     "1st vec" and "1st vec"
       ( 9 12 15) (18 21) |( 0  3  6) _ ( 9 12 15) (18 21)| ( 0  3  6)
     shift right by 3 the concatenation of:
     "2nd vec" and "2nd vec"
               (17 20 23) |( 2  5) ( 8 11 14) _ (17 20 23)| ( 2  5) ( 8 11 14)
                          | New vectors                   |

     So that now all vectors are READY:
     1st vec:  ( 0  3  6) ( 9 12 15) (18 21)
     2nd vec:  ( 2  5) ( 8 11 14) (17 20 23)
     3rd vec:  ( 1  4  7) (10 13) (16 19 22)

   This algorithm is faster than the one in vect_permute_load_chain if:
   1.  "Shift of a concatenation" is faster than general permutation.
       This is usually so.
   2.  The TARGET machine can't execute vector instructions in parallel.
       This is because each step of the algorithm depends on the previous
       one.  The algorithm in vect_permute_load_chain is much more parallel.

   The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.
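
   As an illustrative sketch only (not part of the vectorizer), the net
   effect of the LENGTH == 3 case above is an ordinary strided
   de-interleave; in scalar terms, with the 3 input vectors of 8 shorts
   laid out back to back:

     short in[24], out[24];
     for (v = 0; v < 3; v++)         // output vector
       for (e = 0; e < 8; e++)       // element within it
         out[v * 8 + e] = in[e * 3 + v];

   so that out[0..7] holds 0 3 6 ... 21, out[8..15] holds 1 4 7 ... 22 and
   out[16..23] holds 2 5 8 ... 23, matching the output sequence listed above.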
6014 */ 6015 6016 static bool 6017 vect_shift_permute_load_chain (vec<tree> dr_chain, 6018 unsigned int length, 6019 stmt_vec_info stmt_info, 6020 gimple_stmt_iterator *gsi, 6021 vec<tree> *result_chain) 6022 { 6023 tree vect[3], vect_shift[3], data_ref, first_vect, second_vect; 6024 tree perm2_mask1, perm2_mask2, perm3_mask; 6025 tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask; 6026 gimple *perm_stmt; 6027 6028 tree vectype = STMT_VINFO_VECTYPE (stmt_info); 6029 unsigned int i; 6030 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); 6031 6032 unsigned HOST_WIDE_INT nelt, vf; 6033 if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nelt) 6034 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&vf)) 6035 /* Not supported for variable-length vectors. */ 6036 return false; 6037 6038 vec_perm_builder sel (nelt, nelt, 1); 6039 sel.quick_grow (nelt); 6040 6041 result_chain->quick_grow (length); 6042 memcpy (result_chain->address (), dr_chain.address (), 6043 length * sizeof (tree)); 6044 6045 if (pow2p_hwi (length) && vf > 4) 6046 { 6047 unsigned int j, log_length = exact_log2 (length); 6048 for (i = 0; i < nelt / 2; ++i) 6049 sel[i] = i * 2; 6050 for (i = 0; i < nelt / 2; ++i) 6051 sel[nelt / 2 + i] = i * 2 + 1; 6052 vec_perm_indices indices (sel, 2, nelt); 6053 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices)) 6054 { 6055 if (dump_enabled_p ()) 6056 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 6057 "shuffle of 2 fields structure is not \ 6058 supported by target\n"); 6059 return false; 6060 } 6061 perm2_mask1 = vect_gen_perm_mask_checked (vectype, indices); 6062 6063 for (i = 0; i < nelt / 2; ++i) 6064 sel[i] = i * 2 + 1; 6065 for (i = 0; i < nelt / 2; ++i) 6066 sel[nelt / 2 + i] = i * 2; 6067 indices.new_vector (sel, 2, nelt); 6068 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices)) 6069 { 6070 if (dump_enabled_p ()) 6071 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 6072 "shuffle of 2 fields structure is not \ 6073 supported by target\n"); 6074 return false; 6075 } 6076 perm2_mask2 = vect_gen_perm_mask_checked (vectype, indices); 6077 6078 /* Generating permutation constant to shift all elements. 6079 For vector length 8 it is {4 5 6 7 8 9 10 11}. */ 6080 for (i = 0; i < nelt; i++) 6081 sel[i] = nelt / 2 + i; 6082 indices.new_vector (sel, 2, nelt); 6083 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices)) 6084 { 6085 if (dump_enabled_p ()) 6086 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 6087 "shift permutation is not supported by target\n"); 6088 return false; 6089 } 6090 shift1_mask = vect_gen_perm_mask_checked (vectype, indices); 6091 6092 /* Generating permutation constant to select vector from 2. 6093 For vector length 8 it is {0 1 2 3 12 13 14 15}. 
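	 That is, the result takes the low half of the first VEC_PERM_EXPR
	 operand and the high half of the second.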
	 */
      for (i = 0; i < nelt / 2; i++)
	sel[i] = i;
      for (i = nelt / 2; i < nelt; i++)
	sel[i] = nelt + i;
      indices.new_vector (sel, 2, nelt);
      if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "select is not supported by target\n");
	  return false;
	}
      select_mask = vect_gen_perm_mask_checked (vectype, indices);

      for (i = 0; i < log_length; i++)
	{
	  for (j = 0; j < length; j += 2)
	    {
	      first_vect = dr_chain[j];
	      second_vect = dr_chain[j + 1];

	      data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
	      perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
					       first_vect, first_vect,
					       perm2_mask1);
	      vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
	      vect[0] = data_ref;

	      data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
	      perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
					       second_vect, second_vect,
					       perm2_mask2);
	      vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
	      vect[1] = data_ref;

	      data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift");
	      perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
					       vect[0], vect[1], shift1_mask);
	      vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
	      (*result_chain)[j/2 + length/2] = data_ref;

	      data_ref = make_temp_ssa_name (vectype, NULL, "vect_select");
	      perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
					       vect[0], vect[1], select_mask);
	      vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
	      (*result_chain)[j/2] = data_ref;
	    }
	  memcpy (dr_chain.address (), result_chain->address (),
		  length * sizeof (tree));
	}
      return true;
    }
  if (length == 3 && vf > 2)
    {
      unsigned int k = 0, l = 0;

      /* Generating permutation constant to get all elements in right order.
	 For vector length 8 it is {0 3 6 1 4 7 2 5}.  */
      for (i = 0; i < nelt; i++)
	{
	  if (3 * k + (l % 3) >= nelt)
	    {
	      k = 0;
	      l += (3 - (nelt % 3));
	    }
	  sel[i] = 3 * k + (l % 3);
	  k++;
	}
      vec_perm_indices indices (sel, 2, nelt);
      if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "shuffle of 3 fields structure is not \
supported by target\n");
	  return false;
	}
      perm3_mask = vect_gen_perm_mask_checked (vectype, indices);

      /* Generating permutation constant to shift all elements.
	 For vector length 8 it is {6 7 8 9 10 11 12 13}.  */
      for (i = 0; i < nelt; i++)
	sel[i] = 2 * (nelt / 3) + (nelt % 3) + i;
      indices.new_vector (sel, 2, nelt);
      if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "shift permutation is not supported by target\n");
	  return false;
	}
      shift1_mask = vect_gen_perm_mask_checked (vectype, indices);

      /* Generating permutation constant to shift all elements.
	 For vector length 8 it is {5 6 7 8 9 10 11 12}.
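	 That is, the double-length concatenation of the two VEC_PERM_EXPR
	 operands shifted right by 2 * (nelt / 3) + 1 elements, the "2nd
	 step" of the scheme described above.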
*/ 6189 for (i = 0; i < nelt; i++) 6190 sel[i] = 2 * (nelt / 3) + 1 + i; 6191 indices.new_vector (sel, 2, nelt); 6192 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices)) 6193 { 6194 if (dump_enabled_p ()) 6195 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 6196 "shift permutation is not supported by target\n"); 6197 return false; 6198 } 6199 shift2_mask = vect_gen_perm_mask_checked (vectype, indices); 6200 6201 /* Generating permutation constant to shift all elements. 6202 For vector length 8 it is {3 4 5 6 7 8 9 10}. */ 6203 for (i = 0; i < nelt; i++) 6204 sel[i] = (nelt / 3) + (nelt % 3) / 2 + i; 6205 indices.new_vector (sel, 2, nelt); 6206 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices)) 6207 { 6208 if (dump_enabled_p ()) 6209 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 6210 "shift permutation is not supported by target\n"); 6211 return false; 6212 } 6213 shift3_mask = vect_gen_perm_mask_checked (vectype, indices); 6214 6215 /* Generating permutation constant to shift all elements. 6216 For vector length 8 it is {5 6 7 8 9 10 11 12}. */ 6217 for (i = 0; i < nelt; i++) 6218 sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i; 6219 indices.new_vector (sel, 2, nelt); 6220 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices)) 6221 { 6222 if (dump_enabled_p ()) 6223 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 6224 "shift permutation is not supported by target\n"); 6225 return false; 6226 } 6227 shift4_mask = vect_gen_perm_mask_checked (vectype, indices); 6228 6229 for (k = 0; k < 3; k++) 6230 { 6231 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3"); 6232 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, 6233 dr_chain[k], dr_chain[k], 6234 perm3_mask); 6235 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi); 6236 vect[k] = data_ref; 6237 } 6238 6239 for (k = 0; k < 3; k++) 6240 { 6241 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1"); 6242 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, 6243 vect[k % 3], vect[(k + 1) % 3], 6244 shift1_mask); 6245 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi); 6246 vect_shift[k] = data_ref; 6247 } 6248 6249 for (k = 0; k < 3; k++) 6250 { 6251 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2"); 6252 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, 6253 vect_shift[(4 - k) % 3], 6254 vect_shift[(3 - k) % 3], 6255 shift2_mask); 6256 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi); 6257 vect[k] = data_ref; 6258 } 6259 6260 (*result_chain)[3 - (nelt % 3)] = vect[2]; 6261 6262 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3"); 6263 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[0], 6264 vect[0], shift3_mask); 6265 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi); 6266 (*result_chain)[nelt % 3] = data_ref; 6267 6268 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4"); 6269 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[1], 6270 vect[1], shift4_mask); 6271 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi); 6272 (*result_chain)[0] = data_ref; 6273 return true; 6274 } 6275 return false; 6276 } 6277 6278 /* Function vect_transform_grouped_load. 6279 6280 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements 6281 to perform their permutation and ascribe the result vectorized statements to 6282 the scalar statements. 
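
   As an illustration only (not taken from these sources), a scalar loop
   such as

     for (i = 0; i < n; i++)
       {
         a[i] = s[2 * i];      // even elements of S
         b[i] = s[2 * i + 1];  // odd elements of S
       }

   forms a load group of SIZE 2: the wide vector loads of S make up DR_CHAIN,
   and the permuted vectors built here are the ones recorded against the two
   scalar load statements by vect_record_grouped_load_vectors.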
6283 */ 6284 6285 void 6286 vect_transform_grouped_load (stmt_vec_info stmt_info, vec<tree> dr_chain, 6287 int size, gimple_stmt_iterator *gsi) 6288 { 6289 machine_mode mode; 6290 vec<tree> result_chain = vNULL; 6291 6292 /* DR_CHAIN contains input data-refs that are a part of the interleaving. 6293 RESULT_CHAIN is the output of vect_permute_load_chain, it contains permuted 6294 vectors, that are ready for vector computation. */ 6295 result_chain.create (size); 6296 6297 /* If reassociation width for vector type is 2 or greater target machine can 6298 execute 2 or more vector instructions in parallel. Otherwise try to 6299 get chain for loads group using vect_shift_permute_load_chain. */ 6300 mode = TYPE_MODE (STMT_VINFO_VECTYPE (stmt_info)); 6301 if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1 6302 || pow2p_hwi (size) 6303 || !vect_shift_permute_load_chain (dr_chain, size, stmt_info, 6304 gsi, &result_chain)) 6305 vect_permute_load_chain (dr_chain, size, stmt_info, gsi, &result_chain); 6306 vect_record_grouped_load_vectors (stmt_info, result_chain); 6307 result_chain.release (); 6308 } 6309 6310 /* RESULT_CHAIN contains the output of a group of grouped loads that were 6311 generated as part of the vectorization of STMT_INFO. Assign the statement 6312 for each vector to the associated scalar statement. */ 6313 6314 void 6315 vect_record_grouped_load_vectors (stmt_vec_info stmt_info, 6316 vec<tree> result_chain) 6317 { 6318 vec_info *vinfo = stmt_info->vinfo; 6319 stmt_vec_info first_stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info); 6320 unsigned int i, gap_count; 6321 tree tmp_data_ref; 6322 6323 /* Put a permuted data-ref in the VECTORIZED_STMT field. 6324 Since we scan the chain starting from it's first node, their order 6325 corresponds the order of data-refs in RESULT_CHAIN. */ 6326 stmt_vec_info next_stmt_info = first_stmt_info; 6327 gap_count = 1; 6328 FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref) 6329 { 6330 if (!next_stmt_info) 6331 break; 6332 6333 /* Skip the gaps. Loads created for the gaps will be removed by dead 6334 code elimination pass later. No need to check for the first stmt in 6335 the group, since it always exists. 6336 DR_GROUP_GAP is the number of steps in elements from the previous 6337 access (if there is no gap DR_GROUP_GAP is 1). We skip loads that 6338 correspond to the gaps. */ 6339 if (next_stmt_info != first_stmt_info 6340 && gap_count < DR_GROUP_GAP (next_stmt_info)) 6341 { 6342 gap_count++; 6343 continue; 6344 } 6345 6346 /* ??? The following needs cleanup after the removal of 6347 DR_GROUP_SAME_DR_STMT. */ 6348 if (next_stmt_info) 6349 { 6350 stmt_vec_info new_stmt_info = vinfo->lookup_def (tmp_data_ref); 6351 /* We assume that if VEC_STMT is not NULL, this is a case of multiple 6352 copies, and we put the new vector statement in the first available 6353 RELATED_STMT. */ 6354 if (!STMT_VINFO_VEC_STMT (next_stmt_info)) 6355 STMT_VINFO_VEC_STMT (next_stmt_info) = new_stmt_info; 6356 else 6357 { 6358 stmt_vec_info prev_stmt_info 6359 = STMT_VINFO_VEC_STMT (next_stmt_info); 6360 stmt_vec_info rel_stmt_info 6361 = STMT_VINFO_RELATED_STMT (prev_stmt_info); 6362 while (rel_stmt_info) 6363 { 6364 prev_stmt_info = rel_stmt_info; 6365 rel_stmt_info = STMT_VINFO_RELATED_STMT (rel_stmt_info); 6366 } 6367 6368 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt_info; 6369 } 6370 6371 next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info); 6372 gap_count = 1; 6373 } 6374 } 6375 } 6376 6377 /* Function vect_force_dr_alignment_p. 

   Returns whether the alignment of a DECL can be forced to be aligned
   on an ALIGNMENT-bit boundary.  */

bool
vect_can_force_dr_alignment_p (const_tree decl, poly_uint64 alignment)
{
  if (!VAR_P (decl))
    return false;

  if (decl_in_symtab_p (decl)
      && !symtab_node::get (decl)->can_increase_alignment_p ())
    return false;

  if (TREE_STATIC (decl))
    return (known_le (alignment,
		      (unsigned HOST_WIDE_INT) MAX_OFILE_ALIGNMENT));
  else
    return (known_le (alignment, (unsigned HOST_WIDE_INT) MAX_STACK_ALIGNMENT));
}


/* Return whether the data reference DR_INFO is supported with respect to its
   alignment.
   If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even
   if it is aligned, i.e., check if it is possible to vectorize it with
   different alignment.  */

enum dr_alignment_support
vect_supportable_dr_alignment (dr_vec_info *dr_info,
			       bool check_aligned_accesses)
{
  data_reference *dr = dr_info->dr;
  stmt_vec_info stmt_info = dr_info->stmt;
  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
  machine_mode mode = TYPE_MODE (vectype);
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
  struct loop *vect_loop = NULL;
  bool nested_in_vect_loop = false;

  if (aligned_access_p (dr_info) && !check_aligned_accesses)
    return dr_aligned;

  /* For now assume all conditional loads/stores support unaligned
     access without any special code.  */
  if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
    if (gimple_call_internal_p (stmt)
	&& (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
	    || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
      return dr_unaligned_supported;

  if (loop_vinfo)
    {
      vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
      nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt_info);
    }

  /* Possibly unaligned access.  */

  /* We can choose between using the implicit realignment scheme (generating
     a misaligned_move stmt) and the explicit realignment scheme (generating
     aligned loads with a REALIGN_LOAD).  There are two variants to the
     explicit realignment scheme: optimized, and unoptimized.
     We can optimize the realignment only if the step between consecutive
     vector loads is equal to the vector size.  Since the vector memory
     accesses advance in steps of VS (Vector Size) in the vectorized loop, it
     is guaranteed that the misalignment amount remains the same throughout the
     execution of the vectorized loop.  Therefore, we can create the
     "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
     at the loop preheader.

     However, in the case of outer-loop vectorization, when vectorizing a
     memory access in the inner-loop nested within the LOOP that is now being
     vectorized, while it is guaranteed that the misalignment of the
     vectorized memory access will remain the same in different outer-loop
     iterations, it is *not* guaranteed that it will remain the same throughout
     the execution of the inner-loop.  This is because the inner-loop advances
     with the original scalar step (and not in steps of VS).  If the inner-loop
     step happens to be a multiple of VS, then the misalignment remains fixed
     and we can use the optimized realignment scheme.
For example: 6458 6459 for (i=0; i<N; i++) 6460 for (j=0; j<M; j++) 6461 s += a[i+j]; 6462 6463 When vectorizing the i-loop in the above example, the step between 6464 consecutive vector loads is 1, and so the misalignment does not remain 6465 fixed across the execution of the inner-loop, and the realignment cannot 6466 be optimized (as illustrated in the following pseudo vectorized loop): 6467 6468 for (i=0; i<N; i+=4) 6469 for (j=0; j<M; j++){ 6470 vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...} 6471 // when j is {0,1,2,3,4,5,6,7,...} respectively. 6472 // (assuming that we start from an aligned address). 6473 } 6474 6475 We therefore have to use the unoptimized realignment scheme: 6476 6477 for (i=0; i<N; i+=4) 6478 for (j=k; j<M; j+=4) 6479 vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming 6480 // that the misalignment of the initial address is 6481 // 0). 6482 6483 The loop can then be vectorized as follows: 6484 6485 for (k=0; k<4; k++){ 6486 rt = get_realignment_token (&vp[k]); 6487 for (i=0; i<N; i+=4){ 6488 v1 = vp[i+k]; 6489 for (j=k; j<M; j+=4){ 6490 v2 = vp[i+j+VS-1]; 6491 va = REALIGN_LOAD <v1,v2,rt>; 6492 vs += va; 6493 v1 = v2; 6494 } 6495 } 6496 } */ 6497 6498 if (DR_IS_READ (dr)) 6499 { 6500 bool is_packed = false; 6501 tree type = (TREE_TYPE (DR_REF (dr))); 6502 6503 if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing 6504 && (!targetm.vectorize.builtin_mask_for_load 6505 || targetm.vectorize.builtin_mask_for_load ())) 6506 { 6507 tree vectype = STMT_VINFO_VECTYPE (stmt_info); 6508 6509 /* If we are doing SLP then the accesses need not have the 6510 same alignment, instead it depends on the SLP group size. */ 6511 if (loop_vinfo 6512 && STMT_SLP_TYPE (stmt_info) 6513 && !multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo) 6514 * (DR_GROUP_SIZE 6515 (DR_GROUP_FIRST_ELEMENT (stmt_info))), 6516 TYPE_VECTOR_SUBPARTS (vectype))) 6517 ; 6518 else if (!loop_vinfo 6519 || (nested_in_vect_loop 6520 && maybe_ne (TREE_INT_CST_LOW (DR_STEP (dr)), 6521 GET_MODE_SIZE (TYPE_MODE (vectype))))) 6522 return dr_explicit_realign; 6523 else 6524 return dr_explicit_realign_optimized; 6525 } 6526 if (!known_alignment_for_access_p (dr_info)) 6527 is_packed = not_size_aligned (DR_REF (dr)); 6528 6529 if (targetm.vectorize.support_vector_misalignment 6530 (mode, type, DR_MISALIGNMENT (dr_info), is_packed)) 6531 /* Can't software pipeline the loads, but can at least do them. */ 6532 return dr_unaligned_supported; 6533 } 6534 else 6535 { 6536 bool is_packed = false; 6537 tree type = (TREE_TYPE (DR_REF (dr))); 6538 6539 if (!known_alignment_for_access_p (dr_info)) 6540 is_packed = not_size_aligned (DR_REF (dr)); 6541 6542 if (targetm.vectorize.support_vector_misalignment 6543 (mode, type, DR_MISALIGNMENT (dr_info), is_packed)) 6544 return dr_unaligned_supported; 6545 } 6546 6547 /* Unsupported. */ 6548 return dr_unaligned_unsupported; 6549 } 6550