/* Data References Analysis and Manipulation Utilities for Vectorization.
   Copyright (C) 2003-2018 Free Software Foundation, Inc.
   Contributed by Dorit Naishlos <dorit@il.ibm.com>
   and Ira Rosen <irar@il.ibm.com>

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3, or (at your option) any later
version.

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */

#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "backend.h"
#include "target.h"
#include "rtl.h"
#include "tree.h"
#include "gimple.h"
#include "predict.h"
#include "memmodel.h"
#include "tm_p.h"
#include "ssa.h"
#include "optabs-tree.h"
#include "cgraph.h"
#include "dumpfile.h"
#include "alias.h"
#include "fold-const.h"
#include "stor-layout.h"
#include "tree-eh.h"
#include "gimplify.h"
#include "gimple-iterator.h"
#include "gimplify-me.h"
#include "tree-ssa-loop-ivopts.h"
#include "tree-ssa-loop-manip.h"
#include "tree-ssa-loop.h"
#include "cfgloop.h"
#include "tree-scalar-evolution.h"
#include "tree-vectorizer.h"
#include "expr.h"
#include "builtins.h"
#include "params.h"
#include "tree-cfg.h"
#include "tree-hash-traits.h"
#include "vec-perm-indices.h"
#include "internal-fn.h"

/* Return true if load- or store-lanes optab OPTAB is implemented for
   COUNT vectors of type VECTYPE.  NAME is the name of OPTAB.  */

static bool
vect_lanes_optab_supported_p (const char *name, convert_optab optab,
                              tree vectype, unsigned HOST_WIDE_INT count)
{
  machine_mode mode, array_mode;
  bool limit_p;

  mode = TYPE_MODE (vectype);
  if (!targetm.array_mode (mode, count).exists (&array_mode))
    {
      poly_uint64 bits = count * GET_MODE_BITSIZE (mode);
      limit_p = !targetm.array_mode_supported_p (mode, count);
      if (!int_mode_for_size (bits, limit_p).exists (&array_mode))
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "no array mode for %s["
                             HOST_WIDE_INT_PRINT_DEC "]\n",
                             GET_MODE_NAME (mode), count);
          return false;
        }
    }

  if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "cannot use %s<%s><%s>\n", name,
                         GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
      return false;
    }

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
                     "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode),
                     GET_MODE_NAME (mode));

  return true;
}


/* Return the smallest scalar part of STMT.
   This is used to determine the vectype of the stmt.  We generally set the
   vectype according to the type of the result (lhs).  For stmts whose
   result-type is different than the type of the arguments (e.g., demotion,
   promotion), vectype will be reset appropriately (later).  Note that we have
   to visit the smallest datatype in this function, because that determines
   the VF.
   If the smallest datatype in the loop is present only as the rhs of a
   promotion operation - we'd miss it.
   Such a case, where a variable of this datatype does not appear in the lhs
   anywhere in the loop, can only occur if it's an invariant: e.g.:
   'int_x = (int) short_inv', which we'd expect to have been optimized away by
   invariant motion.  However, we cannot rely on invariant motion to always
   take invariants out of the loop, and so in the case of promotion we also
   have to check the rhs.
   LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
   types.  */

tree
vect_get_smallest_scalar_type (gimple *stmt, HOST_WIDE_INT *lhs_size_unit,
                               HOST_WIDE_INT *rhs_size_unit)
{
  tree scalar_type = gimple_expr_type (stmt);
  HOST_WIDE_INT lhs, rhs;

  /* During the analysis phase, this function is called on arbitrary
     statements that might not have scalar results.  */
  if (!tree_fits_uhwi_p (TYPE_SIZE_UNIT (scalar_type)))
    return scalar_type;

  lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));

  if (is_gimple_assign (stmt)
      && (gimple_assign_cast_p (stmt)
          || gimple_assign_rhs_code (stmt) == DOT_PROD_EXPR
          || gimple_assign_rhs_code (stmt) == WIDEN_SUM_EXPR
          || gimple_assign_rhs_code (stmt) == WIDEN_MULT_EXPR
          || gimple_assign_rhs_code (stmt) == WIDEN_LSHIFT_EXPR
          || gimple_assign_rhs_code (stmt) == FLOAT_EXPR))
    {
      tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (stmt));

      rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
      if (rhs < lhs)
        scalar_type = rhs_type;
    }
  else if (gcall *call = dyn_cast <gcall *> (stmt))
    {
      unsigned int i = 0;
      if (gimple_call_internal_p (call))
        {
          internal_fn ifn = gimple_call_internal_fn (call);
          if (internal_load_fn_p (ifn) || internal_store_fn_p (ifn))
            /* gimple_expr_type already picked the type of the loaded
               or stored data.  */
            i = ~0U;
          else if (internal_fn_mask_index (ifn) == 0)
            i = 1;
        }
      if (i < gimple_call_num_args (call))
        {
          tree rhs_type = TREE_TYPE (gimple_call_arg (call, i));
          if (tree_fits_uhwi_p (TYPE_SIZE_UNIT (rhs_type)))
            {
              rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
              if (rhs < lhs)
                scalar_type = rhs_type;
            }
        }
    }

  *lhs_size_unit = lhs;
  *rhs_size_unit = rhs;
  return scalar_type;
}


/* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
   tested at run-time.  Return TRUE if DDR was successfully inserted.
   Return false if versioning is not supported.  */

static bool
vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
{
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);

  if ((unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS) == 0)
    return false;

  if (!runtime_alias_check_p (ddr, loop,
                              optimize_loop_nest_for_speed_p (loop)))
    return false;

  LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
  return true;
}

/* Record that loop LOOP_VINFO needs to check that VALUE is nonzero.  */

static void
vect_check_nonzero_value (loop_vec_info loop_vinfo, tree value)
{
  vec<tree> checks = LOOP_VINFO_CHECK_NONZERO (loop_vinfo);
  for (unsigned int i = 0; i < checks.length (); ++i)
    if (checks[i] == value)
      return;

  if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_NOTE, vect_location, "need run-time check that ");
      dump_generic_expr (MSG_NOTE, TDF_SLIM, value);
      dump_printf (MSG_NOTE, " is nonzero\n");
    }
  LOOP_VINFO_CHECK_NONZERO (loop_vinfo).safe_push (value);
}

/* Return true if we know that the order of vectorized STMT_A and
   vectorized STMT_B will be the same as the order of STMT_A and STMT_B.
   At least one of the statements is a write.  */

static bool
vect_preserves_scalar_order_p (gimple *stmt_a, gimple *stmt_b)
{
  stmt_vec_info stmtinfo_a = vinfo_for_stmt (stmt_a);
  stmt_vec_info stmtinfo_b = vinfo_for_stmt (stmt_b);

  /* Single statements are always kept in their original order.  */
  if (!STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
      && !STMT_VINFO_GROUPED_ACCESS (stmtinfo_b))
    return true;

  /* STMT_A and STMT_B belong to overlapping groups.  All loads in a
     SLP group are emitted at the position of the last scalar load and
     all loads in an interleaving group are emitted at the position
     of the first scalar load.
     Stores in a group are emitted at the position of the last scalar store.
     Compute that position and check whether the resulting order matches
     the current one.
     We have not yet decided between SLP and interleaving so we have
     to conservatively assume both.  */
  gimple *il_a;
  gimple *last_a = il_a = GROUP_FIRST_ELEMENT (stmtinfo_a);
  if (last_a)
    {
      for (gimple *s = GROUP_NEXT_ELEMENT (vinfo_for_stmt (last_a)); s;
           s = GROUP_NEXT_ELEMENT (vinfo_for_stmt (s)))
        last_a = get_later_stmt (last_a, s);
      if (!DR_IS_WRITE (STMT_VINFO_DATA_REF (stmtinfo_a)))
        {
          for (gimple *s = GROUP_NEXT_ELEMENT (vinfo_for_stmt (il_a)); s;
               s = GROUP_NEXT_ELEMENT (vinfo_for_stmt (s)))
            if (get_later_stmt (il_a, s) == il_a)
              il_a = s;
        }
      else
        il_a = last_a;
    }
  else
    last_a = il_a = stmt_a;
  gimple *il_b;
  gimple *last_b = il_b = GROUP_FIRST_ELEMENT (stmtinfo_b);
  if (last_b)
    {
      for (gimple *s = GROUP_NEXT_ELEMENT (vinfo_for_stmt (last_b)); s;
           s = GROUP_NEXT_ELEMENT (vinfo_for_stmt (s)))
        last_b = get_later_stmt (last_b, s);
      if (!DR_IS_WRITE (STMT_VINFO_DATA_REF (stmtinfo_b)))
        {
          for (gimple *s = GROUP_NEXT_ELEMENT (vinfo_for_stmt (il_b)); s;
               s = GROUP_NEXT_ELEMENT (vinfo_for_stmt (s)))
            if (get_later_stmt (il_b, s) == il_b)
              il_b = s;
        }
      else
        il_b = last_b;
    }
  else
    last_b = il_b = stmt_b;
  bool a_after_b = (get_later_stmt (stmt_a, stmt_b) == stmt_a);
  return (/* SLP */
          (get_later_stmt (last_a, last_b) == last_a) == a_after_b
          /* Interleaving */
          && (get_later_stmt (il_a, il_b) == il_a) == a_after_b
          /* Mixed */
          && (get_later_stmt (il_a, last_b) == il_a) == a_after_b
          && (get_later_stmt (last_a, il_b) == last_a) == a_after_b);
}

/* A subroutine of vect_analyze_data_ref_dependence.  Handle
   DDR_COULD_BE_INDEPENDENT_P ddr DDR that has a known set of dependence
   distances.  These distances are conservatively correct but they don't
   reflect a guaranteed dependence.

   Return true if this function does all the work necessary to avoid
   an alias or false if the caller should use the dependence distances
   to limit the vectorization factor in the usual way.  LOOP_DEPTH is
   the depth of the loop described by LOOP_VINFO and the other arguments
   are as for vect_analyze_data_ref_dependence.  */

static bool
vect_analyze_possibly_independent_ddr (data_dependence_relation *ddr,
                                       loop_vec_info loop_vinfo,
                                       int loop_depth, unsigned int *max_vf)
{
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  lambda_vector dist_v;
  unsigned int i;
  FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
    {
      int dist = dist_v[loop_depth];
      if (dist != 0 && !(dist > 0 && DDR_REVERSED_P (ddr)))
        {
          /* If the user asserted safelen >= DIST consecutive iterations
             can be executed concurrently, assume independence.

             ??? An alternative would be to add the alias check even
             in this case, and vectorize the fallback loop with the
             maximum VF set to safelen.  However, if the user has
             explicitly given a length, it's less likely that that
             would be a win.  */
          if (loop->safelen >= 2 && abs_hwi (dist) <= loop->safelen)
            {
              if ((unsigned int) loop->safelen < *max_vf)
                *max_vf = loop->safelen;
              LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
              continue;
            }

          /* For dependence distances of 2 or more, we have the option
             of limiting VF or checking for an alias at runtime.
             Prefer to check at runtime if we can, to avoid limiting
             the VF unnecessarily when the bases are in fact independent.

             Note that the alias checks will be removed if the VF ends up
             being small enough.  */
          return vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
        }
    }
  return true;
}


/* Function vect_analyze_data_ref_dependence.

   Return TRUE if there (might) exist a dependence between a memory-reference
   DRA and a memory-reference DRB.  When versioning for alias may check a
   dependence at run-time, return FALSE.  Adjust *MAX_VF according to
   the data dependence.  */

static bool
vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
                                  loop_vec_info loop_vinfo,
                                  unsigned int *max_vf)
{
  unsigned int i;
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  struct data_reference *dra = DDR_A (ddr);
  struct data_reference *drb = DDR_B (ddr);
  stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
  stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
  lambda_vector dist_v;
  unsigned int loop_depth;

  /* In loop analysis all data references should be vectorizable.  */
  if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
      || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
    gcc_unreachable ();

  /* Independent data accesses.  */
  if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
    return false;

  if (dra == drb
      || (DR_IS_READ (dra) && DR_IS_READ (drb)))
    return false;

  /* We do not have to consider dependences between accesses that belong
     to the same group, unless the stride could be smaller than the
     group size.  */
  if (GROUP_FIRST_ELEMENT (stmtinfo_a)
      && GROUP_FIRST_ELEMENT (stmtinfo_a) == GROUP_FIRST_ELEMENT (stmtinfo_b)
      && !STMT_VINFO_STRIDED_P (stmtinfo_a))
    return false;

  /* Even if we have an anti-dependence then, as the vectorized loop covers at
     least two scalar iterations, there is always also a true dependence.
     As the vectorizer does not re-order loads and stores we can ignore
     the anti-dependence if TBAA can disambiguate both DRs similar to the
     case with known negative distance anti-dependences (positive
     distance anti-dependences would violate TBAA constraints).  */
  if (((DR_IS_READ (dra) && DR_IS_WRITE (drb))
       || (DR_IS_WRITE (dra) && DR_IS_READ (drb)))
      && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)),
                                 get_alias_set (DR_REF (drb))))
    return false;

  /* Unknown data dependence.  */
  if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
    {
      /* If user asserted safelen consecutive iterations can be
         executed concurrently, assume independence.  */
      if (loop->safelen >= 2)
        {
          if ((unsigned int) loop->safelen < *max_vf)
            *max_vf = loop->safelen;
          LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
          return false;
        }

      if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
          || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
        {
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                               "versioning for alias not supported for: "
                               "can't determine dependence between ");
              dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                 DR_REF (dra));
              dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
              dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                 DR_REF (drb));
              dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
            }
          return true;
        }

      if (dump_enabled_p ())
        {
          dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                           "versioning for alias required: "
                           "can't determine dependence between ");
          dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                             DR_REF (dra));
          dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
          dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                             DR_REF (drb));
          dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
        }

      /* Add to list of ddrs that need to be tested at run-time.  */
      return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
    }

  /* Known data dependence.  */
  if (DDR_NUM_DIST_VECTS (ddr) == 0)
    {
      /* If user asserted safelen consecutive iterations can be
         executed concurrently, assume independence.  */
      if (loop->safelen >= 2)
        {
          if ((unsigned int) loop->safelen < *max_vf)
            *max_vf = loop->safelen;
          LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
          return false;
        }

      if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
          || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
        {
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                               "versioning for alias not supported for: "
                               "bad dist vector for ");
              dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                 DR_REF (dra));
              dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
              dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                 DR_REF (drb));
              dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
            }
          return true;
        }

      if (dump_enabled_p ())
        {
          dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                           "versioning for alias required: "
                           "bad dist vector for ");
          dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
          dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
          dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
          dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
        }
      /* Add to list of ddrs that need to be tested at run-time.  */
      return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
    }

  loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));

  if (DDR_COULD_BE_INDEPENDENT_P (ddr)
      && vect_analyze_possibly_independent_ddr (ddr, loop_vinfo,
                                                loop_depth, max_vf))
    return false;

  FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
    {
      int dist = dist_v[loop_depth];

      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location,
                         "dependence distance = %d.\n", dist);

      if (dist == 0)
        {
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_NOTE, vect_location,
                               "dependence distance == 0 between ");
              dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
              dump_printf (MSG_NOTE, " and ");
              dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
              dump_printf (MSG_NOTE, "\n");
            }

          /* When we perform grouped accesses and perform implicit CSE
             by detecting equal accesses and doing disambiguation with
             runtime alias tests like for
                .. = a[i];
                .. = a[i+1];
                a[i] = ..;
                a[i+1] = ..;
                *p = ..;
                .. = a[i];
                .. = a[i+1];
             where we will end up loading { a[i], a[i+1] } once, make
             sure that inserting group loads before the first load and
             stores after the last store will do the right thing.
             Similar for groups like
                a[i] = ...;
                ... = a[i];
                a[i+1] = ...;
             where loads from the group interleave with the store.  */
          if (!vect_preserves_scalar_order_p (DR_STMT (dra), DR_STMT (drb)))
            {
              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "READ_WRITE dependence in interleaving.\n");
              return true;
            }

          if (loop->safelen < 2)
            {
              tree indicator = dr_zero_step_indicator (dra);
              if (TREE_CODE (indicator) != INTEGER_CST)
                vect_check_nonzero_value (loop_vinfo, indicator);
              else if (integer_zerop (indicator))
                {
                  if (dump_enabled_p ())
                    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                     "access also has a zero step\n");
                  return true;
                }
            }
          continue;
        }

      if (dist > 0 && DDR_REVERSED_P (ddr))
        {
          /* If DDR_REVERSED_P the order of the data-refs in DDR was
             reversed (to make distance vector positive), and the actual
             distance is negative.  */
          if (dump_enabled_p ())
            dump_printf_loc (MSG_NOTE, vect_location,
                             "dependence distance negative.\n");
          /* When doing outer loop vectorization, we need to check if there is
             a backward dependence at the inner loop level if the dependence
             at the outer loop is reversed.  See PR81740.  */
          if (nested_in_vect_loop_p (loop, DR_STMT (dra))
              || nested_in_vect_loop_p (loop, DR_STMT (drb)))
            {
              unsigned inner_depth = index_in_loop_nest (loop->inner->num,
                                                         DDR_LOOP_NEST (ddr));
              if (dist_v[inner_depth] < 0)
                return true;
            }
          /* Record a negative dependence distance to later limit the
             amount of stmt copying / unrolling we can perform.
             Only need to handle read-after-write dependence.  */
          if (DR_IS_READ (drb)
              && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0
                  || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist))
            STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist;
          continue;
        }

      unsigned int abs_dist = abs (dist);
      if (abs_dist >= 2 && abs_dist < *max_vf)
        {
          /* The dependence distance requires reduction of the maximal
             vectorization factor.  */
          *max_vf = abs_dist;
          if (dump_enabled_p ())
            dump_printf_loc (MSG_NOTE, vect_location,
                             "adjusting maximal vectorization factor to %i\n",
                             *max_vf);
        }

      if (abs_dist >= *max_vf)
        {
          /* Dependence distance does not create dependence, as far as
             vectorization is concerned, in this case.  */
          if (dump_enabled_p ())
            dump_printf_loc (MSG_NOTE, vect_location,
                             "dependence distance >= VF.\n");
          continue;
        }

      if (dump_enabled_p ())
        {
          dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                           "not vectorized, possible dependence "
                           "between data-refs ");
          dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
          dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
          dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
          dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
        }

      return true;
    }

  return false;
}

/* Function vect_analyze_data_ref_dependences.

   Examine all the data references in the loop, and make sure there do not
   exist any data dependences between them.  Set *MAX_VF according to
   the maximum vectorization factor the data dependences allow.  */

bool
vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo,
                                   unsigned int *max_vf)
{
  unsigned int i;
  struct data_dependence_relation *ddr;

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
                     "=== vect_analyze_data_ref_dependences ===\n");

  LOOP_VINFO_DDRS (loop_vinfo)
    .create (LOOP_VINFO_DATAREFS (loop_vinfo).length ()
             * LOOP_VINFO_DATAREFS (loop_vinfo).length ());
  LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true;
  /* We need read-read dependences to compute STMT_VINFO_SAME_ALIGN_REFS.  */
  if (!compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
                                &LOOP_VINFO_DDRS (loop_vinfo),
                                LOOP_VINFO_LOOP_NEST (loop_vinfo), true))
    return false;

  /* For epilogues we either have no aliases or alias versioning
     was applied to original loop.  Therefore we may just get max_vf
     using VF of original loop.  */
  if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
    *max_vf = LOOP_VINFO_ORIG_MAX_VECT_FACTOR (loop_vinfo);
  else
    FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
      if (vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf))
        return false;

  return true;
}


/* Function vect_slp_analyze_data_ref_dependence.

   Return TRUE if there (might) exist a dependence between a memory-reference
   DRA and a memory-reference DRB.  When versioning for alias may check a
   dependence at run-time, return FALSE.  */

static bool
vect_slp_analyze_data_ref_dependence (struct data_dependence_relation *ddr)
{
  struct data_reference *dra = DDR_A (ddr);
  struct data_reference *drb = DDR_B (ddr);

  /* We need to check dependences of statements marked as unvectorizable
     as well, they still can prohibit vectorization.  */

  /* Independent data accesses.  */
  if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
    return false;

  if (dra == drb)
    return false;

  /* Read-read is OK.  */
  if (DR_IS_READ (dra) && DR_IS_READ (drb))
    return false;

  /* If dra and drb are part of the same interleaving chain consider
     them independent.  */
  if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (DR_STMT (dra)))
      && (GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dra)))
          == GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (drb)))))
    return false;

  /* Unknown data dependence.  */
  if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
    {
      if (dump_enabled_p ())
        {
          dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                           "can't determine dependence between ");
          dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
          dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
          dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
          dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
        }
    }
  else if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_NOTE, vect_location,
                       "determined dependence between ");
      dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
      dump_printf (MSG_NOTE, " and ");
      dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
      dump_printf (MSG_NOTE, "\n");
    }

  return true;
}


/* Analyze dependences involved in the transform of SLP NODE.  STORES
   contains the vector of scalar stores of this instance if we are
   disambiguating the loads.  */

static bool
vect_slp_analyze_node_dependences (slp_instance instance, slp_tree node,
                                   vec<gimple *> stores, gimple *last_store)
{
  /* This walks over all stmts involved in the SLP load/store done
     in NODE verifying we can sink them up to the last stmt in the
     group.  */
  gimple *last_access = vect_find_last_scalar_stmt_in_slp (node);
  for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
    {
      gimple *access = SLP_TREE_SCALAR_STMTS (node)[k];
      if (access == last_access)
        continue;
      data_reference *dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (access));
      for (gimple_stmt_iterator gsi = gsi_for_stmt (access);
           gsi_stmt (gsi) != last_access; gsi_next (&gsi))
        {
          gimple *stmt = gsi_stmt (gsi);
          if (! gimple_vuse (stmt)
              || (DR_IS_READ (dr_a) && ! gimple_vdef (stmt)))
            continue;

          /* If we couldn't record a (single) data reference for this
             stmt we have to give up.  */
          /* ??? Here and below if dependence analysis fails we can resort
             to the alias oracle which can handle more kinds of stmts.  */
          data_reference *dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt));
          if (!dr_b)
            return false;

          bool dependent = false;
          /* If we run into a store of this same instance (we've just
             marked those) then delay dependence checking until we run
             into the last store because this is where it will have
             been sunk to (and we verify if we can do that as well).  */
          if (gimple_visited_p (stmt))
            {
              if (stmt != last_store)
                continue;
              unsigned i;
              gimple *store;
              FOR_EACH_VEC_ELT (stores, i, store)
                {
                  data_reference *store_dr
                    = STMT_VINFO_DATA_REF (vinfo_for_stmt (store));
                  ddr_p ddr = initialize_data_dependence_relation
                                (dr_a, store_dr, vNULL);
                  dependent = vect_slp_analyze_data_ref_dependence (ddr);
                  free_dependence_relation (ddr);
                  if (dependent)
                    break;
                }
            }
          else
            {
              ddr_p ddr = initialize_data_dependence_relation (dr_a,
                                                               dr_b, vNULL);
              dependent = vect_slp_analyze_data_ref_dependence (ddr);
              free_dependence_relation (ddr);
            }
          if (dependent)
            return false;
        }
    }
  return true;
}


/* Function vect_slp_analyze_instance_dependence.

   Examine all the data references in the basic-block, and make sure there
   do not exist any data dependences between them.  */

bool
vect_slp_analyze_instance_dependence (slp_instance instance)
{
  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
                     "=== vect_slp_analyze_instance_dependence ===\n");

  /* The stores of this instance are at the root of the SLP tree.  */
  slp_tree store = SLP_INSTANCE_TREE (instance);
  if (! STMT_VINFO_DATA_REF (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (store)[0])))
    store = NULL;

  /* Verify we can sink stores to the vectorized stmt insert location.  */
  gimple *last_store = NULL;
  if (store)
    {
      if (! vect_slp_analyze_node_dependences (instance, store, vNULL, NULL))
        return false;

      /* Mark stores in this instance and remember the last one.  */
      last_store = vect_find_last_scalar_stmt_in_slp (store);
      for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
        gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k], true);
    }

  bool res = true;

  /* Verify we can sink loads to the vectorized stmt insert location,
     special-casing stores of this instance.  */
  slp_tree load;
  unsigned int i;
  FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load)
    if (! vect_slp_analyze_node_dependences (instance, load,
                                             store
                                             ? SLP_TREE_SCALAR_STMTS (store)
                                             : vNULL, last_store))
      {
        res = false;
        break;
      }

  /* Unset the visited flag.  */
  if (store)
    for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
      gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k], false);

  return res;
}

/* Record in VINFO the base alignment guarantee given by DRB.  STMT is
   the statement that contains DRB, which is useful for recording in the
   dump file.  */

static void
vect_record_base_alignment (vec_info *vinfo, gimple *stmt,
                            innermost_loop_behavior *drb)
{
  bool existed;
  innermost_loop_behavior *&entry
    = vinfo->base_alignments.get_or_insert (drb->base_address, &existed);
  if (!existed || entry->base_alignment < drb->base_alignment)
    {
      entry = drb;
      if (dump_enabled_p ())
        {
          dump_printf_loc (MSG_NOTE, vect_location,
                           "recording new base alignment for ");
          dump_generic_expr (MSG_NOTE, TDF_SLIM, drb->base_address);
          dump_printf (MSG_NOTE, "\n");
          dump_printf_loc (MSG_NOTE, vect_location,
                           " alignment: %d\n", drb->base_alignment);
          dump_printf_loc (MSG_NOTE, vect_location,
                           " misalignment: %d\n", drb->base_misalignment);
          dump_printf_loc (MSG_NOTE, vect_location,
                           " based on: ");
          dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
        }
    }
}

/* If the region we're going to vectorize is reached, all unconditional
   data references occur at least once.  We can therefore pool the base
   alignment guarantees from each unconditional reference.  Do this by
   going through all the data references in VINFO and checking whether
   the containing statement makes the reference unconditionally.  If so,
   record the alignment of the base address in VINFO so that it can be
   used for all other references with the same base.  */

void
vect_record_base_alignments (vec_info *vinfo)
{
  loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
  struct loop *loop = loop_vinfo ? LOOP_VINFO_LOOP (loop_vinfo) : NULL;
  data_reference *dr;
  unsigned int i;
  FOR_EACH_VEC_ELT (vinfo->datarefs, i, dr)
    if (!DR_IS_CONDITIONAL_IN_STMT (dr))
      {
        gimple *stmt = DR_STMT (dr);
        vect_record_base_alignment (vinfo, stmt, &DR_INNERMOST (dr));

        /* If DR is nested in the loop that is being vectorized, we can also
           record the alignment of the base wrt the outer loop.  */
        if (loop && nested_in_vect_loop_p (loop, stmt))
          {
            stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
            vect_record_base_alignment
              (vinfo, stmt, &STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info));
          }
      }
}

/* Return the target alignment for the vectorized form of DR.  */

static unsigned int
vect_calculate_target_alignment (struct data_reference *dr)
{
  gimple *stmt = DR_STMT (dr);
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
  return targetm.vectorize.preferred_vector_alignment (vectype);
}

/* Function vect_compute_data_ref_alignment

   Compute the misalignment of the data reference DR.

   Output:
   1. If during the misalignment computation it is found that the data
      reference cannot be vectorized then false is returned.
   2. DR_MISALIGNMENT (DR) is defined.

   FOR NOW: No analysis is actually performed.  Misalignment is calculated
   only for trivial cases.  TODO.  */

bool
vect_compute_data_ref_alignment (struct data_reference *dr)
{
  gimple *stmt = DR_STMT (dr);
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  vec_base_alignments *base_alignments = &stmt_info->vinfo->base_alignments;
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
  struct loop *loop = NULL;
  tree ref = DR_REF (dr);
  tree vectype = STMT_VINFO_VECTYPE (stmt_info);

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
                     "vect_compute_data_ref_alignment:\n");

  if (loop_vinfo)
    loop = LOOP_VINFO_LOOP (loop_vinfo);

  /* Initialize misalignment to unknown.  */
  SET_DR_MISALIGNMENT (dr, DR_MISALIGNMENT_UNKNOWN);

  innermost_loop_behavior *drb = vect_dr_behavior (dr);
  bool step_preserves_misalignment_p;

  unsigned HOST_WIDE_INT vector_alignment
    = vect_calculate_target_alignment (dr) / BITS_PER_UNIT;
  DR_TARGET_ALIGNMENT (dr) = vector_alignment;

  /* No step for BB vectorization.  */
  if (!loop)
    {
      gcc_assert (integer_zerop (drb->step));
      step_preserves_misalignment_p = true;
    }

  /* In case the dataref is in an inner-loop of the loop that is being
     vectorized (LOOP), we use the base and misalignment information
     relative to the outer-loop (LOOP).  This is ok only if the misalignment
     stays the same throughout the execution of the inner-loop, which is why
     we have to check that the stride of the dataref in the inner-loop evenly
     divides by the vector alignment.  */
  else if (nested_in_vect_loop_p (loop, stmt))
    {
      step_preserves_misalignment_p
        = (DR_STEP_ALIGNMENT (dr) % vector_alignment) == 0;

      if (dump_enabled_p ())
        {
          if (step_preserves_misalignment_p)
            dump_printf_loc (MSG_NOTE, vect_location,
                             "inner step divides the vector alignment.\n");
          else
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "inner step doesn't divide the vector"
                             " alignment.\n");
        }
    }

  /* Similarly we can only use base and misalignment information relative to
     an innermost loop if the misalignment stays the same throughout the
     execution of the loop.  As above, this is the case if the stride of
     the dataref evenly divides by the alignment.  */
  else
    {
      poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
      step_preserves_misalignment_p
        = multiple_p (DR_STEP_ALIGNMENT (dr) * vf, vector_alignment);

      if (!step_preserves_misalignment_p && dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "step doesn't divide the vector alignment.\n");
    }

  unsigned int base_alignment = drb->base_alignment;
  unsigned int base_misalignment = drb->base_misalignment;

  /* Calculate the maximum of the pooled base address alignment and the
     alignment that we can compute for DR itself.  */
  innermost_loop_behavior **entry = base_alignments->get (drb->base_address);
  if (entry && base_alignment < (*entry)->base_alignment)
    {
      base_alignment = (*entry)->base_alignment;
      base_misalignment = (*entry)->base_misalignment;
    }

  if (drb->offset_alignment < vector_alignment
      || !step_preserves_misalignment_p
      /* We need to know whether the step wrt the vectorized loop is
         negative when computing the starting misalignment below.  */
      || TREE_CODE (drb->step) != INTEGER_CST)
    {
      if (dump_enabled_p ())
        {
          dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                           "Unknown alignment for access: ");
          dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
          dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
        }
      return true;
    }

  if (base_alignment < vector_alignment)
    {
      unsigned int max_alignment;
      tree base = get_base_for_alignment (drb->base_address, &max_alignment);
      if (max_alignment < vector_alignment
          || !vect_can_force_dr_alignment_p (base,
                                             vector_alignment * BITS_PER_UNIT))
        {
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_NOTE, vect_location,
                               "can't force alignment of ref: ");
              dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
              dump_printf (MSG_NOTE, "\n");
            }
          return true;
        }

      /* Force the alignment of the decl.
         NOTE: This is the only change to the code we make during
         the analysis phase, before deciding to vectorize the loop.  */
      if (dump_enabled_p ())
        {
          dump_printf_loc (MSG_NOTE, vect_location, "force alignment of ");
          dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
          dump_printf (MSG_NOTE, "\n");
        }

      DR_VECT_AUX (dr)->base_decl = base;
      DR_VECT_AUX (dr)->base_misaligned = true;
      base_misalignment = 0;
    }
  poly_int64 misalignment
    = base_misalignment + wi::to_poly_offset (drb->init).force_shwi ();

  /* If this is a backward running DR then first access in the larger
     vectype actually is N-1 elements before the address in the DR.
     Adjust misalign accordingly.  */
  if (tree_int_cst_sgn (drb->step) < 0)
    /* PLUS because STEP is negative.  */
    misalignment += ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
                     * TREE_INT_CST_LOW (drb->step));

  unsigned int const_misalignment;
  if (!known_misalignment (misalignment, vector_alignment,
                           &const_misalignment))
    {
      if (dump_enabled_p ())
        {
          dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                           "Non-constant misalignment for access: ");
          dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
          dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
        }
      return true;
    }

  SET_DR_MISALIGNMENT (dr, const_misalignment);

  if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                       "misalign = %d bytes of ref ", DR_MISALIGNMENT (dr));
      dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
      dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
    }

  return true;
}

/* Function vect_update_misalignment_for_peel.
   Sets DR's misalignment
   - to 0 if it has the same alignment as DR_PEEL,
   - to the misalignment computed using NPEEL if DR's misalignment is known,
   - to -1 (unknown) otherwise.

   DR - the data reference whose misalignment is to be adjusted.
   DR_PEEL - the data reference whose misalignment is being made
             zero in the vector loop by the peel.
   NPEEL - the number of iterations in the peel loop if the misalignment
           of DR_PEEL is known at compile time.  */

static void
vect_update_misalignment_for_peel (struct data_reference *dr,
                                   struct data_reference *dr_peel, int npeel)
{
  unsigned int i;
  vec<dr_p> same_aligned_drs;
  struct data_reference *current_dr;
  int dr_size = vect_get_scalar_dr_size (dr);
  int dr_peel_size = vect_get_scalar_dr_size (dr_peel);
  stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
  stmt_vec_info peel_stmt_info = vinfo_for_stmt (DR_STMT (dr_peel));

  /* For interleaved data accesses the step in the loop must be multiplied by
     the size of the interleaving group.  */
  if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
    dr_size *= GROUP_SIZE (vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info)));
  if (STMT_VINFO_GROUPED_ACCESS (peel_stmt_info))
    dr_peel_size *= GROUP_SIZE (peel_stmt_info);

  /* It can be assumed that the data refs with the same alignment as dr_peel
     are aligned in the vector loop.  */
  same_aligned_drs
    = STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (DR_STMT (dr_peel)));
  FOR_EACH_VEC_ELT (same_aligned_drs, i, current_dr)
    {
      if (current_dr != dr)
        continue;
      gcc_assert (!known_alignment_for_access_p (dr)
                  || !known_alignment_for_access_p (dr_peel)
                  || (DR_MISALIGNMENT (dr) / dr_size
                      == DR_MISALIGNMENT (dr_peel) / dr_peel_size));
      SET_DR_MISALIGNMENT (dr, 0);
      return;
    }

  if (known_alignment_for_access_p (dr)
      && known_alignment_for_access_p (dr_peel))
    {
      bool negative = tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0;
      int misal = DR_MISALIGNMENT (dr);
      misal += negative ? -npeel * dr_size : npeel * dr_size;
      misal &= DR_TARGET_ALIGNMENT (dr) - 1;
      SET_DR_MISALIGNMENT (dr, misal);
      return;
    }

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment "
                     "to unknown (-1).\n");
  SET_DR_MISALIGNMENT (dr, DR_MISALIGNMENT_UNKNOWN);
}


/* Function verify_data_ref_alignment

   Return TRUE if DR can be handled with respect to alignment.  */

static bool
verify_data_ref_alignment (data_reference_p dr)
{
  enum dr_alignment_support supportable_dr_alignment
    = vect_supportable_dr_alignment (dr, false);
  if (!supportable_dr_alignment)
    {
      if (dump_enabled_p ())
        {
          if (DR_IS_READ (dr))
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "not vectorized: unsupported unaligned load.");
          else
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "not vectorized: unsupported unaligned "
                             "store.");

          dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                             DR_REF (dr));
          dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
        }
      return false;
    }

  if (supportable_dr_alignment != dr_aligned && dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
                     "Vectorizing an unaligned access.\n");

  return true;
}

/* Function vect_verify_datarefs_alignment

   Return TRUE if all data references in the loop can be
   handled with respect to alignment.  */

bool
vect_verify_datarefs_alignment (loop_vec_info vinfo)
{
  vec<data_reference_p> datarefs = vinfo->datarefs;
  struct data_reference *dr;
  unsigned int i;

  FOR_EACH_VEC_ELT (datarefs, i, dr)
    {
      gimple *stmt = DR_STMT (dr);
      stmt_vec_info stmt_info = vinfo_for_stmt (stmt);

      if (!STMT_VINFO_RELEVANT_P (stmt_info))
        continue;

      /* For interleaving, only the alignment of the first access matters.  */
      if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
          && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
        continue;

      /* Strided accesses perform only component accesses, alignment is
         irrelevant for them.  */
      if (STMT_VINFO_STRIDED_P (stmt_info)
          && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
        continue;

      if (! verify_data_ref_alignment (dr))
        return false;
    }

  return true;
}

/* Given a memory reference EXP return whether its alignment is less
   than its size.  */

static bool
not_size_aligned (tree exp)
{
  if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp))))
    return true;

  return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp)))
          > get_object_alignment (exp));
}

/* Function vector_alignment_reachable_p

   Return true if vector alignment for DR is reachable by peeling
   a few loop iterations.  Return false otherwise.  */

static bool
vector_alignment_reachable_p (struct data_reference *dr)
{
  gimple *stmt = DR_STMT (dr);
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  tree vectype = STMT_VINFO_VECTYPE (stmt_info);

  if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
    {
      /* For interleaved access we peel only if number of iterations in
         the prolog loop ({VF - misalignment}), is a multiple of the
         number of the interleaved accesses.  */
      int elem_size, mis_in_elements;

      /* FORNOW: handle only known alignment.  */
      if (!known_alignment_for_access_p (dr))
        return false;

      poly_uint64 nelements = TYPE_VECTOR_SUBPARTS (vectype);
      poly_uint64 vector_size = GET_MODE_SIZE (TYPE_MODE (vectype));
      elem_size = vector_element_size (vector_size, nelements);
      mis_in_elements = DR_MISALIGNMENT (dr) / elem_size;

      if (!multiple_p (nelements - mis_in_elements, GROUP_SIZE (stmt_info)))
        return false;
    }

  /* If misalignment is known at the compile time then allow peeling
     only if natural alignment is reachable through peeling.  */
  if (known_alignment_for_access_p (dr) && !aligned_access_p (dr))
    {
      HOST_WIDE_INT elmsize =
        int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
      if (dump_enabled_p ())
        {
          dump_printf_loc (MSG_NOTE, vect_location,
                           "data size =" HOST_WIDE_INT_PRINT_DEC, elmsize);
          dump_printf (MSG_NOTE,
                       ". misalignment = %d.\n", DR_MISALIGNMENT (dr));
        }
      if (DR_MISALIGNMENT (dr) % elmsize)
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "data size does not divide the misalignment.\n");
          return false;
        }
    }

  if (!known_alignment_for_access_p (dr))
    {
      tree type = TREE_TYPE (DR_REF (dr));
      bool is_packed = not_size_aligned (DR_REF (dr));
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "Unknown misalignment, %snaturally aligned\n",
                         is_packed ? "not " : "");
      return targetm.vectorize.vector_alignment_reachable (type, is_packed);
    }

  return true;
}


/* Calculate the cost of the memory access represented by DR.  */

static void
vect_get_data_access_cost (struct data_reference *dr,
                           unsigned int *inside_cost,
                           unsigned int *outside_cost,
                           stmt_vector_for_cost *body_cost_vec)
{
  gimple *stmt = DR_STMT (dr);
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
  int ncopies;

  if (PURE_SLP_STMT (stmt_info))
    ncopies = 1;
  else
    ncopies = vect_get_num_copies (loop_vinfo, STMT_VINFO_VECTYPE (stmt_info));

  if (DR_IS_READ (dr))
    vect_get_load_cost (dr, ncopies, true, inside_cost, outside_cost,
                        NULL, body_cost_vec, false);
  else
    vect_get_store_cost (dr, ncopies, inside_cost, body_cost_vec);

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
                     "vect_get_data_access_cost: inside_cost = %d, "
                     "outside_cost = %d.\n", *inside_cost, *outside_cost);
}


typedef struct _vect_peel_info
{
  struct data_reference *dr;
  int npeel;
  unsigned int count;
} *vect_peel_info;

typedef struct _vect_peel_extended_info
{
  struct _vect_peel_info peel_info;
  unsigned int inside_cost;
  unsigned int outside_cost;
} *vect_peel_extended_info;


/* Peeling hashtable helpers.  */

struct peel_info_hasher : free_ptr_hash <_vect_peel_info>
{
  static inline hashval_t hash (const _vect_peel_info *);
  static inline bool equal (const _vect_peel_info *, const _vect_peel_info *);
};

inline hashval_t
peel_info_hasher::hash (const _vect_peel_info *peel_info)
{
  return (hashval_t) peel_info->npeel;
}

inline bool
peel_info_hasher::equal (const _vect_peel_info *a, const _vect_peel_info *b)
{
  return (a->npeel == b->npeel);
}


/* Insert DR into peeling hash table with NPEEL as key.  */

static void
vect_peeling_hash_insert (hash_table<peel_info_hasher> *peeling_htab,
                          loop_vec_info loop_vinfo, struct data_reference *dr,
                          int npeel)
{
  struct _vect_peel_info elem, *slot;
  _vect_peel_info **new_slot;
  bool supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);

  elem.npeel = npeel;
  slot = peeling_htab->find (&elem);
  if (slot)
    slot->count++;
  else
    {
      slot = XNEW (struct _vect_peel_info);
      slot->npeel = npeel;
      slot->dr = dr;
      slot->count = 1;
      new_slot = peeling_htab->find_slot (slot, INSERT);
      *new_slot = slot;
    }

  if (!supportable_dr_alignment
      && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
    slot->count += VECT_MAX_COST;
}


/* Traverse peeling hash table to find peeling option that aligns maximum
   number of data accesses.  */

int
vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
                                     _vect_peel_extended_info *max)
{
  vect_peel_info elem = *slot;

  if (elem->count > max->peel_info.count
      || (elem->count == max->peel_info.count
          && max->peel_info.npeel > elem->npeel))
    {
      max->peel_info.npeel = elem->npeel;
      max->peel_info.count = elem->count;
      max->peel_info.dr = elem->dr;
    }

  return 1;
}

/* Get the costs of peeling NPEEL iterations checking data access costs
   for all data refs.  If UNKNOWN_MISALIGNMENT is true, we assume DR0's
   misalignment will be zero after peeling.  */

static void
vect_get_peeling_costs_all_drs (vec<data_reference_p> datarefs,
                                struct data_reference *dr0,
                                unsigned int *inside_cost,
                                unsigned int *outside_cost,
                                stmt_vector_for_cost *body_cost_vec,
                                unsigned int npeel,
                                bool unknown_misalignment)
{
  unsigned i;
  data_reference *dr;

  FOR_EACH_VEC_ELT (datarefs, i, dr)
    {
      gimple *stmt = DR_STMT (dr);
      stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
      if (!STMT_VINFO_RELEVANT_P (stmt_info))
        continue;

      /* For interleaving, only the alignment of the first access
         matters.  */
      if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
          && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
        continue;

      /* Strided accesses perform only component accesses, alignment is
         irrelevant for them.  */
      if (STMT_VINFO_STRIDED_P (stmt_info)
          && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
        continue;

      int save_misalignment;
      save_misalignment = DR_MISALIGNMENT (dr);
      if (npeel == 0)
        ;
      else if (unknown_misalignment && dr == dr0)
        SET_DR_MISALIGNMENT (dr, 0);
      else
        vect_update_misalignment_for_peel (dr, dr0, npeel);
      vect_get_data_access_cost (dr, inside_cost, outside_cost,
                                 body_cost_vec);
      SET_DR_MISALIGNMENT (dr, save_misalignment);
    }
}

/* Traverse peeling hash table and calculate cost for each peeling option.
   Find the one with the lowest cost.  */

int
vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
                                   _vect_peel_extended_info *min)
{
  vect_peel_info elem = *slot;
  int dummy;
  unsigned int inside_cost = 0, outside_cost = 0;
  gimple *stmt = DR_STMT (elem->dr);
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
  stmt_vector_for_cost prologue_cost_vec, body_cost_vec,
                       epilogue_cost_vec;

  prologue_cost_vec.create (2);
  body_cost_vec.create (2);
  epilogue_cost_vec.create (2);

  vect_get_peeling_costs_all_drs (LOOP_VINFO_DATAREFS (loop_vinfo),
                                  elem->dr, &inside_cost, &outside_cost,
                                  &body_cost_vec, elem->npeel, false);

  body_cost_vec.release ();

  outside_cost += vect_get_known_peeling_cost
    (loop_vinfo, elem->npeel, &dummy,
     &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
     &prologue_cost_vec, &epilogue_cost_vec);

  /* Prologue and epilogue costs are added to the target model later.
     These costs depend only on the scalar iteration cost, the
     number of peeling iterations finally chosen, and the number of
     misaligned statements.  So discard the information found here.  */
  prologue_cost_vec.release ();
  epilogue_cost_vec.release ();

  if (inside_cost < min->inside_cost
      || (inside_cost == min->inside_cost
          && outside_cost < min->outside_cost))
    {
      min->inside_cost = inside_cost;
      min->outside_cost = outside_cost;
      min->peel_info.dr = elem->dr;
      min->peel_info.npeel = elem->npeel;
      min->peel_info.count = elem->count;
    }

  return 1;
}


/* Choose best peeling option by traversing peeling hash table and either
   choosing an option with the lowest cost (if cost model is enabled) or the
   option that aligns as many accesses as possible.  */

static struct _vect_peel_extended_info
vect_peeling_hash_choose_best_peeling (hash_table<peel_info_hasher> *peeling_htab,
                                       loop_vec_info loop_vinfo)
{
  struct _vect_peel_extended_info res;

  res.peel_info.dr = NULL;

  if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
    {
      res.inside_cost = INT_MAX;
      res.outside_cost = INT_MAX;
      peeling_htab->traverse <_vect_peel_extended_info *,
                              vect_peeling_hash_get_lowest_cost> (&res);
    }
  else
    {
      res.peel_info.count = 0;
      peeling_htab->traverse <_vect_peel_extended_info *,
                              vect_peeling_hash_get_most_frequent> (&res);
      res.inside_cost = 0;
      res.outside_cost = 0;
    }

  return res;
}

/* Return true if the new peeling NPEEL is supported.  */

static bool
vect_peeling_supportable (loop_vec_info loop_vinfo, struct data_reference *dr0,
                          unsigned npeel)
{
  unsigned i;
  struct data_reference *dr = NULL;
  vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
  gimple *stmt;
  stmt_vec_info stmt_info;
  enum dr_alignment_support supportable_dr_alignment;

  /* Ensure that all data refs can be vectorized after the peel.  */
  FOR_EACH_VEC_ELT (datarefs, i, dr)
    {
      int save_misalignment;

      if (dr == dr0)
        continue;

      stmt = DR_STMT (dr);
      stmt_info = vinfo_for_stmt (stmt);
      /* For interleaving, only the alignment of the first access
         matters.  */
      if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
          && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
        continue;

      /* Strided accesses perform only component accesses, alignment is
         irrelevant for them.  */
      if (STMT_VINFO_STRIDED_P (stmt_info)
          && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
        continue;

      save_misalignment = DR_MISALIGNMENT (dr);
      vect_update_misalignment_for_peel (dr, dr0, npeel);
      supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
      SET_DR_MISALIGNMENT (dr, save_misalignment);

      if (!supportable_dr_alignment)
        return false;
    }

  return true;
}

/* Function vect_enhance_data_refs_alignment

   This pass will use loop versioning and loop peeling in order to enhance
   the alignment of data references in the loop.

   FOR NOW: we assume that whatever versioning/peeling takes place, only the
   original loop is to be vectorized.  Any other loops that are created by
   the transformations performed in this pass - are not supposed to be
   vectorized.  This restriction will be relaxed.

   This pass will require a cost model to guide it whether to apply peeling
   or versioning or a combination of the two.  For example, the scheme that
   Intel uses when given a loop with several memory accesses, is as follows:
   choose one memory access ('p') whose alignment you want to force by doing
   peeling.  Then, either (1) generate a loop in which 'p' is aligned and all
   other accesses are not necessarily aligned, or (2) use loop versioning to
   generate one loop in which all accesses are aligned, and another loop in
   which only 'p' is necessarily aligned.

   ("Automatic Intra-Register Vectorization for the Intel Architecture",
   Aart J.C. Bik, Milind Girkar, Paul M. Grey and Xinmin Tian, International
   Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)

   Devising a cost model is the most critical aspect of this work.  It will
   guide us on which access to peel for, whether to use loop versioning, how
   many versions to create, etc.  The cost model will probably consist of
   generic considerations as well as target specific considerations (on
   powerpc for example, misaligned stores are more painful than misaligned
   loads).

   Here are the general steps involved in alignment enhancements:

   -- original loop, before alignment analysis:
        for (i=0; i<N; i++){
          x = q[i];                     # DR_MISALIGNMENT(q) = unknown
          p[i] = y;                     # DR_MISALIGNMENT(p) = unknown
        }

   -- After vect_compute_data_refs_alignment:
        for (i=0; i<N; i++){
          x = q[i];                     # DR_MISALIGNMENT(q) = 3
          p[i] = y;                     # DR_MISALIGNMENT(p) = unknown
        }

   -- Possibility 1: we do loop versioning:
   if (p is aligned) {
        for (i=0; i<N; i++){            # loop 1A
          x = q[i];                     # DR_MISALIGNMENT(q) = 3
          p[i] = y;                     # DR_MISALIGNMENT(p) = 0
        }
   }
   else {
        for (i=0; i<N; i++){            # loop 1B
          x = q[i];                     # DR_MISALIGNMENT(q) = 3
          p[i] = y;                     # DR_MISALIGNMENT(p) = unaligned
        }
   }

   -- Possibility 2: we do loop peeling:
   for (i = 0; i < 3; i++){             # (scalar loop, not to be vectorized).
        x = q[i];
        p[i] = y;
   }
   for (i = 3; i < N; i++){             # loop 2A
        x = q[i];                       # DR_MISALIGNMENT(q) = 0
        p[i] = y;                       # DR_MISALIGNMENT(p) = unknown
   }

   -- Possibility 3: combination of loop peeling and versioning:
   for (i = 0; i < 3; i++){             # (scalar loop, not to be vectorized).
        x = q[i];
        p[i] = y;
   }
   if (p is aligned) {
        for (i = 3; i<N; i++){          # loop 3A
          x = q[i];                     # DR_MISALIGNMENT(q) = 0
          p[i] = y;                     # DR_MISALIGNMENT(p) = 0
        }
   }
   else {
        for (i = 3; i<N; i++){          # loop 3B
          x = q[i];                     # DR_MISALIGNMENT(q) = 0
          p[i] = y;                     # DR_MISALIGNMENT(p) = unaligned
        }
   }

   These loops are later passed to loop_transform to be vectorized.  The
   vectorizer will use the alignment information to guide the transformation
   (whether to generate regular loads/stores, or with special handling for
   misalignment).  */

bool
vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
{
  vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  enum dr_alignment_support supportable_dr_alignment;
  struct data_reference *dr0 = NULL, *first_store = NULL;
  struct data_reference *dr;
  unsigned int i, j;
  bool do_peeling = false;
  bool do_versioning = false;
  bool stat;
  gimple *stmt;
  stmt_vec_info stmt_info;
  unsigned int npeel = 0;
  bool one_misalignment_known = false;
  bool one_misalignment_unknown = false;
  bool one_dr_unsupportable = false;
  struct data_reference *unsupportable_dr = NULL;
  poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
  unsigned possible_npeel_number = 1;
  tree vectype;
  unsigned int mis, same_align_drs_max = 0;
  hash_table<peel_info_hasher> peeling_htab (1);

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
                     "=== vect_enhance_data_refs_alignment ===\n");

  /* Reset data so we can safely be called multiple times.  */
  LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
  LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = 0;

  /* While cost model enhancements are expected in the future, the high level
     view of the code at this time is as follows:

     A) If there is a misaligned access then see if peeling to align
        this access can make all data references satisfy
        vect_supportable_dr_alignment.  If so, update data structures
        as needed and return true.

     B) If peeling wasn't possible and there is a data reference with an
        unknown misalignment that does not satisfy vect_supportable_dr_alignment
        then see if loop versioning checks can be used to make all data
        references satisfy vect_supportable_dr_alignment.  If so, update
        data structures as needed and return true.

     C) If neither peeling nor versioning were successful then return false if
        any data reference does not satisfy vect_supportable_dr_alignment.

     D) Return true (all data references satisfy vect_supportable_dr_alignment).

     Note, Possibility 3 above (which is peeling and versioning together) is not
     being done at this time.  */

  /* (1) Peeling to force alignment.  */

  /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
     Considerations:
     + How many accesses will become aligned due to the peeling
     - How many accesses will become unaligned due to the peeling,
       and the cost of misaligned accesses.
     - The cost of peeling (the extra runtime checks, the increase
       in code size).  */

  FOR_EACH_VEC_ELT (datarefs, i, dr)
    {
      stmt = DR_STMT (dr);
      stmt_info = vinfo_for_stmt (stmt);

      if (!STMT_VINFO_RELEVANT_P (stmt_info))
        continue;

      /* For interleaving, only the alignment of the first access
         matters.  */
      if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
          && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
        continue;

      /* For invariant accesses there is nothing to enhance.  */
      if (integer_zerop (DR_STEP (dr)))
        continue;

      /* Strided accesses perform only component accesses, alignment is
         irrelevant for them.  */
      if (STMT_VINFO_STRIDED_P (stmt_info)
          && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
        continue;

      supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
      do_peeling = vector_alignment_reachable_p (dr);
      if (do_peeling)
        {
          if (known_alignment_for_access_p (dr))
            {
              unsigned int npeel_tmp = 0;
              bool negative = tree_int_cst_compare (DR_STEP (dr),
                                                    size_zero_node) < 0;

              vectype = STMT_VINFO_VECTYPE (stmt_info);
              unsigned int target_align = DR_TARGET_ALIGNMENT (dr);
              unsigned int dr_size = vect_get_scalar_dr_size (dr);
              mis = (negative ? DR_MISALIGNMENT (dr) : -DR_MISALIGNMENT (dr));
              if (DR_MISALIGNMENT (dr) != 0)
                npeel_tmp = (mis & (target_align - 1)) / dr_size;

              /* For multiple types, it is possible that the bigger type access
                 will have more than one peeling option.  E.g., a loop with two
                 types: one of size (vector size / 4), and the other one of
                 size (vector size / 8).  The vectorization factor will be 8.
                 If both accesses are misaligned by 3, the first one needs one
                 scalar iteration to be aligned, and the second one needs 5.
                 But the first one will also be aligned by peeling 5 scalar
                 iterations, and in that case both accesses will be aligned.
                 Hence, besides the immediate peeling amount, we also want to
                 try adding a full vector size, as long as we don't exceed
                 the vectorization factor.
                 We do this automatically for the cost model, since we
                 calculate the cost for every peeling option.  */
              if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
                {
                  poly_uint64 nscalars = (STMT_SLP_TYPE (stmt_info)
                                          ?
vf * GROUP_SIZE (stmt_info) : vf); 1846 possible_npeel_number 1847 = vect_get_num_vectors (nscalars, vectype); 1848 1849 /* NPEEL_TMP is 0 when there is no misalignment, but also 1850 allow peeling NELEMENTS. */ 1851 if (DR_MISALIGNMENT (dr) == 0) 1852 possible_npeel_number++; 1853 } 1854 1855 /* Save info about DR in the hash table. Also include peeling 1856 amounts according to the explanation above. */ 1857 for (j = 0; j < possible_npeel_number; j++) 1858 { 1859 vect_peeling_hash_insert (&peeling_htab, loop_vinfo, 1860 dr, npeel_tmp); 1861 npeel_tmp += target_align / dr_size; 1862 } 1863 1864 one_misalignment_known = true; 1865 } 1866 else 1867 { 1868 /* If we don't know any misalignment values, we prefer 1869 peeling for data-ref that has the maximum number of data-refs 1870 with the same alignment, unless the target prefers to align 1871 stores over load. */ 1872 unsigned same_align_drs 1873 = STMT_VINFO_SAME_ALIGN_REFS (stmt_info).length (); 1874 if (!dr0 1875 || same_align_drs_max < same_align_drs) 1876 { 1877 same_align_drs_max = same_align_drs; 1878 dr0 = dr; 1879 } 1880 /* For data-refs with the same number of related 1881 accesses prefer the one where the misalign 1882 computation will be invariant in the outermost loop. */ 1883 else if (same_align_drs_max == same_align_drs) 1884 { 1885 struct loop *ivloop0, *ivloop; 1886 ivloop0 = outermost_invariant_loop_for_expr 1887 (loop, DR_BASE_ADDRESS (dr0)); 1888 ivloop = outermost_invariant_loop_for_expr 1889 (loop, DR_BASE_ADDRESS (dr)); 1890 if ((ivloop && !ivloop0) 1891 || (ivloop && ivloop0 1892 && flow_loop_nested_p (ivloop, ivloop0))) 1893 dr0 = dr; 1894 } 1895 1896 one_misalignment_unknown = true; 1897 1898 /* Check for data refs with unsupportable alignment that 1899 can be peeled. */ 1900 if (!supportable_dr_alignment) 1901 { 1902 one_dr_unsupportable = true; 1903 unsupportable_dr = dr; 1904 } 1905 1906 if (!first_store && DR_IS_WRITE (dr)) 1907 first_store = dr; 1908 } 1909 } 1910 else 1911 { 1912 if (!aligned_access_p (dr)) 1913 { 1914 if (dump_enabled_p ()) 1915 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 1916 "vector alignment may not be reachable\n"); 1917 break; 1918 } 1919 } 1920 } 1921 1922 /* Check if we can possibly peel the loop. */ 1923 if (!vect_can_advance_ivs_p (loop_vinfo) 1924 || !slpeel_can_duplicate_loop_p (loop, single_exit (loop)) 1925 || loop->inner) 1926 do_peeling = false; 1927 1928 struct _vect_peel_extended_info peel_for_known_alignment; 1929 struct _vect_peel_extended_info peel_for_unknown_alignment; 1930 struct _vect_peel_extended_info best_peel; 1931 1932 peel_for_unknown_alignment.inside_cost = INT_MAX; 1933 peel_for_unknown_alignment.outside_cost = INT_MAX; 1934 peel_for_unknown_alignment.peel_info.count = 0; 1935 1936 if (do_peeling 1937 && one_misalignment_unknown) 1938 { 1939 /* Check if the target requires to prefer stores over loads, i.e., if 1940 misaligned stores are more expensive than misaligned loads (taking 1941 drs with same alignment into account). 
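     As an illustration (the costs are hypothetical), if peeling for the
     first store gives inside/outside costs of 10/4 while peeling for the
     load in DR0 gives 12/4, the comparison below makes DR0 point to the
     store instead.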
*/ 1942 unsigned int load_inside_cost = 0; 1943 unsigned int load_outside_cost = 0; 1944 unsigned int store_inside_cost = 0; 1945 unsigned int store_outside_cost = 0; 1946 unsigned int estimated_npeels = vect_vf_for_cost (loop_vinfo) / 2; 1947 1948 stmt_vector_for_cost dummy; 1949 dummy.create (2); 1950 vect_get_peeling_costs_all_drs (datarefs, dr0, 1951 &load_inside_cost, 1952 &load_outside_cost, 1953 &dummy, estimated_npeels, true); 1954 dummy.release (); 1955 1956 if (first_store) 1957 { 1958 dummy.create (2); 1959 vect_get_peeling_costs_all_drs (datarefs, first_store, 1960 &store_inside_cost, 1961 &store_outside_cost, 1962 &dummy, estimated_npeels, true); 1963 dummy.release (); 1964 } 1965 else 1966 { 1967 store_inside_cost = INT_MAX; 1968 store_outside_cost = INT_MAX; 1969 } 1970 1971 if (load_inside_cost > store_inside_cost 1972 || (load_inside_cost == store_inside_cost 1973 && load_outside_cost > store_outside_cost)) 1974 { 1975 dr0 = first_store; 1976 peel_for_unknown_alignment.inside_cost = store_inside_cost; 1977 peel_for_unknown_alignment.outside_cost = store_outside_cost; 1978 } 1979 else 1980 { 1981 peel_for_unknown_alignment.inside_cost = load_inside_cost; 1982 peel_for_unknown_alignment.outside_cost = load_outside_cost; 1983 } 1984 1985 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec; 1986 prologue_cost_vec.create (2); 1987 epilogue_cost_vec.create (2); 1988 1989 int dummy2; 1990 peel_for_unknown_alignment.outside_cost += vect_get_known_peeling_cost 1991 (loop_vinfo, estimated_npeels, &dummy2, 1992 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), 1993 &prologue_cost_vec, &epilogue_cost_vec); 1994 1995 prologue_cost_vec.release (); 1996 epilogue_cost_vec.release (); 1997 1998 peel_for_unknown_alignment.peel_info.count = 1 1999 + STMT_VINFO_SAME_ALIGN_REFS 2000 (vinfo_for_stmt (DR_STMT (dr0))).length (); 2001 } 2002 2003 peel_for_unknown_alignment.peel_info.npeel = 0; 2004 peel_for_unknown_alignment.peel_info.dr = dr0; 2005 2006 best_peel = peel_for_unknown_alignment; 2007 2008 peel_for_known_alignment.inside_cost = INT_MAX; 2009 peel_for_known_alignment.outside_cost = INT_MAX; 2010 peel_for_known_alignment.peel_info.count = 0; 2011 peel_for_known_alignment.peel_info.dr = NULL; 2012 2013 if (do_peeling && one_misalignment_known) 2014 { 2015 /* Peeling is possible, but there is no data access that is not supported 2016 unless aligned. So we try to choose the best possible peeling from 2017 the hash table. */ 2018 peel_for_known_alignment = vect_peeling_hash_choose_best_peeling 2019 (&peeling_htab, loop_vinfo); 2020 } 2021 2022 /* Compare costs of peeling for known and unknown alignment. */ 2023 if (peel_for_known_alignment.peel_info.dr != NULL 2024 && peel_for_unknown_alignment.inside_cost 2025 >= peel_for_known_alignment.inside_cost) 2026 { 2027 best_peel = peel_for_known_alignment; 2028 2029 /* If the best peeling for known alignment has NPEEL == 0, perform no 2030 peeling at all except if there is an unsupportable dr that we can 2031 align. */ 2032 if (best_peel.peel_info.npeel == 0 && !one_dr_unsupportable) 2033 do_peeling = false; 2034 } 2035 2036 /* If there is an unsupportable data ref, prefer this over all choices so far 2037 since we'd have to discard a chosen peeling except when it accidentally 2038 aligned the unsupportable data ref. */ 2039 if (one_dr_unsupportable) 2040 dr0 = unsupportable_dr; 2041 else if (do_peeling) 2042 { 2043 /* Calculate the penalty for no peeling, i.e. leaving everything as-is. 2044 TODO: Use nopeel_outside_cost or get rid of it? 
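     For example (the costs are hypothetical), if the unpeeled loop body
     costs 12 and the best peeling candidate's body also costs 12, the
     nopeel_inside_cost <= best_peel.inside_cost test below disables
     peeling, since peeling would only add prologue/epilogue overhead.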
*/ 2045 unsigned nopeel_inside_cost = 0; 2046 unsigned nopeel_outside_cost = 0; 2047 2048 stmt_vector_for_cost dummy; 2049 dummy.create (2); 2050 vect_get_peeling_costs_all_drs (datarefs, NULL, &nopeel_inside_cost, 2051 &nopeel_outside_cost, &dummy, 0, false); 2052 dummy.release (); 2053 2054 /* Add epilogue costs. As we do not peel for alignment here, no prologue 2055 costs will be recorded. */ 2056 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec; 2057 prologue_cost_vec.create (2); 2058 epilogue_cost_vec.create (2); 2059 2060 int dummy2; 2061 nopeel_outside_cost += vect_get_known_peeling_cost 2062 (loop_vinfo, 0, &dummy2, 2063 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), 2064 &prologue_cost_vec, &epilogue_cost_vec); 2065 2066 prologue_cost_vec.release (); 2067 epilogue_cost_vec.release (); 2068 2069 npeel = best_peel.peel_info.npeel; 2070 dr0 = best_peel.peel_info.dr; 2071 2072 /* If no peeling is not more expensive than the best peeling we 2073 have so far, don't perform any peeling. */ 2074 if (nopeel_inside_cost <= best_peel.inside_cost) 2075 do_peeling = false; 2076 } 2077 2078 if (do_peeling) 2079 { 2080 stmt = DR_STMT (dr0); 2081 stmt_info = vinfo_for_stmt (stmt); 2082 vectype = STMT_VINFO_VECTYPE (stmt_info); 2083 2084 if (known_alignment_for_access_p (dr0)) 2085 { 2086 bool negative = tree_int_cst_compare (DR_STEP (dr0), 2087 size_zero_node) < 0; 2088 if (!npeel) 2089 { 2090 /* Since it's known at compile time, compute the number of 2091 iterations in the peeled loop (the peeling factor) for use in 2092 updating DR_MISALIGNMENT values. The peeling factor is the 2093 vectorization factor minus the misalignment as an element 2094 count. */ 2095 mis = negative ? DR_MISALIGNMENT (dr0) : -DR_MISALIGNMENT (dr0); 2096 unsigned int target_align = DR_TARGET_ALIGNMENT (dr0); 2097 npeel = ((mis & (target_align - 1)) 2098 / vect_get_scalar_dr_size (dr0)); 2099 } 2100 2101 /* For interleaved data access every iteration accesses all the 2102 members of the group, therefore we divide the number of iterations 2103 by the group size. */ 2104 stmt_info = vinfo_for_stmt (DR_STMT (dr0)); 2105 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)) 2106 npeel /= GROUP_SIZE (stmt_info); 2107 2108 if (dump_enabled_p ()) 2109 dump_printf_loc (MSG_NOTE, vect_location, 2110 "Try peeling by %d\n", npeel); 2111 } 2112 2113 /* Ensure that all datarefs can be vectorized after the peel. */ 2114 if (!vect_peeling_supportable (loop_vinfo, dr0, npeel)) 2115 do_peeling = false; 2116 2117 /* Check if all datarefs are supportable and log. */ 2118 if (do_peeling && known_alignment_for_access_p (dr0) && npeel == 0) 2119 { 2120 stat = vect_verify_datarefs_alignment (loop_vinfo); 2121 if (!stat) 2122 do_peeling = false; 2123 else 2124 return stat; 2125 } 2126 2127 /* Cost model #1 - honor --param vect-max-peeling-for-alignment. 
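     For example, with a 16-byte target alignment and 4-byte elements, an
     access whose misalignment is unknown may need up to 16/4 - 1 = 3
     peeled iterations, so --param vect-max-peeling-for-alignment=2 would
     disable peeling here (a value of -1 means no limit).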
*/ 2128 if (do_peeling) 2129 { 2130 unsigned max_allowed_peel 2131 = PARAM_VALUE (PARAM_VECT_MAX_PEELING_FOR_ALIGNMENT); 2132 if (max_allowed_peel != (unsigned)-1) 2133 { 2134 unsigned max_peel = npeel; 2135 if (max_peel == 0) 2136 { 2137 unsigned int target_align = DR_TARGET_ALIGNMENT (dr0); 2138 max_peel = target_align / vect_get_scalar_dr_size (dr0) - 1; 2139 } 2140 if (max_peel > max_allowed_peel) 2141 { 2142 do_peeling = false; 2143 if (dump_enabled_p ()) 2144 dump_printf_loc (MSG_NOTE, vect_location, 2145 "Disable peeling, max peels reached: %d\n", max_peel); 2146 } 2147 } 2148 } 2149 2150 /* Cost model #2 - if peeling may result in a remaining loop not 2151 iterating enough to be vectorized then do not peel. Since this 2152 is a cost heuristic rather than a correctness decision, use the 2153 most likely runtime value for variable vectorization factors. */ 2154 if (do_peeling 2155 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)) 2156 { 2157 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo); 2158 unsigned int max_peel = npeel == 0 ? assumed_vf - 1 : npeel; 2159 if ((unsigned HOST_WIDE_INT) LOOP_VINFO_INT_NITERS (loop_vinfo) 2160 < assumed_vf + max_peel) 2161 do_peeling = false; 2162 } 2163 2164 if (do_peeling) 2165 { 2166 /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i. 2167 If the misalignment of DR_i is identical to that of dr0 then set 2168 DR_MISALIGNMENT (DR_i) to zero. If the misalignment of DR_i and 2169 dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i) 2170 by the peeling factor times the element size of DR_i (MOD the 2171 vectorization factor times the size). Otherwise, the 2172 misalignment of DR_i must be set to unknown. */ 2173 FOR_EACH_VEC_ELT (datarefs, i, dr) 2174 if (dr != dr0) 2175 { 2176 /* Strided accesses perform only component accesses, alignment 2177 is irrelevant for them. */ 2178 stmt_info = vinfo_for_stmt (DR_STMT (dr)); 2179 if (STMT_VINFO_STRIDED_P (stmt_info) 2180 && !STMT_VINFO_GROUPED_ACCESS (stmt_info)) 2181 continue; 2182 2183 vect_update_misalignment_for_peel (dr, dr0, npeel); 2184 } 2185 2186 LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0; 2187 if (npeel) 2188 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel; 2189 else 2190 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) 2191 = DR_MISALIGNMENT (dr0); 2192 SET_DR_MISALIGNMENT (dr0, 0); 2193 if (dump_enabled_p ()) 2194 { 2195 dump_printf_loc (MSG_NOTE, vect_location, 2196 "Alignment of access forced using peeling.\n"); 2197 dump_printf_loc (MSG_NOTE, vect_location, 2198 "Peeling for alignment will be applied.\n"); 2199 } 2200 2201 /* The inside-loop cost will be accounted for in vectorizable_load 2202 and vectorizable_store correctly with adjusted alignments. 2203 Drop the body_cst_vec on the floor here. */ 2204 stat = vect_verify_datarefs_alignment (loop_vinfo); 2205 gcc_assert (stat); 2206 return stat; 2207 } 2208 } 2209 2210 /* (2) Versioning to force alignment. */ 2211 2212 /* Try versioning if: 2213 1) optimize loop for speed 2214 2) there is at least one unsupported misaligned data ref with an unknown 2215 misalignment, and 2216 3) all misaligned data refs with a known misalignment are supported, and 2217 4) the number of runtime alignment checks is within reason. 
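     The run-time condition generated for versioning is an alignment mask
     test on the addresses of the statements collected in
     LOOP_VINFO_MAY_MISALIGN_STMTS; e.g. for 16-byte vectors the mask is
     0xf and, roughly, ((addr_1 | ... | addr_n) & 0xf) == 0 selects the
     aligned version of the loop.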
*/ 2218 2219 do_versioning = 2220 optimize_loop_nest_for_speed_p (loop) 2221 && (!loop->inner); /* FORNOW */ 2222 2223 if (do_versioning) 2224 { 2225 FOR_EACH_VEC_ELT (datarefs, i, dr) 2226 { 2227 stmt = DR_STMT (dr); 2228 stmt_info = vinfo_for_stmt (stmt); 2229 2230 /* For interleaving, only the alignment of the first access 2231 matters. */ 2232 if (aligned_access_p (dr) 2233 || (STMT_VINFO_GROUPED_ACCESS (stmt_info) 2234 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)) 2235 continue; 2236 2237 if (STMT_VINFO_STRIDED_P (stmt_info)) 2238 { 2239 /* Strided loads perform only component accesses, alignment is 2240 irrelevant for them. */ 2241 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info)) 2242 continue; 2243 do_versioning = false; 2244 break; 2245 } 2246 2247 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false); 2248 2249 if (!supportable_dr_alignment) 2250 { 2251 gimple *stmt; 2252 int mask; 2253 tree vectype; 2254 2255 if (known_alignment_for_access_p (dr) 2256 || LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length () 2257 >= (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS)) 2258 { 2259 do_versioning = false; 2260 break; 2261 } 2262 2263 stmt = DR_STMT (dr); 2264 vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt)); 2265 gcc_assert (vectype); 2266 2267 /* At present we don't support versioning for alignment 2268 with variable VF, since there's no guarantee that the 2269 VF is a power of two. We could relax this if we added 2270 a way of enforcing a power-of-two size. */ 2271 unsigned HOST_WIDE_INT size; 2272 if (!GET_MODE_SIZE (TYPE_MODE (vectype)).is_constant (&size)) 2273 { 2274 do_versioning = false; 2275 break; 2276 } 2277 2278 /* The rightmost bits of an aligned address must be zeros. 2279 Construct the mask needed for this test. For example, 2280 GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the 2281 mask must be 15 = 0xf. */ 2282 mask = size - 1; 2283 2284 /* FORNOW: use the same mask to test all potentially unaligned 2285 references in the loop. The vectorizer currently supports 2286 a single vector size, see the reference to 2287 GET_MODE_NUNITS (TYPE_MODE (vectype)) where the 2288 vectorization factor is computed. */ 2289 gcc_assert (!LOOP_VINFO_PTR_MASK (loop_vinfo) 2290 || LOOP_VINFO_PTR_MASK (loop_vinfo) == mask); 2291 LOOP_VINFO_PTR_MASK (loop_vinfo) = mask; 2292 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push ( 2293 DR_STMT (dr)); 2294 } 2295 } 2296 2297 /* Versioning requires at least one misaligned data reference. */ 2298 if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)) 2299 do_versioning = false; 2300 else if (!do_versioning) 2301 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0); 2302 } 2303 2304 if (do_versioning) 2305 { 2306 vec<gimple *> may_misalign_stmts 2307 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo); 2308 gimple *stmt; 2309 2310 /* It can now be assumed that the data references in the statements 2311 in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version 2312 of the loop being vectorized. 
*/ 2313 FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt) 2314 { 2315 stmt_vec_info stmt_info = vinfo_for_stmt (stmt); 2316 dr = STMT_VINFO_DATA_REF (stmt_info); 2317 SET_DR_MISALIGNMENT (dr, 0); 2318 if (dump_enabled_p ()) 2319 dump_printf_loc (MSG_NOTE, vect_location, 2320 "Alignment of access forced using versioning.\n"); 2321 } 2322 2323 if (dump_enabled_p ()) 2324 dump_printf_loc (MSG_NOTE, vect_location, 2325 "Versioning for alignment will be applied.\n"); 2326 2327 /* Peeling and versioning can't be done together at this time. */ 2328 gcc_assert (! (do_peeling && do_versioning)); 2329 2330 stat = vect_verify_datarefs_alignment (loop_vinfo); 2331 gcc_assert (stat); 2332 return stat; 2333 } 2334 2335 /* This point is reached if neither peeling nor versioning is being done. */ 2336 gcc_assert (! (do_peeling || do_versioning)); 2337 2338 stat = vect_verify_datarefs_alignment (loop_vinfo); 2339 return stat; 2340 } 2341 2342 2343 /* Function vect_find_same_alignment_drs. 2344 2345 Update group and alignment relations according to the chosen 2346 vectorization factor. */ 2347 2348 static void 2349 vect_find_same_alignment_drs (struct data_dependence_relation *ddr) 2350 { 2351 struct data_reference *dra = DDR_A (ddr); 2352 struct data_reference *drb = DDR_B (ddr); 2353 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra)); 2354 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb)); 2355 2356 if (DDR_ARE_DEPENDENT (ddr) == chrec_known) 2357 return; 2358 2359 if (dra == drb) 2360 return; 2361 2362 if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0) 2363 || !operand_equal_p (DR_OFFSET (dra), DR_OFFSET (drb), 0) 2364 || !operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0)) 2365 return; 2366 2367 /* Two references with distance zero have the same alignment. */ 2368 poly_offset_int diff = (wi::to_poly_offset (DR_INIT (dra)) 2369 - wi::to_poly_offset (DR_INIT (drb))); 2370 if (maybe_ne (diff, 0)) 2371 { 2372 /* Get the wider of the two alignments. */ 2373 unsigned int align_a = (vect_calculate_target_alignment (dra) 2374 / BITS_PER_UNIT); 2375 unsigned int align_b = (vect_calculate_target_alignment (drb) 2376 / BITS_PER_UNIT); 2377 unsigned int max_align = MAX (align_a, align_b); 2378 2379 /* Require the gap to be a multiple of the larger vector alignment. */ 2380 if (!multiple_p (diff, max_align)) 2381 return; 2382 } 2383 2384 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_a).safe_push (drb); 2385 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_b).safe_push (dra); 2386 if (dump_enabled_p ()) 2387 { 2388 dump_printf_loc (MSG_NOTE, vect_location, 2389 "accesses have the same alignment: "); 2390 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra)); 2391 dump_printf (MSG_NOTE, " and "); 2392 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb)); 2393 dump_printf (MSG_NOTE, "\n"); 2394 } 2395 } 2396 2397 2398 /* Function vect_analyze_data_refs_alignment 2399 2400 Analyze the alignment of the data-references in the loop. 2401 Return FALSE if a data reference is found that cannot be vectorized. */ 2402 2403 bool 2404 vect_analyze_data_refs_alignment (loop_vec_info vinfo) 2405 { 2406 if (dump_enabled_p ()) 2407 dump_printf_loc (MSG_NOTE, vect_location, 2408 "=== vect_analyze_data_refs_alignment ===\n"); 2409 2410 /* Mark groups of data references with same alignment using 2411 data dependence information. 
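     For example, assuming a 16-byte target vector alignment and 4-byte
     elements, a[i] and a[i+4] are 16 bytes apart, a multiple of that
     alignment, so they are recorded as having the same alignment;
     a[i] and a[i+1] (4 bytes apart) are not.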
*/ 2412 vec<ddr_p> ddrs = vinfo->ddrs; 2413 struct data_dependence_relation *ddr; 2414 unsigned int i; 2415 2416 FOR_EACH_VEC_ELT (ddrs, i, ddr) 2417 vect_find_same_alignment_drs (ddr); 2418 2419 vec<data_reference_p> datarefs = vinfo->datarefs; 2420 struct data_reference *dr; 2421 2422 vect_record_base_alignments (vinfo); 2423 FOR_EACH_VEC_ELT (datarefs, i, dr) 2424 { 2425 stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr)); 2426 if (STMT_VINFO_VECTORIZABLE (stmt_info) 2427 && !vect_compute_data_ref_alignment (dr)) 2428 { 2429 /* Strided accesses perform only component accesses, misalignment 2430 information is irrelevant for them. */ 2431 if (STMT_VINFO_STRIDED_P (stmt_info) 2432 && !STMT_VINFO_GROUPED_ACCESS (stmt_info)) 2433 continue; 2434 2435 if (dump_enabled_p ()) 2436 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 2437 "not vectorized: can't calculate alignment " 2438 "for data ref.\n"); 2439 2440 return false; 2441 } 2442 } 2443 2444 return true; 2445 } 2446 2447 2448 /* Analyze alignment of DRs of stmts in NODE. */ 2449 2450 static bool 2451 vect_slp_analyze_and_verify_node_alignment (slp_tree node) 2452 { 2453 /* We vectorize from the first scalar stmt in the node unless 2454 the node is permuted in which case we start from the first 2455 element in the group. */ 2456 gimple *first_stmt = SLP_TREE_SCALAR_STMTS (node)[0]; 2457 data_reference_p first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt)); 2458 if (SLP_TREE_LOAD_PERMUTATION (node).exists ()) 2459 first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (first_stmt)); 2460 2461 data_reference_p dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt)); 2462 if (! vect_compute_data_ref_alignment (dr) 2463 /* For creating the data-ref pointer we need alignment of the 2464 first element anyway. */ 2465 || (dr != first_dr 2466 && ! vect_compute_data_ref_alignment (first_dr)) 2467 || ! verify_data_ref_alignment (dr)) 2468 { 2469 if (dump_enabled_p ()) 2470 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 2471 "not vectorized: bad data alignment in basic " 2472 "block.\n"); 2473 return false; 2474 } 2475 2476 return true; 2477 } 2478 2479 /* Function vect_slp_analyze_instance_alignment 2480 2481 Analyze the alignment of the data-references in the SLP instance. 2482 Return FALSE if a data reference is found that cannot be vectorized. */ 2483 2484 bool 2485 vect_slp_analyze_and_verify_instance_alignment (slp_instance instance) 2486 { 2487 if (dump_enabled_p ()) 2488 dump_printf_loc (MSG_NOTE, vect_location, 2489 "=== vect_slp_analyze_and_verify_instance_alignment ===\n"); 2490 2491 slp_tree node; 2492 unsigned i; 2493 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, node) 2494 if (! vect_slp_analyze_and_verify_node_alignment (node)) 2495 return false; 2496 2497 node = SLP_INSTANCE_TREE (instance); 2498 if (STMT_VINFO_DATA_REF (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (node)[0])) 2499 && ! vect_slp_analyze_and_verify_node_alignment 2500 (SLP_INSTANCE_TREE (instance))) 2501 return false; 2502 2503 return true; 2504 } 2505 2506 2507 /* Analyze groups of accesses: check that DR belongs to a group of 2508 accesses of legal size, step, etc. Detect gaps, single element 2509 interleaving, and other special cases. Set grouped access info. 2510 Collect groups of strided stores for further use in SLP analysis. 2511 Worker for vect_analyze_group_access. 
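     For example, with 4-byte elements, stores to a[3*i], a[3*i+1] and
     a[3*i+2] form a group of size 3 (step 12 bytes, no gaps), while a
     loop that reads only a[3*i] is recorded as single element
     interleaving with GROUP_SIZE 3 and GROUP_GAP 2.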
*/

static bool
vect_analyze_group_access_1 (struct data_reference *dr)
{
  tree step = DR_STEP (dr);
  tree scalar_type = TREE_TYPE (DR_REF (dr));
  HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
  gimple *stmt = DR_STMT (dr);
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
  bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
  HOST_WIDE_INT dr_step = -1;
  HOST_WIDE_INT groupsize, last_accessed_element = 1;
  bool slp_impossible = false;

  /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
     size of the interleaving group (including gaps).  */
  if (tree_fits_shwi_p (step))
    {
      dr_step = tree_to_shwi (step);
      /* Check that STEP is a multiple of the type size.  Otherwise there is
         a non-element-sized gap at the end of the group which we
         cannot represent in GROUP_GAP or GROUP_SIZE.
         ??? As we can handle non-constant step fine here we should
         simply remove uses of GROUP_GAP between the last and first
         element and instead rely on DR_STEP.  GROUP_SIZE then would
         simply not include that gap.  */
      if ((dr_step % type_size) != 0)
        {
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_NOTE, vect_location,
                               "Step ");
              dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
              dump_printf (MSG_NOTE,
                           " is not a multiple of the element size for ");
              dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
              dump_printf (MSG_NOTE, "\n");
            }
          return false;
        }
      groupsize = absu_hwi (dr_step) / type_size;
    }
  else
    groupsize = 0;

  /* A non-consecutive access is possible only if it is part of an
     interleaving group.  */
  if (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
    {
      /* Check whether this DR is part of an interleaving group, and is a
         single element of the group that is accessed in the loop.  */

      /* Gaps are supported only for loads.  STEP must be a multiple of the
         type size.  */
      if (DR_IS_READ (dr)
          && (dr_step % type_size) == 0
          && groupsize > 0)
        {
          GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = stmt;
          GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
          GROUP_GAP (stmt_info) = groupsize - 1;
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_NOTE, vect_location,
                               "Detected single element interleaving ");
              dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
              dump_printf (MSG_NOTE, " step ");
              dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
              dump_printf (MSG_NOTE, "\n");
            }

          return true;
        }

      if (dump_enabled_p ())
        {
          dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                           "not consecutive access ");
          dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
        }

      if (bb_vinfo)
        {
          /* Mark the statement as unvectorizable.  */
          STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
          return true;
        }

      dump_printf_loc (MSG_NOTE, vect_location, "using strided accesses\n");
      STMT_VINFO_STRIDED_P (stmt_info) = true;
      return true;
    }

  if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) == stmt)
    {
      /* First stmt in the interleaving chain.  Check the chain.
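         Walk GROUP_NEXT_ELEMENT from here, counting the members, recording
         each member's GROUP_GAP from its predecessor, and rejecting chains
         that stores cannot handle (shared data-refs or gaps).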
*/ 2608 gimple *next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt)); 2609 struct data_reference *data_ref = dr; 2610 unsigned int count = 1; 2611 tree prev_init = DR_INIT (data_ref); 2612 gimple *prev = stmt; 2613 HOST_WIDE_INT diff, gaps = 0; 2614 2615 /* By construction, all group members have INTEGER_CST DR_INITs. */ 2616 while (next) 2617 { 2618 /* Skip same data-refs. In case that two or more stmts share 2619 data-ref (supported only for loads), we vectorize only the first 2620 stmt, and the rest get their vectorized loads from the first 2621 one. */ 2622 if (!tree_int_cst_compare (DR_INIT (data_ref), 2623 DR_INIT (STMT_VINFO_DATA_REF ( 2624 vinfo_for_stmt (next))))) 2625 { 2626 if (DR_IS_WRITE (data_ref)) 2627 { 2628 if (dump_enabled_p ()) 2629 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 2630 "Two store stmts share the same dr.\n"); 2631 return false; 2632 } 2633 2634 if (dump_enabled_p ()) 2635 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 2636 "Two or more load stmts share the same dr.\n"); 2637 2638 /* For load use the same data-ref load. */ 2639 GROUP_SAME_DR_STMT (vinfo_for_stmt (next)) = prev; 2640 2641 prev = next; 2642 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next)); 2643 continue; 2644 } 2645 2646 prev = next; 2647 data_ref = STMT_VINFO_DATA_REF (vinfo_for_stmt (next)); 2648 2649 /* All group members have the same STEP by construction. */ 2650 gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0)); 2651 2652 /* Check that the distance between two accesses is equal to the type 2653 size. Otherwise, we have gaps. */ 2654 diff = (TREE_INT_CST_LOW (DR_INIT (data_ref)) 2655 - TREE_INT_CST_LOW (prev_init)) / type_size; 2656 if (diff != 1) 2657 { 2658 /* FORNOW: SLP of accesses with gaps is not supported. */ 2659 slp_impossible = true; 2660 if (DR_IS_WRITE (data_ref)) 2661 { 2662 if (dump_enabled_p ()) 2663 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 2664 "interleaved store with gaps\n"); 2665 return false; 2666 } 2667 2668 gaps += diff - 1; 2669 } 2670 2671 last_accessed_element += diff; 2672 2673 /* Store the gap from the previous member of the group. If there is no 2674 gap in the access, GROUP_GAP is always 1. */ 2675 GROUP_GAP (vinfo_for_stmt (next)) = diff; 2676 2677 prev_init = DR_INIT (data_ref); 2678 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next)); 2679 /* Count the number of data-refs in the chain. */ 2680 count++; 2681 } 2682 2683 if (groupsize == 0) 2684 groupsize = count + gaps; 2685 2686 /* This could be UINT_MAX but as we are generating code in a very 2687 inefficient way we have to cap earlier. See PR78699 for example. */ 2688 if (groupsize > 4096) 2689 { 2690 if (dump_enabled_p ()) 2691 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 2692 "group is too large\n"); 2693 return false; 2694 } 2695 2696 /* Check that the size of the interleaving is equal to count for stores, 2697 i.e., that there are no gaps. */ 2698 if (groupsize != count 2699 && !DR_IS_READ (dr)) 2700 { 2701 if (dump_enabled_p ()) 2702 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 2703 "interleaved store with gaps\n"); 2704 return false; 2705 } 2706 2707 /* If there is a gap after the last load in the group it is the 2708 difference between the groupsize and the last accessed 2709 element. 2710 When there is no gap, this difference should be 0. 
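         For example, a group that loads only a[4*i] and a[4*i+1], with a
         step of 4 elements, has groupsize 4 and last_accessed_element 2,
         so a trailing gap of 2 elements is recorded on the first stmt.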
*/ 2711 GROUP_GAP (vinfo_for_stmt (stmt)) = groupsize - last_accessed_element; 2712 2713 GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize; 2714 if (dump_enabled_p ()) 2715 { 2716 dump_printf_loc (MSG_NOTE, vect_location, 2717 "Detected interleaving "); 2718 if (DR_IS_READ (dr)) 2719 dump_printf (MSG_NOTE, "load "); 2720 else 2721 dump_printf (MSG_NOTE, "store "); 2722 dump_printf (MSG_NOTE, "of size %u starting with ", 2723 (unsigned)groupsize); 2724 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0); 2725 if (GROUP_GAP (vinfo_for_stmt (stmt)) != 0) 2726 dump_printf_loc (MSG_NOTE, vect_location, 2727 "There is a gap of %u elements after the group\n", 2728 GROUP_GAP (vinfo_for_stmt (stmt))); 2729 } 2730 2731 /* SLP: create an SLP data structure for every interleaving group of 2732 stores for further analysis in vect_analyse_slp. */ 2733 if (DR_IS_WRITE (dr) && !slp_impossible) 2734 { 2735 if (loop_vinfo) 2736 LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt); 2737 if (bb_vinfo) 2738 BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt); 2739 } 2740 } 2741 2742 return true; 2743 } 2744 2745 /* Analyze groups of accesses: check that DR belongs to a group of 2746 accesses of legal size, step, etc. Detect gaps, single element 2747 interleaving, and other special cases. Set grouped access info. 2748 Collect groups of strided stores for further use in SLP analysis. */ 2749 2750 static bool 2751 vect_analyze_group_access (struct data_reference *dr) 2752 { 2753 if (!vect_analyze_group_access_1 (dr)) 2754 { 2755 /* Dissolve the group if present. */ 2756 gimple *next; 2757 gimple *stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dr))); 2758 while (stmt) 2759 { 2760 stmt_vec_info vinfo = vinfo_for_stmt (stmt); 2761 next = GROUP_NEXT_ELEMENT (vinfo); 2762 GROUP_FIRST_ELEMENT (vinfo) = NULL; 2763 GROUP_NEXT_ELEMENT (vinfo) = NULL; 2764 stmt = next; 2765 } 2766 return false; 2767 } 2768 return true; 2769 } 2770 2771 /* Analyze the access pattern of the data-reference DR. 2772 In case of non-consecutive accesses call vect_analyze_group_access() to 2773 analyze groups of accesses. */ 2774 2775 static bool 2776 vect_analyze_data_ref_access (struct data_reference *dr) 2777 { 2778 tree step = DR_STEP (dr); 2779 tree scalar_type = TREE_TYPE (DR_REF (dr)); 2780 gimple *stmt = DR_STMT (dr); 2781 stmt_vec_info stmt_info = vinfo_for_stmt (stmt); 2782 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); 2783 struct loop *loop = NULL; 2784 2785 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info)) 2786 return true; 2787 2788 if (loop_vinfo) 2789 loop = LOOP_VINFO_LOOP (loop_vinfo); 2790 2791 if (loop_vinfo && !step) 2792 { 2793 if (dump_enabled_p ()) 2794 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 2795 "bad data-ref access in loop\n"); 2796 return false; 2797 } 2798 2799 /* Allow loads with zero step in inner-loop vectorization. */ 2800 if (loop_vinfo && integer_zerop (step)) 2801 { 2802 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL; 2803 if (!nested_in_vect_loop_p (loop, stmt)) 2804 return DR_IS_READ (dr); 2805 /* Allow references with zero step for outer loops marked 2806 with pragma omp simd only - it guarantees absence of 2807 loop-carried dependencies between inner loop iterations. 
*/ 2808 if (loop->safelen < 2) 2809 { 2810 if (dump_enabled_p ()) 2811 dump_printf_loc (MSG_NOTE, vect_location, 2812 "zero step in inner loop of nest\n"); 2813 return false; 2814 } 2815 } 2816 2817 if (loop && nested_in_vect_loop_p (loop, stmt)) 2818 { 2819 /* Interleaved accesses are not yet supported within outer-loop 2820 vectorization for references in the inner-loop. */ 2821 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL; 2822 2823 /* For the rest of the analysis we use the outer-loop step. */ 2824 step = STMT_VINFO_DR_STEP (stmt_info); 2825 if (integer_zerop (step)) 2826 { 2827 if (dump_enabled_p ()) 2828 dump_printf_loc (MSG_NOTE, vect_location, 2829 "zero step in outer loop.\n"); 2830 return DR_IS_READ (dr); 2831 } 2832 } 2833 2834 /* Consecutive? */ 2835 if (TREE_CODE (step) == INTEGER_CST) 2836 { 2837 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step); 2838 if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type)) 2839 || (dr_step < 0 2840 && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step))) 2841 { 2842 /* Mark that it is not interleaving. */ 2843 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL; 2844 return true; 2845 } 2846 } 2847 2848 if (loop && nested_in_vect_loop_p (loop, stmt)) 2849 { 2850 if (dump_enabled_p ()) 2851 dump_printf_loc (MSG_NOTE, vect_location, 2852 "grouped access in outer loop.\n"); 2853 return false; 2854 } 2855 2856 2857 /* Assume this is a DR handled by non-constant strided load case. */ 2858 if (TREE_CODE (step) != INTEGER_CST) 2859 return (STMT_VINFO_STRIDED_P (stmt_info) 2860 && (!STMT_VINFO_GROUPED_ACCESS (stmt_info) 2861 || vect_analyze_group_access (dr))); 2862 2863 /* Not consecutive access - check if it's a part of interleaving group. */ 2864 return vect_analyze_group_access (dr); 2865 } 2866 2867 /* Compare two data-references DRA and DRB to group them into chunks 2868 suitable for grouping. */ 2869 2870 static int 2871 dr_group_sort_cmp (const void *dra_, const void *drb_) 2872 { 2873 data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_); 2874 data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_); 2875 int cmp; 2876 2877 /* Stabilize sort. */ 2878 if (dra == drb) 2879 return 0; 2880 2881 /* DRs in different loops never belong to the same group. */ 2882 loop_p loopa = gimple_bb (DR_STMT (dra))->loop_father; 2883 loop_p loopb = gimple_bb (DR_STMT (drb))->loop_father; 2884 if (loopa != loopb) 2885 return loopa->num < loopb->num ? -1 : 1; 2886 2887 /* Ordering of DRs according to base. */ 2888 cmp = data_ref_compare_tree (DR_BASE_ADDRESS (dra), 2889 DR_BASE_ADDRESS (drb)); 2890 if (cmp != 0) 2891 return cmp; 2892 2893 /* And according to DR_OFFSET. */ 2894 cmp = data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb)); 2895 if (cmp != 0) 2896 return cmp; 2897 2898 /* Put reads before writes. */ 2899 if (DR_IS_READ (dra) != DR_IS_READ (drb)) 2900 return DR_IS_READ (dra) ? -1 : 1; 2901 2902 /* Then sort after access size. */ 2903 cmp = data_ref_compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))), 2904 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)))); 2905 if (cmp != 0) 2906 return cmp; 2907 2908 /* And after step. */ 2909 cmp = data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb)); 2910 if (cmp != 0) 2911 return cmp; 2912 2913 /* Then sort after DR_INIT. In case of identical DRs sort after stmt UID. */ 2914 cmp = data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb)); 2915 if (cmp == 0) 2916 return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? 
-1 : 1; 2917 return cmp; 2918 } 2919 2920 /* If OP is the result of a conversion, return the unconverted value, 2921 otherwise return null. */ 2922 2923 static tree 2924 strip_conversion (tree op) 2925 { 2926 if (TREE_CODE (op) != SSA_NAME) 2927 return NULL_TREE; 2928 gimple *stmt = SSA_NAME_DEF_STMT (op); 2929 if (!is_gimple_assign (stmt) 2930 || !CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (stmt))) 2931 return NULL_TREE; 2932 return gimple_assign_rhs1 (stmt); 2933 } 2934 2935 /* Return true if vectorizable_* routines can handle statements STMT1 2936 and STMT2 being in a single group. */ 2937 2938 static bool 2939 can_group_stmts_p (gimple *stmt1, gimple *stmt2) 2940 { 2941 if (gimple_assign_single_p (stmt1)) 2942 return gimple_assign_single_p (stmt2); 2943 2944 if (is_gimple_call (stmt1) && gimple_call_internal_p (stmt1)) 2945 { 2946 /* Check for two masked loads or two masked stores. */ 2947 if (!is_gimple_call (stmt2) || !gimple_call_internal_p (stmt2)) 2948 return false; 2949 internal_fn ifn = gimple_call_internal_fn (stmt1); 2950 if (ifn != IFN_MASK_LOAD && ifn != IFN_MASK_STORE) 2951 return false; 2952 if (ifn != gimple_call_internal_fn (stmt2)) 2953 return false; 2954 2955 /* Check that the masks are the same. Cope with casts of masks, 2956 like those created by build_mask_conversion. */ 2957 tree mask1 = gimple_call_arg (stmt1, 2); 2958 tree mask2 = gimple_call_arg (stmt2, 2); 2959 if (!operand_equal_p (mask1, mask2, 0)) 2960 { 2961 mask1 = strip_conversion (mask1); 2962 if (!mask1) 2963 return false; 2964 mask2 = strip_conversion (mask2); 2965 if (!mask2) 2966 return false; 2967 if (!operand_equal_p (mask1, mask2, 0)) 2968 return false; 2969 } 2970 return true; 2971 } 2972 2973 return false; 2974 } 2975 2976 /* Function vect_analyze_data_ref_accesses. 2977 2978 Analyze the access pattern of all the data references in the loop. 2979 2980 FORNOW: the only access pattern that is considered vectorizable is a 2981 simple step 1 (consecutive) access. 2982 2983 FORNOW: handle only arrays and pointer accesses. */ 2984 2985 bool 2986 vect_analyze_data_ref_accesses (vec_info *vinfo) 2987 { 2988 unsigned int i; 2989 vec<data_reference_p> datarefs = vinfo->datarefs; 2990 struct data_reference *dr; 2991 2992 if (dump_enabled_p ()) 2993 dump_printf_loc (MSG_NOTE, vect_location, 2994 "=== vect_analyze_data_ref_accesses ===\n"); 2995 2996 if (datarefs.is_empty ()) 2997 return true; 2998 2999 /* Sort the array of datarefs to make building the interleaving chains 3000 linear. Don't modify the original vector's order, it is needed for 3001 determining what dependencies are reversed. */ 3002 vec<data_reference_p> datarefs_copy = datarefs.copy (); 3003 datarefs_copy.qsort (dr_group_sort_cmp); 3004 3005 /* Build the interleaving chains. */ 3006 for (i = 0; i < datarefs_copy.length () - 1;) 3007 { 3008 data_reference_p dra = datarefs_copy[i]; 3009 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra)); 3010 stmt_vec_info lastinfo = NULL; 3011 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a) 3012 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)) 3013 { 3014 ++i; 3015 continue; 3016 } 3017 for (i = i + 1; i < datarefs_copy.length (); ++i) 3018 { 3019 data_reference_p drb = datarefs_copy[i]; 3020 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb)); 3021 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_b) 3022 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b)) 3023 break; 3024 3025 /* ??? 
Imperfect sorting (non-compatible types, non-modulo
             accesses, same accesses) can lead to a group being artificially
             split here as we don't just skip over those.  If it really
             matters we can push those to a worklist and re-iterate
             over them.  Then we could just skip ahead to the next DR here.  */

          /* DRs in a different loop should not be put into the same
             interleaving group.  */
          if (gimple_bb (DR_STMT (dra))->loop_father
              != gimple_bb (DR_STMT (drb))->loop_father)
            break;

          /* Check that the data-refs have the same first location (except
             init) and that they are both either stores or loads (not a load
             and a store, and not masked loads or stores).  */
          if (DR_IS_READ (dra) != DR_IS_READ (drb)
              || data_ref_compare_tree (DR_BASE_ADDRESS (dra),
                                        DR_BASE_ADDRESS (drb)) != 0
              || data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb)) != 0
              || !can_group_stmts_p (DR_STMT (dra), DR_STMT (drb)))
            break;

          /* Check that the data-refs have the same constant size.  */
          tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
          tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
          if (!tree_fits_uhwi_p (sza)
              || !tree_fits_uhwi_p (szb)
              || !tree_int_cst_equal (sza, szb))
            break;

          /* Check that the data-refs have the same step.  */
          if (data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb)) != 0)
            break;

          /* Check the types are compatible.
             ??? We don't distinguish this during sorting.  */
          if (!types_compatible_p (TREE_TYPE (DR_REF (dra)),
                                   TREE_TYPE (DR_REF (drb))))
            break;

          /* Check that the DR_INITs are compile-time constants.  */
          if (TREE_CODE (DR_INIT (dra)) != INTEGER_CST
              || TREE_CODE (DR_INIT (drb)) != INTEGER_CST)
            break;

          /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb).  */
          HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
          HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
          HOST_WIDE_INT init_prev
            = TREE_INT_CST_LOW (DR_INIT (datarefs_copy[i-1]));
          gcc_assert (init_a <= init_b
                      && init_a <= init_prev
                      && init_prev <= init_b);

          /* Do not place the same access in the interleaving chain twice.  */
          if (init_b == init_prev)
            {
              gcc_assert (gimple_uid (DR_STMT (datarefs_copy[i-1]))
                          < gimple_uid (DR_STMT (drb)));
              /* ??? For now we simply "drop" the later reference which is
                 otherwise the same rather than finishing off this group.
                 In the end we'd want to re-process duplicates forming
                 multiple groups from the refs, likely by just collecting
                 all candidates (including duplicates and split points
                 below) in a vector and then processing them together.  */
              continue;
            }

          /* If init_b == init_a + the size of the type * k, we have an
             interleaving, and DRA is accessed before DRB.  */
          HOST_WIDE_INT type_size_a = tree_to_uhwi (sza);
          if (type_size_a == 0
              || (init_b - init_a) % type_size_a != 0)
            break;

          /* If we have a store, the accesses are adjacent.  This splits
             groups into chunks we support (we don't support vectorization
             of stores with gaps).  */
          if (!DR_IS_READ (dra) && init_b - init_prev != type_size_a)
            break;

          /* If the step (when constant and nonzero) is greater than the
             difference between the data-refs' inits, this splits groups
             into suitable sizes.
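             For example, with 4-byte elements and a step of 8 bytes,
             accesses with inits 0 and 4 (a[2*i] and a[2*i+1]) stay in one
             group, while inits 0 and 8 (a[2*i] and a[2*i+2]) are split
             because the step is <= init_b - init_a.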
*/ 3109 if (tree_fits_shwi_p (DR_STEP (dra))) 3110 { 3111 HOST_WIDE_INT step = tree_to_shwi (DR_STEP (dra)); 3112 if (step != 0 && step <= (init_b - init_a)) 3113 break; 3114 } 3115 3116 if (dump_enabled_p ()) 3117 { 3118 dump_printf_loc (MSG_NOTE, vect_location, 3119 "Detected interleaving "); 3120 if (DR_IS_READ (dra)) 3121 dump_printf (MSG_NOTE, "load "); 3122 else 3123 dump_printf (MSG_NOTE, "store "); 3124 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra)); 3125 dump_printf (MSG_NOTE, " and "); 3126 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb)); 3127 dump_printf (MSG_NOTE, "\n"); 3128 } 3129 3130 /* Link the found element into the group list. */ 3131 if (!GROUP_FIRST_ELEMENT (stmtinfo_a)) 3132 { 3133 GROUP_FIRST_ELEMENT (stmtinfo_a) = DR_STMT (dra); 3134 lastinfo = stmtinfo_a; 3135 } 3136 GROUP_FIRST_ELEMENT (stmtinfo_b) = DR_STMT (dra); 3137 GROUP_NEXT_ELEMENT (lastinfo) = DR_STMT (drb); 3138 lastinfo = stmtinfo_b; 3139 } 3140 } 3141 3142 FOR_EACH_VEC_ELT (datarefs_copy, i, dr) 3143 if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) 3144 && !vect_analyze_data_ref_access (dr)) 3145 { 3146 if (dump_enabled_p ()) 3147 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 3148 "not vectorized: complicated access pattern.\n"); 3149 3150 if (is_a <bb_vec_info> (vinfo)) 3151 { 3152 /* Mark the statement as not vectorizable. */ 3153 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false; 3154 continue; 3155 } 3156 else 3157 { 3158 datarefs_copy.release (); 3159 return false; 3160 } 3161 } 3162 3163 datarefs_copy.release (); 3164 return true; 3165 } 3166 3167 /* Function vect_vfa_segment_size. 3168 3169 Input: 3170 DR: The data reference. 3171 LENGTH_FACTOR: segment length to consider. 3172 3173 Return a value suitable for the dr_with_seg_len::seg_len field. 3174 This is the "distance travelled" by the pointer from the first 3175 iteration in the segment to the last. Note that it does not include 3176 the size of the access; in effect it only describes the first byte. */ 3177 3178 static tree 3179 vect_vfa_segment_size (struct data_reference *dr, tree length_factor) 3180 { 3181 length_factor = size_binop (MINUS_EXPR, 3182 fold_convert (sizetype, length_factor), 3183 size_one_node); 3184 return size_binop (MULT_EXPR, fold_convert (sizetype, DR_STEP (dr)), 3185 length_factor); 3186 } 3187 3188 /* Return a value that, when added to abs (vect_vfa_segment_size (dr)), 3189 gives the worst-case number of bytes covered by the segment. */ 3190 3191 static unsigned HOST_WIDE_INT 3192 vect_vfa_access_size (data_reference *dr) 3193 { 3194 stmt_vec_info stmt_vinfo = vinfo_for_stmt (DR_STMT (dr)); 3195 tree ref_type = TREE_TYPE (DR_REF (dr)); 3196 unsigned HOST_WIDE_INT ref_size = tree_to_uhwi (TYPE_SIZE_UNIT (ref_type)); 3197 unsigned HOST_WIDE_INT access_size = ref_size; 3198 if (GROUP_FIRST_ELEMENT (stmt_vinfo)) 3199 { 3200 gcc_assert (GROUP_FIRST_ELEMENT (stmt_vinfo) == DR_STMT (dr)); 3201 access_size *= GROUP_SIZE (stmt_vinfo) - GROUP_GAP (stmt_vinfo); 3202 } 3203 if (STMT_VINFO_VEC_STMT (stmt_vinfo) 3204 && (vect_supportable_dr_alignment (dr, false) 3205 == dr_explicit_realign_optimized)) 3206 { 3207 /* We might access a full vector's worth. */ 3208 tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo); 3209 access_size += tree_to_uhwi (TYPE_SIZE_UNIT (vectype)) - ref_size; 3210 } 3211 return access_size; 3212 } 3213 3214 /* Get the minimum alignment for all the scalar accesses that DR describes. 
*/

static unsigned int
vect_vfa_align (const data_reference *dr)
{
  return TYPE_ALIGN_UNIT (TREE_TYPE (DR_REF (dr)));
}

/* Function vect_compile_time_alias.

   Given data references A and B with equal base and offset, see whether
   the alias relation can be decided at compilation time.  Return 1 if
   it can and the references alias, 0 if it can and the references do
   not alias, and -1 if we cannot decide at compile time.  SEGMENT_LENGTH_A,
   SEGMENT_LENGTH_B, ACCESS_SIZE_A and ACCESS_SIZE_B are the equivalent
   of dr_with_seg_len::{seg_len,access_size} for A and B.  */

static int
vect_compile_time_alias (struct data_reference *a, struct data_reference *b,
                         tree segment_length_a, tree segment_length_b,
                         unsigned HOST_WIDE_INT access_size_a,
                         unsigned HOST_WIDE_INT access_size_b)
{
  poly_offset_int offset_a = wi::to_poly_offset (DR_INIT (a));
  poly_offset_int offset_b = wi::to_poly_offset (DR_INIT (b));
  poly_uint64 const_length_a;
  poly_uint64 const_length_b;

  /* For a negative step, we need to adjust the address range by
     TYPE_SIZE_UNIT bytes, e.g., for int a[3] -> a[1] the range is
     [a+4, a+16) instead of [a, a+12).  */
  if (tree_int_cst_compare (DR_STEP (a), size_zero_node) < 0)
    {
      const_length_a = (-wi::to_poly_wide (segment_length_a)).force_uhwi ();
      offset_a -= const_length_a;
    }
  else
    const_length_a = tree_to_poly_uint64 (segment_length_a);
  if (tree_int_cst_compare (DR_STEP (b), size_zero_node) < 0)
    {
      const_length_b = (-wi::to_poly_wide (segment_length_b)).force_uhwi ();
      offset_b -= const_length_b;
    }
  else
    const_length_b = tree_to_poly_uint64 (segment_length_b);

  const_length_a += access_size_a;
  const_length_b += access_size_b;

  if (ranges_known_overlap_p (offset_a, const_length_a,
                              offset_b, const_length_b))
    return 1;

  if (!ranges_maybe_overlap_p (offset_a, const_length_a,
                               offset_b, const_length_b))
    return 0;

  return -1;
}

/* Return true if the minimum nonzero dependence distance for loop LOOP_DEPTH
   in DDR is >= VF.  */

static bool
dependence_distance_ge_vf (data_dependence_relation *ddr,
                           unsigned int loop_depth, poly_uint64 vf)
{
  if (DDR_ARE_DEPENDENT (ddr) != NULL_TREE
      || DDR_NUM_DIST_VECTS (ddr) == 0)
    return false;

  /* If the dependence is exact, we should have limited the VF instead.  */
  gcc_checking_assert (DDR_COULD_BE_INDEPENDENT_P (ddr));

  unsigned int i;
  lambda_vector dist_v;
  FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
    {
      HOST_WIDE_INT dist = dist_v[loop_depth];
      if (dist != 0
          && !(dist > 0 && DDR_REVERSED_P (ddr))
          && maybe_lt ((unsigned HOST_WIDE_INT) abs_hwi (dist), vf))
        return false;
    }

  if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_NOTE, vect_location,
                       "dependence distance between ");
      dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_A (ddr)));
      dump_printf (MSG_NOTE, " and ");
      dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_B (ddr)));
      dump_printf (MSG_NOTE, " is >= VF\n");
    }

  return true;
}

/* Dump LOWER_BOUND using flags DUMP_KIND.  Dumps are known to be enabled.  */

static void
dump_lower_bound (int dump_kind, const vec_lower_bound &lower_bound)
{
  dump_printf (dump_kind, "%s (", lower_bound.unsigned_p ?
"unsigned" : "abs"); 3318 dump_generic_expr (dump_kind, TDF_SLIM, lower_bound.expr); 3319 dump_printf (dump_kind, ") >= "); 3320 dump_dec (dump_kind, lower_bound.min_value); 3321 } 3322 3323 /* Record that the vectorized loop requires the vec_lower_bound described 3324 by EXPR, UNSIGNED_P and MIN_VALUE. */ 3325 3326 static void 3327 vect_check_lower_bound (loop_vec_info loop_vinfo, tree expr, bool unsigned_p, 3328 poly_uint64 min_value) 3329 { 3330 vec<vec_lower_bound> lower_bounds = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo); 3331 for (unsigned int i = 0; i < lower_bounds.length (); ++i) 3332 if (operand_equal_p (lower_bounds[i].expr, expr, 0)) 3333 { 3334 unsigned_p &= lower_bounds[i].unsigned_p; 3335 min_value = upper_bound (lower_bounds[i].min_value, min_value); 3336 if (lower_bounds[i].unsigned_p != unsigned_p 3337 || maybe_lt (lower_bounds[i].min_value, min_value)) 3338 { 3339 lower_bounds[i].unsigned_p = unsigned_p; 3340 lower_bounds[i].min_value = min_value; 3341 if (dump_enabled_p ()) 3342 { 3343 dump_printf_loc (MSG_NOTE, vect_location, 3344 "updating run-time check to "); 3345 dump_lower_bound (MSG_NOTE, lower_bounds[i]); 3346 dump_printf (MSG_NOTE, "\n"); 3347 } 3348 } 3349 return; 3350 } 3351 3352 vec_lower_bound lower_bound (expr, unsigned_p, min_value); 3353 if (dump_enabled_p ()) 3354 { 3355 dump_printf_loc (MSG_NOTE, vect_location, "need a run-time check that "); 3356 dump_lower_bound (MSG_NOTE, lower_bound); 3357 dump_printf (MSG_NOTE, "\n"); 3358 } 3359 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).safe_push (lower_bound); 3360 } 3361 3362 /* Return true if it's unlikely that the step of the vectorized form of DR 3363 will span fewer than GAP bytes. */ 3364 3365 static bool 3366 vect_small_gap_p (loop_vec_info loop_vinfo, data_reference *dr, poly_int64 gap) 3367 { 3368 stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr)); 3369 HOST_WIDE_INT count 3370 = estimated_poly_value (LOOP_VINFO_VECT_FACTOR (loop_vinfo)); 3371 if (GROUP_FIRST_ELEMENT (stmt_info)) 3372 count *= GROUP_SIZE (vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info))); 3373 return estimated_poly_value (gap) <= count * vect_get_scalar_dr_size (dr); 3374 } 3375 3376 /* Return true if we know that there is no alias between DR_A and DR_B 3377 when abs (DR_STEP (DR_A)) >= N for some N. When returning true, set 3378 *LOWER_BOUND_OUT to this N. */ 3379 3380 static bool 3381 vectorizable_with_step_bound_p (data_reference *dr_a, data_reference *dr_b, 3382 poly_uint64 *lower_bound_out) 3383 { 3384 /* Check that there is a constant gap of known sign between DR_A 3385 and DR_B. */ 3386 poly_int64 init_a, init_b; 3387 if (!operand_equal_p (DR_BASE_ADDRESS (dr_a), DR_BASE_ADDRESS (dr_b), 0) 3388 || !operand_equal_p (DR_OFFSET (dr_a), DR_OFFSET (dr_b), 0) 3389 || !operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0) 3390 || !poly_int_tree_p (DR_INIT (dr_a), &init_a) 3391 || !poly_int_tree_p (DR_INIT (dr_b), &init_b) 3392 || !ordered_p (init_a, init_b)) 3393 return false; 3394 3395 /* Sort DR_A and DR_B by the address they access. */ 3396 if (maybe_lt (init_b, init_a)) 3397 { 3398 std::swap (init_a, init_b); 3399 std::swap (dr_a, dr_b); 3400 } 3401 3402 /* If the two accesses could be dependent within a scalar iteration, 3403 make sure that we'd retain their order. 
*/
  if (maybe_gt (init_a + vect_get_scalar_dr_size (dr_a), init_b)
      && !vect_preserves_scalar_order_p (DR_STMT (dr_a), DR_STMT (dr_b)))
    return false;

  /* There is no alias if abs (DR_STEP) is greater than or equal to
     the bytes spanned by the combination of the two accesses.  */
  *lower_bound_out = init_b + vect_get_scalar_dr_size (dr_b) - init_a;
  return true;
}

/* Function vect_prune_runtime_alias_test_list.

   Prune a list of ddrs to be tested at run-time by versioning for alias.
   Merge several alias checks into one if possible.
   Return FALSE if the resulting list of ddrs is longer than allowed by
   PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE.  */

bool
vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
{
  typedef pair_hash <tree_operand_hash, tree_operand_hash> tree_pair_hash;
  hash_set <tree_pair_hash> compared_objects;

  vec<ddr_p> may_alias_ddrs = LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
  vec<dr_with_seg_len_pair_t> &comp_alias_ddrs
    = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
  vec<vec_object_pair> &check_unequal_addrs
    = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo);
  poly_uint64 vect_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
  tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);

  ddr_p ddr;
  unsigned int i;
  tree length_factor;

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
                     "=== vect_prune_runtime_alias_test_list ===\n");

  /* Step values are irrelevant for aliasing if the number of vector
     iterations is equal to the number of scalar iterations (which can
     happen for fully-SLP loops).  */
  bool ignore_step_p = known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U);

  if (!ignore_step_p)
    {
      /* Convert the checks for nonzero steps into bound tests.  */
      tree value;
      FOR_EACH_VEC_ELT (LOOP_VINFO_CHECK_NONZERO (loop_vinfo), i, value)
        vect_check_lower_bound (loop_vinfo, value, true, 1);
    }

  if (may_alias_ddrs.is_empty ())
    return true;

  comp_alias_ddrs.create (may_alias_ddrs.length ());

  unsigned int loop_depth
    = index_in_loop_nest (LOOP_VINFO_LOOP (loop_vinfo)->num,
                          LOOP_VINFO_LOOP_NEST (loop_vinfo));

  /* First, we collect all data ref pairs for aliasing checks.  */
  FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr)
    {
      int comp_res;
      poly_uint64 lower_bound;
      struct data_reference *dr_a, *dr_b;
      gimple *dr_group_first_a, *dr_group_first_b;
      tree segment_length_a, segment_length_b;
      unsigned HOST_WIDE_INT access_size_a, access_size_b;
      unsigned int align_a, align_b;
      gimple *stmt_a, *stmt_b;

      /* Ignore the alias if the VF we chose ended up being no greater
         than the dependence distance.
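         For example, if the only dependence between the two references has
         a known distance of 8 iterations and the chosen VF is 4, each
         vector iteration stays within that distance and no run-time alias
         check is needed for this pair.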
*/ 3479 if (dependence_distance_ge_vf (ddr, loop_depth, vect_factor)) 3480 continue; 3481 3482 if (DDR_OBJECT_A (ddr)) 3483 { 3484 vec_object_pair new_pair (DDR_OBJECT_A (ddr), DDR_OBJECT_B (ddr)); 3485 if (!compared_objects.add (new_pair)) 3486 { 3487 if (dump_enabled_p ()) 3488 { 3489 dump_printf_loc (MSG_NOTE, vect_location, "checking that "); 3490 dump_generic_expr (MSG_NOTE, TDF_SLIM, new_pair.first); 3491 dump_printf (MSG_NOTE, " and "); 3492 dump_generic_expr (MSG_NOTE, TDF_SLIM, new_pair.second); 3493 dump_printf (MSG_NOTE, " have different addresses\n"); 3494 } 3495 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).safe_push (new_pair); 3496 } 3497 continue; 3498 } 3499 3500 dr_a = DDR_A (ddr); 3501 stmt_a = DR_STMT (DDR_A (ddr)); 3502 3503 dr_b = DDR_B (ddr); 3504 stmt_b = DR_STMT (DDR_B (ddr)); 3505 3506 /* Skip the pair if inter-iteration dependencies are irrelevant 3507 and intra-iteration dependencies are guaranteed to be honored. */ 3508 if (ignore_step_p 3509 && (vect_preserves_scalar_order_p (stmt_a, stmt_b) 3510 || vectorizable_with_step_bound_p (dr_a, dr_b, &lower_bound))) 3511 { 3512 if (dump_enabled_p ()) 3513 { 3514 dump_printf_loc (MSG_NOTE, vect_location, 3515 "no need for alias check between "); 3516 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a)); 3517 dump_printf (MSG_NOTE, " and "); 3518 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b)); 3519 dump_printf (MSG_NOTE, " when VF is 1\n"); 3520 } 3521 continue; 3522 } 3523 3524 /* See whether we can handle the alias using a bounds check on 3525 the step, and whether that's likely to be the best approach. 3526 (It might not be, for example, if the minimum step is much larger 3527 than the number of bytes handled by one vector iteration.) */ 3528 if (!ignore_step_p 3529 && TREE_CODE (DR_STEP (dr_a)) != INTEGER_CST 3530 && vectorizable_with_step_bound_p (dr_a, dr_b, &lower_bound) 3531 && (vect_small_gap_p (loop_vinfo, dr_a, lower_bound) 3532 || vect_small_gap_p (loop_vinfo, dr_b, lower_bound))) 3533 { 3534 bool unsigned_p = dr_known_forward_stride_p (dr_a); 3535 if (dump_enabled_p ()) 3536 { 3537 dump_printf_loc (MSG_NOTE, vect_location, "no alias between "); 3538 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a)); 3539 dump_printf (MSG_NOTE, " and "); 3540 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b)); 3541 dump_printf (MSG_NOTE, " when the step "); 3542 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_STEP (dr_a)); 3543 dump_printf (MSG_NOTE, " is outside "); 3544 if (unsigned_p) 3545 dump_printf (MSG_NOTE, "[0"); 3546 else 3547 { 3548 dump_printf (MSG_NOTE, "("); 3549 dump_dec (MSG_NOTE, poly_int64 (-lower_bound)); 3550 } 3551 dump_printf (MSG_NOTE, ", "); 3552 dump_dec (MSG_NOTE, lower_bound); 3553 dump_printf (MSG_NOTE, ")\n"); 3554 } 3555 vect_check_lower_bound (loop_vinfo, DR_STEP (dr_a), unsigned_p, 3556 lower_bound); 3557 continue; 3558 } 3559 3560 dr_group_first_a = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_a)); 3561 if (dr_group_first_a) 3562 { 3563 stmt_a = dr_group_first_a; 3564 dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_a)); 3565 } 3566 3567 dr_group_first_b = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_b)); 3568 if (dr_group_first_b) 3569 { 3570 stmt_b = dr_group_first_b; 3571 dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_b)); 3572 } 3573 3574 if (ignore_step_p) 3575 { 3576 segment_length_a = size_zero_node; 3577 segment_length_b = size_zero_node; 3578 } 3579 else 3580 { 3581 if (!operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0)) 3582 length_factor = scalar_loop_iters; 3583 else 3584 
length_factor = size_int (vect_factor); 3585 segment_length_a = vect_vfa_segment_size (dr_a, length_factor); 3586 segment_length_b = vect_vfa_segment_size (dr_b, length_factor); 3587 } 3588 access_size_a = vect_vfa_access_size (dr_a); 3589 access_size_b = vect_vfa_access_size (dr_b); 3590 align_a = vect_vfa_align (dr_a); 3591 align_b = vect_vfa_align (dr_b); 3592 3593 comp_res = data_ref_compare_tree (DR_BASE_ADDRESS (dr_a), 3594 DR_BASE_ADDRESS (dr_b)); 3595 if (comp_res == 0) 3596 comp_res = data_ref_compare_tree (DR_OFFSET (dr_a), 3597 DR_OFFSET (dr_b)); 3598 3599 /* See whether the alias is known at compilation time. */ 3600 if (comp_res == 0 3601 && TREE_CODE (DR_STEP (dr_a)) == INTEGER_CST 3602 && TREE_CODE (DR_STEP (dr_b)) == INTEGER_CST 3603 && poly_int_tree_p (segment_length_a) 3604 && poly_int_tree_p (segment_length_b)) 3605 { 3606 int res = vect_compile_time_alias (dr_a, dr_b, 3607 segment_length_a, 3608 segment_length_b, 3609 access_size_a, 3610 access_size_b); 3611 if (res >= 0 && dump_enabled_p ()) 3612 { 3613 dump_printf_loc (MSG_NOTE, vect_location, 3614 "can tell at compile time that "); 3615 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a)); 3616 dump_printf (MSG_NOTE, " and "); 3617 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b)); 3618 if (res == 0) 3619 dump_printf (MSG_NOTE, " do not alias\n"); 3620 else 3621 dump_printf (MSG_NOTE, " alias\n"); 3622 } 3623 3624 if (res == 0) 3625 continue; 3626 3627 if (res == 1) 3628 { 3629 if (dump_enabled_p ()) 3630 dump_printf_loc (MSG_NOTE, vect_location, 3631 "not vectorized: compilation time alias.\n"); 3632 return false; 3633 } 3634 } 3635 3636 dr_with_seg_len_pair_t dr_with_seg_len_pair 3637 (dr_with_seg_len (dr_a, segment_length_a, access_size_a, align_a), 3638 dr_with_seg_len (dr_b, segment_length_b, access_size_b, align_b)); 3639 3640 /* Canonicalize pairs by sorting the two DR members. */ 3641 if (comp_res > 0) 3642 std::swap (dr_with_seg_len_pair.first, dr_with_seg_len_pair.second); 3643 3644 comp_alias_ddrs.safe_push (dr_with_seg_len_pair); 3645 } 3646 3647 prune_runtime_alias_test_list (&comp_alias_ddrs, vect_factor); 3648 3649 unsigned int count = (comp_alias_ddrs.length () 3650 + check_unequal_addrs.length ()); 3651 3652 dump_printf_loc (MSG_NOTE, vect_location, 3653 "improved number of alias checks from %d to %d\n", 3654 may_alias_ddrs.length (), count); 3655 if ((int) count > PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS)) 3656 { 3657 if (dump_enabled_p ()) 3658 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 3659 "number of versioning for alias " 3660 "run-time tests exceeds %d " 3661 "(--param vect-max-version-for-alias-checks)\n", 3662 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS)); 3663 return false; 3664 } 3665 3666 return true; 3667 } 3668 3669 /* Check whether we can use an internal function for a gather load 3670 or scatter store. READ_P is true for loads and false for stores. 3671 MASKED_P is true if the load or store is conditional. MEMORY_TYPE is 3672 the type of the memory elements being loaded or stored. OFFSET_BITS 3673 is the number of bits in each scalar offset and OFFSET_SIGN is the 3674 sign of the offset. SCALE is the amount by which the offset should 3675 be multiplied *after* it has been converted to address width. 3676 3677 Return true if the function is supported, storing the function 3678 id in *IFN_OUT and the type of a vector element in *ELEMENT_TYPE_OUT. 
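   As a purely illustrative usage sketch (v4si_type, int_type, ifn and elt
   are stand-ins, and whether the query succeeds depends on the target): a
   masked gather of "int" elements into a V4SI vector, using 32-bit signed
   offsets scaled by 4, would be tested with

     vect_gather_scatter_fn_p (true, true, v4si_type, int_type,
                               32, SIGNED, 4, &ifn, &elt);

   and on success leaves ifn == IFN_MASK_GATHER_LOAD and elt == the vector
   element type (here "int").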
*/ 3679 3680 bool 3681 vect_gather_scatter_fn_p (bool read_p, bool masked_p, tree vectype, 3682 tree memory_type, unsigned int offset_bits, 3683 signop offset_sign, int scale, 3684 internal_fn *ifn_out, tree *element_type_out) 3685 { 3686 unsigned int memory_bits = tree_to_uhwi (TYPE_SIZE (memory_type)); 3687 unsigned int element_bits = tree_to_uhwi (TYPE_SIZE (TREE_TYPE (vectype))); 3688 if (offset_bits > element_bits) 3689 /* Internal functions require the offset to be the same width as 3690 the vector elements. We can extend narrower offsets, but it isn't 3691 safe to truncate wider offsets. */ 3692 return false; 3693 3694 if (element_bits != memory_bits) 3695 /* For now the vector elements must be the same width as the 3696 memory elements. */ 3697 return false; 3698 3699 /* Work out which function we need. */ 3700 internal_fn ifn; 3701 if (read_p) 3702 ifn = masked_p ? IFN_MASK_GATHER_LOAD : IFN_GATHER_LOAD; 3703 else 3704 ifn = masked_p ? IFN_MASK_SCATTER_STORE : IFN_SCATTER_STORE; 3705 3706 /* Test whether the target supports this combination. */ 3707 if (!internal_gather_scatter_fn_supported_p (ifn, vectype, memory_type, 3708 offset_sign, scale)) 3709 return false; 3710 3711 *ifn_out = ifn; 3712 *element_type_out = TREE_TYPE (vectype); 3713 return true; 3714 } 3715 3716 /* CALL is a call to an internal gather load or scatter store function. 3717 Describe the operation in INFO. */ 3718 3719 static void 3720 vect_describe_gather_scatter_call (gcall *call, gather_scatter_info *info) 3721 { 3722 stmt_vec_info stmt_info = vinfo_for_stmt (call); 3723 tree vectype = STMT_VINFO_VECTYPE (stmt_info); 3724 data_reference *dr = STMT_VINFO_DATA_REF (stmt_info); 3725 3726 info->ifn = gimple_call_internal_fn (call); 3727 info->decl = NULL_TREE; 3728 info->base = gimple_call_arg (call, 0); 3729 info->offset = gimple_call_arg (call, 1); 3730 info->offset_dt = vect_unknown_def_type; 3731 info->offset_vectype = NULL_TREE; 3732 info->scale = TREE_INT_CST_LOW (gimple_call_arg (call, 2)); 3733 info->element_type = TREE_TYPE (vectype); 3734 info->memory_type = TREE_TYPE (DR_REF (dr)); 3735 } 3736 3737 /* Return true if a non-affine read or write in STMT is suitable for a 3738 gather load or scatter store. Describe the operation in *INFO if so. */ 3739 3740 bool 3741 vect_check_gather_scatter (gimple *stmt, loop_vec_info loop_vinfo, 3742 gather_scatter_info *info) 3743 { 3744 HOST_WIDE_INT scale = 1; 3745 poly_int64 pbitpos, pbitsize; 3746 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); 3747 stmt_vec_info stmt_info = vinfo_for_stmt (stmt); 3748 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info); 3749 tree offtype = NULL_TREE; 3750 tree decl = NULL_TREE, base, off; 3751 tree vectype = STMT_VINFO_VECTYPE (stmt_info); 3752 tree memory_type = TREE_TYPE (DR_REF (dr)); 3753 machine_mode pmode; 3754 int punsignedp, reversep, pvolatilep = 0; 3755 internal_fn ifn; 3756 tree element_type; 3757 bool masked_p = false; 3758 3759 /* See whether this is already a call to a gather/scatter internal function. 3760 If not, see whether it's a masked load or store. */ 3761 gcall *call = dyn_cast <gcall *> (stmt); 3762 if (call && gimple_call_internal_p (call)) 3763 { 3764 ifn = gimple_call_internal_fn (stmt); 3765 if (internal_gather_scatter_fn_p (ifn)) 3766 { 3767 vect_describe_gather_scatter_call (call, info); 3768 return true; 3769 } 3770 masked_p = (ifn == IFN_MASK_LOAD || ifn == IFN_MASK_STORE); 3771 } 3772 3773 /* True if we should aim to use internal functions rather than 3774 built-in functions. 
*/ 3775 bool use_ifn_p = (DR_IS_READ (dr) 3776 ? supports_vec_gather_load_p () 3777 : supports_vec_scatter_store_p ()); 3778 3779 base = DR_REF (dr); 3780 /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF, 3781 see if we can use the def stmt of the address. */ 3782 if (masked_p 3783 && TREE_CODE (base) == MEM_REF 3784 && TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME 3785 && integer_zerop (TREE_OPERAND (base, 1)) 3786 && !expr_invariant_in_loop_p (loop, TREE_OPERAND (base, 0))) 3787 { 3788 gimple *def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (base, 0)); 3789 if (is_gimple_assign (def_stmt) 3790 && gimple_assign_rhs_code (def_stmt) == ADDR_EXPR) 3791 base = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0); 3792 } 3793 3794 /* The gather and scatter builtins need address of the form 3795 loop_invariant + vector * {1, 2, 4, 8} 3796 or 3797 loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }. 3798 Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture 3799 of loop invariants/SSA_NAMEs defined in the loop, with casts, 3800 multiplications and additions in it. To get a vector, we need 3801 a single SSA_NAME that will be defined in the loop and will 3802 contain everything that is not loop invariant and that can be 3803 vectorized. The following code attempts to find such a preexistng 3804 SSA_NAME OFF and put the loop invariants into a tree BASE 3805 that can be gimplified before the loop. */ 3806 base = get_inner_reference (base, &pbitsize, &pbitpos, &off, &pmode, 3807 &punsignedp, &reversep, &pvolatilep); 3808 gcc_assert (base && !reversep); 3809 poly_int64 pbytepos = exact_div (pbitpos, BITS_PER_UNIT); 3810 3811 if (TREE_CODE (base) == MEM_REF) 3812 { 3813 if (!integer_zerop (TREE_OPERAND (base, 1))) 3814 { 3815 if (off == NULL_TREE) 3816 off = wide_int_to_tree (sizetype, mem_ref_offset (base)); 3817 else 3818 off = size_binop (PLUS_EXPR, off, 3819 fold_convert (sizetype, TREE_OPERAND (base, 1))); 3820 } 3821 base = TREE_OPERAND (base, 0); 3822 } 3823 else 3824 base = build_fold_addr_expr (base); 3825 3826 if (off == NULL_TREE) 3827 off = size_zero_node; 3828 3829 /* If base is not loop invariant, either off is 0, then we start with just 3830 the constant offset in the loop invariant BASE and continue with base 3831 as OFF, otherwise give up. 3832 We could handle that case by gimplifying the addition of base + off 3833 into some SSA_NAME and use that as off, but for now punt. */ 3834 if (!expr_invariant_in_loop_p (loop, base)) 3835 { 3836 if (!integer_zerop (off)) 3837 return false; 3838 off = base; 3839 base = size_int (pbytepos); 3840 } 3841 /* Otherwise put base + constant offset into the loop invariant BASE 3842 and continue with OFF. */ 3843 else 3844 { 3845 base = fold_convert (sizetype, base); 3846 base = size_binop (PLUS_EXPR, base, size_int (pbytepos)); 3847 } 3848 3849 /* OFF at this point may be either a SSA_NAME or some tree expression 3850 from get_inner_reference. Try to peel off loop invariants from it 3851 into BASE as long as possible. 
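   As a purely illustrative sketch (the SSA name _7 and the offsets are
   invented for the example): if the reference decomposes at this point into

     base:  &data + 16
     off:   (sizetype) _7 * 4 + 64,   with _7 defined inside the loop,

   then the peeling loop below is expected to end up with OFF == _7,
   SCALE == 4 and BASE == &data + 16 + 64, i.e. only the loop-variant
   SSA_NAME is left in OFF while the invariant parts migrate into BASE
   and SCALE.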
*/ 3852 STRIP_NOPS (off); 3853 while (offtype == NULL_TREE) 3854 { 3855 enum tree_code code; 3856 tree op0, op1, add = NULL_TREE; 3857 3858 if (TREE_CODE (off) == SSA_NAME) 3859 { 3860 gimple *def_stmt = SSA_NAME_DEF_STMT (off); 3861 3862 if (expr_invariant_in_loop_p (loop, off)) 3863 return false; 3864 3865 if (gimple_code (def_stmt) != GIMPLE_ASSIGN) 3866 break; 3867 3868 op0 = gimple_assign_rhs1 (def_stmt); 3869 code = gimple_assign_rhs_code (def_stmt); 3870 op1 = gimple_assign_rhs2 (def_stmt); 3871 } 3872 else 3873 { 3874 if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS) 3875 return false; 3876 code = TREE_CODE (off); 3877 extract_ops_from_tree (off, &code, &op0, &op1); 3878 } 3879 switch (code) 3880 { 3881 case POINTER_PLUS_EXPR: 3882 case PLUS_EXPR: 3883 if (expr_invariant_in_loop_p (loop, op0)) 3884 { 3885 add = op0; 3886 off = op1; 3887 do_add: 3888 add = fold_convert (sizetype, add); 3889 if (scale != 1) 3890 add = size_binop (MULT_EXPR, add, size_int (scale)); 3891 base = size_binop (PLUS_EXPR, base, add); 3892 continue; 3893 } 3894 if (expr_invariant_in_loop_p (loop, op1)) 3895 { 3896 add = op1; 3897 off = op0; 3898 goto do_add; 3899 } 3900 break; 3901 case MINUS_EXPR: 3902 if (expr_invariant_in_loop_p (loop, op1)) 3903 { 3904 add = fold_convert (sizetype, op1); 3905 add = size_binop (MINUS_EXPR, size_zero_node, add); 3906 off = op0; 3907 goto do_add; 3908 } 3909 break; 3910 case MULT_EXPR: 3911 if (scale == 1 && tree_fits_shwi_p (op1)) 3912 { 3913 int new_scale = tree_to_shwi (op1); 3914 /* Only treat this as a scaling operation if the target 3915 supports it. */ 3916 if (use_ifn_p 3917 && !vect_gather_scatter_fn_p (DR_IS_READ (dr), masked_p, 3918 vectype, memory_type, 1, 3919 TYPE_SIGN (TREE_TYPE (op0)), 3920 new_scale, &ifn, 3921 &element_type)) 3922 break; 3923 scale = new_scale; 3924 off = op0; 3925 continue; 3926 } 3927 break; 3928 case SSA_NAME: 3929 off = op0; 3930 continue; 3931 CASE_CONVERT: 3932 if (!POINTER_TYPE_P (TREE_TYPE (op0)) 3933 && !INTEGRAL_TYPE_P (TREE_TYPE (op0))) 3934 break; 3935 if (TYPE_PRECISION (TREE_TYPE (op0)) 3936 == TYPE_PRECISION (TREE_TYPE (off))) 3937 { 3938 off = op0; 3939 continue; 3940 } 3941 3942 /* The internal functions need the offset to be the same width 3943 as the elements of VECTYPE. Don't include operations that 3944 cast the offset from that width to a different width. */ 3945 if (use_ifn_p 3946 && (int_size_in_bytes (TREE_TYPE (vectype)) 3947 == int_size_in_bytes (TREE_TYPE (off)))) 3948 break; 3949 3950 if (TYPE_PRECISION (TREE_TYPE (op0)) 3951 < TYPE_PRECISION (TREE_TYPE (off))) 3952 { 3953 off = op0; 3954 offtype = TREE_TYPE (off); 3955 STRIP_NOPS (off); 3956 continue; 3957 } 3958 break; 3959 default: 3960 break; 3961 } 3962 break; 3963 } 3964 3965 /* If at the end OFF still isn't a SSA_NAME or isn't 3966 defined in the loop, punt. 
*/ 3967 if (TREE_CODE (off) != SSA_NAME 3968 || expr_invariant_in_loop_p (loop, off)) 3969 return false; 3970 3971 if (offtype == NULL_TREE) 3972 offtype = TREE_TYPE (off); 3973 3974 if (use_ifn_p) 3975 { 3976 if (!vect_gather_scatter_fn_p (DR_IS_READ (dr), masked_p, vectype, 3977 memory_type, TYPE_PRECISION (offtype), 3978 TYPE_SIGN (offtype), scale, &ifn, 3979 &element_type)) 3980 return false; 3981 } 3982 else 3983 { 3984 if (DR_IS_READ (dr)) 3985 { 3986 if (targetm.vectorize.builtin_gather) 3987 decl = targetm.vectorize.builtin_gather (vectype, offtype, scale); 3988 } 3989 else 3990 { 3991 if (targetm.vectorize.builtin_scatter) 3992 decl = targetm.vectorize.builtin_scatter (vectype, offtype, scale); 3993 } 3994 3995 if (!decl) 3996 return false; 3997 3998 ifn = IFN_LAST; 3999 element_type = TREE_TYPE (vectype); 4000 } 4001 4002 info->ifn = ifn; 4003 info->decl = decl; 4004 info->base = base; 4005 info->offset = off; 4006 info->offset_dt = vect_unknown_def_type; 4007 info->offset_vectype = NULL_TREE; 4008 info->scale = scale; 4009 info->element_type = element_type; 4010 info->memory_type = memory_type; 4011 return true; 4012 } 4013 4014 /* Function vect_analyze_data_refs. 4015 4016 Find all the data references in the loop or basic block. 4017 4018 The general structure of the analysis of data refs in the vectorizer is as 4019 follows: 4020 1- vect_analyze_data_refs(loop/bb): call 4021 compute_data_dependences_for_loop/bb to find and analyze all data-refs 4022 in the loop/bb and their dependences. 4023 2- vect_analyze_dependences(): apply dependence testing using ddrs. 4024 3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok. 4025 4- vect_analyze_drs_access(): check that ref_stmt.step is ok. 4026 4027 */ 4028 4029 bool 4030 vect_analyze_data_refs (vec_info *vinfo, poly_uint64 *min_vf) 4031 { 4032 struct loop *loop = NULL; 4033 unsigned int i; 4034 struct data_reference *dr; 4035 tree scalar_type; 4036 4037 if (dump_enabled_p ()) 4038 dump_printf_loc (MSG_NOTE, vect_location, 4039 "=== vect_analyze_data_refs ===\n"); 4040 4041 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo)) 4042 loop = LOOP_VINFO_LOOP (loop_vinfo); 4043 4044 /* Go through the data-refs, check that the analysis succeeded. Update 4045 pointer from stmt_vec_info struct to DR and vectype. */ 4046 4047 vec<data_reference_p> datarefs = vinfo->datarefs; 4048 FOR_EACH_VEC_ELT (datarefs, i, dr) 4049 { 4050 gimple *stmt; 4051 stmt_vec_info stmt_info; 4052 tree base, offset, init; 4053 enum { SG_NONE, GATHER, SCATTER } gatherscatter = SG_NONE; 4054 bool simd_lane_access = false; 4055 poly_uint64 vf; 4056 4057 again: 4058 if (!dr || !DR_REF (dr)) 4059 { 4060 if (dump_enabled_p ()) 4061 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 4062 "not vectorized: unhandled data-ref\n"); 4063 return false; 4064 } 4065 4066 stmt = DR_STMT (dr); 4067 stmt_info = vinfo_for_stmt (stmt); 4068 4069 /* Discard clobbers from the dataref vector. We will remove 4070 clobber stmts during vectorization. */ 4071 if (gimple_clobber_p (stmt)) 4072 { 4073 free_data_ref (dr); 4074 if (i == datarefs.length () - 1) 4075 { 4076 datarefs.pop (); 4077 break; 4078 } 4079 datarefs.ordered_remove (i); 4080 dr = datarefs[i]; 4081 goto again; 4082 } 4083 4084 /* Check that analysis of the data-ref succeeded. 
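       For a non-affine access such as a[b[i]], for instance,
       DR_BASE_ADDRESS/DR_OFFSET/DR_STEP cannot be computed, and the code
       below then tries to handle the reference as a gather load, a scatter
       store or a SIMD lane access instead.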
*/ 4085 if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr) 4086 || !DR_STEP (dr)) 4087 { 4088 bool maybe_gather 4089 = DR_IS_READ (dr) 4090 && !TREE_THIS_VOLATILE (DR_REF (dr)) 4091 && (targetm.vectorize.builtin_gather != NULL 4092 || supports_vec_gather_load_p ()); 4093 bool maybe_scatter 4094 = DR_IS_WRITE (dr) 4095 && !TREE_THIS_VOLATILE (DR_REF (dr)) 4096 && (targetm.vectorize.builtin_scatter != NULL 4097 || supports_vec_scatter_store_p ()); 4098 bool maybe_simd_lane_access 4099 = is_a <loop_vec_info> (vinfo) && loop->simduid; 4100 4101 /* If target supports vector gather loads or scatter stores, or if 4102 this might be a SIMD lane access, see if they can't be used. */ 4103 if (is_a <loop_vec_info> (vinfo) 4104 && (maybe_gather || maybe_scatter || maybe_simd_lane_access) 4105 && !nested_in_vect_loop_p (loop, stmt)) 4106 { 4107 struct data_reference *newdr 4108 = create_data_ref (NULL, loop_containing_stmt (stmt), 4109 DR_REF (dr), stmt, !maybe_scatter, 4110 DR_IS_CONDITIONAL_IN_STMT (dr)); 4111 gcc_assert (newdr != NULL && DR_REF (newdr)); 4112 if (DR_BASE_ADDRESS (newdr) 4113 && DR_OFFSET (newdr) 4114 && DR_INIT (newdr) 4115 && DR_STEP (newdr) 4116 && integer_zerop (DR_STEP (newdr))) 4117 { 4118 if (maybe_simd_lane_access) 4119 { 4120 tree off = DR_OFFSET (newdr); 4121 STRIP_NOPS (off); 4122 if (TREE_CODE (DR_INIT (newdr)) == INTEGER_CST 4123 && TREE_CODE (off) == MULT_EXPR 4124 && tree_fits_uhwi_p (TREE_OPERAND (off, 1))) 4125 { 4126 tree step = TREE_OPERAND (off, 1); 4127 off = TREE_OPERAND (off, 0); 4128 STRIP_NOPS (off); 4129 if (CONVERT_EXPR_P (off) 4130 && TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off, 4131 0))) 4132 < TYPE_PRECISION (TREE_TYPE (off))) 4133 off = TREE_OPERAND (off, 0); 4134 if (TREE_CODE (off) == SSA_NAME) 4135 { 4136 gimple *def = SSA_NAME_DEF_STMT (off); 4137 tree reft = TREE_TYPE (DR_REF (newdr)); 4138 if (is_gimple_call (def) 4139 && gimple_call_internal_p (def) 4140 && (gimple_call_internal_fn (def) 4141 == IFN_GOMP_SIMD_LANE)) 4142 { 4143 tree arg = gimple_call_arg (def, 0); 4144 gcc_assert (TREE_CODE (arg) == SSA_NAME); 4145 arg = SSA_NAME_VAR (arg); 4146 if (arg == loop->simduid 4147 /* For now. 
*/ 4148 && tree_int_cst_equal 4149 (TYPE_SIZE_UNIT (reft), 4150 step)) 4151 { 4152 DR_OFFSET (newdr) = ssize_int (0); 4153 DR_STEP (newdr) = step; 4154 DR_OFFSET_ALIGNMENT (newdr) 4155 = BIGGEST_ALIGNMENT; 4156 DR_STEP_ALIGNMENT (newdr) 4157 = highest_pow2_factor (step); 4158 dr = newdr; 4159 simd_lane_access = true; 4160 } 4161 } 4162 } 4163 } 4164 } 4165 if (!simd_lane_access && (maybe_gather || maybe_scatter)) 4166 { 4167 dr = newdr; 4168 if (maybe_gather) 4169 gatherscatter = GATHER; 4170 else 4171 gatherscatter = SCATTER; 4172 } 4173 } 4174 if (gatherscatter == SG_NONE && !simd_lane_access) 4175 free_data_ref (newdr); 4176 } 4177 4178 if (gatherscatter == SG_NONE && !simd_lane_access) 4179 { 4180 if (dump_enabled_p ()) 4181 { 4182 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 4183 "not vectorized: data ref analysis " 4184 "failed "); 4185 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0); 4186 } 4187 4188 if (is_a <bb_vec_info> (vinfo)) 4189 break; 4190 4191 return false; 4192 } 4193 } 4194 4195 if (TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST) 4196 { 4197 if (dump_enabled_p ()) 4198 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 4199 "not vectorized: base addr of dr is a " 4200 "constant\n"); 4201 4202 if (is_a <bb_vec_info> (vinfo)) 4203 break; 4204 4205 if (gatherscatter != SG_NONE || simd_lane_access) 4206 free_data_ref (dr); 4207 return false; 4208 } 4209 4210 if (TREE_THIS_VOLATILE (DR_REF (dr))) 4211 { 4212 if (dump_enabled_p ()) 4213 { 4214 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 4215 "not vectorized: volatile type "); 4216 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0); 4217 } 4218 4219 if (is_a <bb_vec_info> (vinfo)) 4220 break; 4221 4222 return false; 4223 } 4224 4225 if (stmt_can_throw_internal (stmt)) 4226 { 4227 if (dump_enabled_p ()) 4228 { 4229 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 4230 "not vectorized: statement can throw an " 4231 "exception "); 4232 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0); 4233 } 4234 4235 if (is_a <bb_vec_info> (vinfo)) 4236 break; 4237 4238 if (gatherscatter != SG_NONE || simd_lane_access) 4239 free_data_ref (dr); 4240 return false; 4241 } 4242 4243 if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF 4244 && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1))) 4245 { 4246 if (dump_enabled_p ()) 4247 { 4248 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 4249 "not vectorized: statement is bitfield " 4250 "access "); 4251 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0); 4252 } 4253 4254 if (is_a <bb_vec_info> (vinfo)) 4255 break; 4256 4257 if (gatherscatter != SG_NONE || simd_lane_access) 4258 free_data_ref (dr); 4259 return false; 4260 } 4261 4262 base = unshare_expr (DR_BASE_ADDRESS (dr)); 4263 offset = unshare_expr (DR_OFFSET (dr)); 4264 init = unshare_expr (DR_INIT (dr)); 4265 4266 if (is_gimple_call (stmt) 4267 && (!gimple_call_internal_p (stmt) 4268 || (gimple_call_internal_fn (stmt) != IFN_MASK_LOAD 4269 && gimple_call_internal_fn (stmt) != IFN_MASK_STORE))) 4270 { 4271 if (dump_enabled_p ()) 4272 { 4273 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 4274 "not vectorized: dr in a call "); 4275 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0); 4276 } 4277 4278 if (is_a <bb_vec_info> (vinfo)) 4279 break; 4280 4281 if (gatherscatter != SG_NONE || simd_lane_access) 4282 free_data_ref (dr); 4283 return false; 4284 } 4285 4286 /* Update DR field in stmt_vec_info struct. 
*/ 4287 4288 /* If the dataref is in an inner-loop of the loop that is considered for 4289 for vectorization, we also want to analyze the access relative to 4290 the outer-loop (DR contains information only relative to the 4291 inner-most enclosing loop). We do that by building a reference to the 4292 first location accessed by the inner-loop, and analyze it relative to 4293 the outer-loop. */ 4294 if (loop && nested_in_vect_loop_p (loop, stmt)) 4295 { 4296 /* Build a reference to the first location accessed by the 4297 inner loop: *(BASE + INIT + OFFSET). By construction, 4298 this address must be invariant in the inner loop, so we 4299 can consider it as being used in the outer loop. */ 4300 tree init_offset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset), 4301 init, offset); 4302 tree init_addr = fold_build_pointer_plus (base, init_offset); 4303 tree init_ref = build_fold_indirect_ref (init_addr); 4304 4305 if (dump_enabled_p ()) 4306 { 4307 dump_printf_loc (MSG_NOTE, vect_location, 4308 "analyze in outer loop: "); 4309 dump_generic_expr (MSG_NOTE, TDF_SLIM, init_ref); 4310 dump_printf (MSG_NOTE, "\n"); 4311 } 4312 4313 if (!dr_analyze_innermost (&STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info), 4314 init_ref, loop)) 4315 /* dr_analyze_innermost already explained the failure. */ 4316 return false; 4317 4318 if (dump_enabled_p ()) 4319 { 4320 dump_printf_loc (MSG_NOTE, vect_location, 4321 "\touter base_address: "); 4322 dump_generic_expr (MSG_NOTE, TDF_SLIM, 4323 STMT_VINFO_DR_BASE_ADDRESS (stmt_info)); 4324 dump_printf (MSG_NOTE, "\n\touter offset from base address: "); 4325 dump_generic_expr (MSG_NOTE, TDF_SLIM, 4326 STMT_VINFO_DR_OFFSET (stmt_info)); 4327 dump_printf (MSG_NOTE, 4328 "\n\touter constant offset from base address: "); 4329 dump_generic_expr (MSG_NOTE, TDF_SLIM, 4330 STMT_VINFO_DR_INIT (stmt_info)); 4331 dump_printf (MSG_NOTE, "\n\touter step: "); 4332 dump_generic_expr (MSG_NOTE, TDF_SLIM, 4333 STMT_VINFO_DR_STEP (stmt_info)); 4334 dump_printf (MSG_NOTE, "\n\touter base alignment: %d\n", 4335 STMT_VINFO_DR_BASE_ALIGNMENT (stmt_info)); 4336 dump_printf (MSG_NOTE, "\n\touter base misalignment: %d\n", 4337 STMT_VINFO_DR_BASE_MISALIGNMENT (stmt_info)); 4338 dump_printf (MSG_NOTE, "\n\touter offset alignment: %d\n", 4339 STMT_VINFO_DR_OFFSET_ALIGNMENT (stmt_info)); 4340 dump_printf (MSG_NOTE, "\n\touter step alignment: %d\n", 4341 STMT_VINFO_DR_STEP_ALIGNMENT (stmt_info)); 4342 } 4343 } 4344 4345 if (STMT_VINFO_DATA_REF (stmt_info)) 4346 { 4347 if (dump_enabled_p ()) 4348 { 4349 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 4350 "not vectorized: more than one data ref " 4351 "in stmt: "); 4352 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0); 4353 } 4354 4355 if (is_a <bb_vec_info> (vinfo)) 4356 break; 4357 4358 if (gatherscatter != SG_NONE || simd_lane_access) 4359 free_data_ref (dr); 4360 return false; 4361 } 4362 4363 STMT_VINFO_DATA_REF (stmt_info) = dr; 4364 if (simd_lane_access) 4365 { 4366 STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) = true; 4367 free_data_ref (datarefs[i]); 4368 datarefs[i] = dr; 4369 } 4370 4371 if (TREE_CODE (DR_BASE_ADDRESS (dr)) == ADDR_EXPR 4372 && VAR_P (TREE_OPERAND (DR_BASE_ADDRESS (dr), 0)) 4373 && DECL_NONALIASED (TREE_OPERAND (DR_BASE_ADDRESS (dr), 0))) 4374 { 4375 if (dump_enabled_p ()) 4376 { 4377 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 4378 "not vectorized: base object not addressable " 4379 "for stmt: "); 4380 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0); 4381 } 4382 if (is_a <bb_vec_info> 
(vinfo)) 4383 { 4384 /* In BB vectorization the ref can still participate 4385 in dependence analysis, we just can't vectorize it. */ 4386 STMT_VINFO_VECTORIZABLE (stmt_info) = false; 4387 continue; 4388 } 4389 return false; 4390 } 4391 4392 /* Set vectype for STMT. */ 4393 scalar_type = TREE_TYPE (DR_REF (dr)); 4394 STMT_VINFO_VECTYPE (stmt_info) 4395 = get_vectype_for_scalar_type (scalar_type); 4396 if (!STMT_VINFO_VECTYPE (stmt_info)) 4397 { 4398 if (dump_enabled_p ()) 4399 { 4400 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 4401 "not vectorized: no vectype for stmt: "); 4402 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0); 4403 dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: "); 4404 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS, 4405 scalar_type); 4406 dump_printf (MSG_MISSED_OPTIMIZATION, "\n"); 4407 } 4408 4409 if (is_a <bb_vec_info> (vinfo)) 4410 { 4411 /* No vector type is fine, the ref can still participate 4412 in dependence analysis, we just can't vectorize it. */ 4413 STMT_VINFO_VECTORIZABLE (stmt_info) = false; 4414 continue; 4415 } 4416 4417 if (gatherscatter != SG_NONE || simd_lane_access) 4418 { 4419 STMT_VINFO_DATA_REF (stmt_info) = NULL; 4420 if (gatherscatter != SG_NONE) 4421 free_data_ref (dr); 4422 } 4423 return false; 4424 } 4425 else 4426 { 4427 if (dump_enabled_p ()) 4428 { 4429 dump_printf_loc (MSG_NOTE, vect_location, 4430 "got vectype for stmt: "); 4431 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0); 4432 dump_generic_expr (MSG_NOTE, TDF_SLIM, 4433 STMT_VINFO_VECTYPE (stmt_info)); 4434 dump_printf (MSG_NOTE, "\n"); 4435 } 4436 } 4437 4438 /* Adjust the minimal vectorization factor according to the 4439 vector type. */ 4440 vf = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)); 4441 *min_vf = upper_bound (*min_vf, vf); 4442 4443 if (gatherscatter != SG_NONE) 4444 { 4445 gather_scatter_info gs_info; 4446 if (!vect_check_gather_scatter (stmt, as_a <loop_vec_info> (vinfo), 4447 &gs_info) 4448 || !get_vectype_for_scalar_type (TREE_TYPE (gs_info.offset))) 4449 { 4450 STMT_VINFO_DATA_REF (stmt_info) = NULL; 4451 free_data_ref (dr); 4452 if (dump_enabled_p ()) 4453 { 4454 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 4455 (gatherscatter == GATHER) ? 4456 "not vectorized: not suitable for gather " 4457 "load " : 4458 "not vectorized: not suitable for scatter " 4459 "store "); 4460 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0); 4461 } 4462 return false; 4463 } 4464 4465 free_data_ref (datarefs[i]); 4466 datarefs[i] = dr; 4467 STMT_VINFO_GATHER_SCATTER_P (stmt_info) = gatherscatter; 4468 } 4469 4470 else if (is_a <loop_vec_info> (vinfo) 4471 && TREE_CODE (DR_STEP (dr)) != INTEGER_CST) 4472 { 4473 if (nested_in_vect_loop_p (loop, stmt)) 4474 { 4475 if (dump_enabled_p ()) 4476 { 4477 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 4478 "not vectorized: not suitable for strided " 4479 "load "); 4480 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0); 4481 } 4482 return false; 4483 } 4484 STMT_VINFO_STRIDED_P (stmt_info) = true; 4485 } 4486 } 4487 4488 /* If we stopped analysis at the first dataref we could not analyze 4489 when trying to vectorize a basic-block mark the rest of the datarefs 4490 as not vectorizable and truncate the vector of datarefs. That 4491 avoids spending useless time in analyzing their dependence. 
*/ 4492 if (i != datarefs.length ()) 4493 { 4494 gcc_assert (is_a <bb_vec_info> (vinfo)); 4495 for (unsigned j = i; j < datarefs.length (); ++j) 4496 { 4497 data_reference_p dr = datarefs[j]; 4498 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false; 4499 free_data_ref (dr); 4500 } 4501 datarefs.truncate (i); 4502 } 4503 4504 return true; 4505 } 4506 4507 4508 /* Function vect_get_new_vect_var. 4509 4510 Returns a name for a new variable. The current naming scheme appends the 4511 prefix "vect_" or "vect_p" (depending on the value of VAR_KIND) to 4512 the name of vectorizer generated variables, and appends that to NAME if 4513 provided. */ 4514 4515 tree 4516 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name) 4517 { 4518 const char *prefix; 4519 tree new_vect_var; 4520 4521 switch (var_kind) 4522 { 4523 case vect_simple_var: 4524 prefix = "vect"; 4525 break; 4526 case vect_scalar_var: 4527 prefix = "stmp"; 4528 break; 4529 case vect_mask_var: 4530 prefix = "mask"; 4531 break; 4532 case vect_pointer_var: 4533 prefix = "vectp"; 4534 break; 4535 default: 4536 gcc_unreachable (); 4537 } 4538 4539 if (name) 4540 { 4541 char* tmp = concat (prefix, "_", name, NULL); 4542 new_vect_var = create_tmp_reg (type, tmp); 4543 free (tmp); 4544 } 4545 else 4546 new_vect_var = create_tmp_reg (type, prefix); 4547 4548 return new_vect_var; 4549 } 4550 4551 /* Like vect_get_new_vect_var but return an SSA name. */ 4552 4553 tree 4554 vect_get_new_ssa_name (tree type, enum vect_var_kind var_kind, const char *name) 4555 { 4556 const char *prefix; 4557 tree new_vect_var; 4558 4559 switch (var_kind) 4560 { 4561 case vect_simple_var: 4562 prefix = "vect"; 4563 break; 4564 case vect_scalar_var: 4565 prefix = "stmp"; 4566 break; 4567 case vect_pointer_var: 4568 prefix = "vectp"; 4569 break; 4570 default: 4571 gcc_unreachable (); 4572 } 4573 4574 if (name) 4575 { 4576 char* tmp = concat (prefix, "_", name, NULL); 4577 new_vect_var = make_temp_ssa_name (type, NULL, tmp); 4578 free (tmp); 4579 } 4580 else 4581 new_vect_var = make_temp_ssa_name (type, NULL, prefix); 4582 4583 return new_vect_var; 4584 } 4585 4586 /* Duplicate ptr info and set alignment/misaligment on NAME from DR. */ 4587 4588 static void 4589 vect_duplicate_ssa_name_ptr_info (tree name, data_reference *dr) 4590 { 4591 duplicate_ssa_name_ptr_info (name, DR_PTR_INFO (dr)); 4592 int misalign = DR_MISALIGNMENT (dr); 4593 if (misalign == DR_MISALIGNMENT_UNKNOWN) 4594 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name)); 4595 else 4596 set_ptr_info_alignment (SSA_NAME_PTR_INFO (name), 4597 DR_TARGET_ALIGNMENT (dr), misalign); 4598 } 4599 4600 /* Function vect_create_addr_base_for_vector_ref. 4601 4602 Create an expression that computes the address of the first memory location 4603 that will be accessed for a data reference. 4604 4605 Input: 4606 STMT: The statement containing the data reference. 4607 NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list. 4608 OFFSET: Optional. If supplied, it is be added to the initial address. 4609 LOOP: Specify relative to which loop-nest should the address be computed. 4610 For example, when the dataref is in an inner-loop nested in an 4611 outer-loop that is now being vectorized, LOOP can be either the 4612 outer-loop, or the inner-loop. 
The first memory location accessed 4613 by the following dataref ('in' points to short): 4614 4615 for (i=0; i<N; i++) 4616 for (j=0; j<M; j++) 4617 s += in[i+j] 4618 4619 is as follows: 4620 if LOOP=i_loop: &in (relative to i_loop) 4621 if LOOP=j_loop: &in+i*2B (relative to j_loop) 4622 BYTE_OFFSET: Optional, defaulted to NULL. If supplied, it is added to the 4623 initial address. Unlike OFFSET, which is number of elements to 4624 be added, BYTE_OFFSET is measured in bytes. 4625 4626 Output: 4627 1. Return an SSA_NAME whose value is the address of the memory location of 4628 the first vector of the data reference. 4629 2. If new_stmt_list is not NULL_TREE after return then the caller must insert 4630 these statement(s) which define the returned SSA_NAME. 4631 4632 FORNOW: We are only handling array accesses with step 1. */ 4633 4634 tree 4635 vect_create_addr_base_for_vector_ref (gimple *stmt, 4636 gimple_seq *new_stmt_list, 4637 tree offset, 4638 tree byte_offset) 4639 { 4640 stmt_vec_info stmt_info = vinfo_for_stmt (stmt); 4641 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info); 4642 const char *base_name; 4643 tree addr_base; 4644 tree dest; 4645 gimple_seq seq = NULL; 4646 tree vect_ptr_type; 4647 tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr))); 4648 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); 4649 innermost_loop_behavior *drb = vect_dr_behavior (dr); 4650 4651 tree data_ref_base = unshare_expr (drb->base_address); 4652 tree base_offset = unshare_expr (drb->offset); 4653 tree init = unshare_expr (drb->init); 4654 4655 if (loop_vinfo) 4656 base_name = get_name (data_ref_base); 4657 else 4658 { 4659 base_offset = ssize_int (0); 4660 init = ssize_int (0); 4661 base_name = get_name (DR_REF (dr)); 4662 } 4663 4664 /* Create base_offset */ 4665 base_offset = size_binop (PLUS_EXPR, 4666 fold_convert (sizetype, base_offset), 4667 fold_convert (sizetype, init)); 4668 4669 if (offset) 4670 { 4671 offset = fold_build2 (MULT_EXPR, sizetype, 4672 fold_convert (sizetype, offset), step); 4673 base_offset = fold_build2 (PLUS_EXPR, sizetype, 4674 base_offset, offset); 4675 } 4676 if (byte_offset) 4677 { 4678 byte_offset = fold_convert (sizetype, byte_offset); 4679 base_offset = fold_build2 (PLUS_EXPR, sizetype, 4680 base_offset, byte_offset); 4681 } 4682 4683 /* base + base_offset */ 4684 if (loop_vinfo) 4685 addr_base = fold_build_pointer_plus (data_ref_base, base_offset); 4686 else 4687 { 4688 addr_base = build1 (ADDR_EXPR, 4689 build_pointer_type (TREE_TYPE (DR_REF (dr))), 4690 unshare_expr (DR_REF (dr))); 4691 } 4692 4693 vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info)); 4694 dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name); 4695 addr_base = force_gimple_operand (addr_base, &seq, true, dest); 4696 gimple_seq_add_seq (new_stmt_list, seq); 4697 4698 if (DR_PTR_INFO (dr) 4699 && TREE_CODE (addr_base) == SSA_NAME 4700 && !SSA_NAME_PTR_INFO (addr_base)) 4701 { 4702 vect_duplicate_ssa_name_ptr_info (addr_base, dr); 4703 if (offset || byte_offset) 4704 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr_base)); 4705 } 4706 4707 if (dump_enabled_p ()) 4708 { 4709 dump_printf_loc (MSG_NOTE, vect_location, "created "); 4710 dump_generic_expr (MSG_NOTE, TDF_SLIM, addr_base); 4711 dump_printf (MSG_NOTE, "\n"); 4712 } 4713 4714 return addr_base; 4715 } 4716 4717 4718 /* Function vect_create_data_ref_ptr. 

   Create a new pointer-to-AGGR_TYPE variable (ap) that points to the first
   location accessed in the loop by STMT, along with the def-use update
   chain to appropriately advance the pointer through the loop iterations.
   Also set aliasing information for the pointer.  This pointer is used by
   the callers to this function to create a memory reference expression for
   vector load/store access.

   Input:
   1. STMT: a stmt that references memory.  Expected to be of the form
         GIMPLE_ASSIGN <name, data-ref> or
         GIMPLE_ASSIGN <data-ref, name>.
   2. AGGR_TYPE: the type of the reference, which should be either a vector
        or an array.
   3. AT_LOOP: the loop where the vector memref is to be created.
   4. OFFSET (optional): an offset to be added to the initial address accessed
        by the data-ref in STMT.
   5. BSI: location where the new stmts are to be placed if there is no loop.
   6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
        pointing to the initial address.
   7. BYTE_OFFSET (optional, defaults to NULL): a byte offset to be added
        to the initial address accessed by the data-ref in STMT.  This is
        similar to OFFSET, but OFFSET is counted in elements, while
        BYTE_OFFSET in bytes.
   8. IV_STEP (optional, defaults to NULL): the amount that should be added
        to the IV during each iteration of the loop.  NULL says to move
        by one copy of AGGR_TYPE up or down, depending on the step of the
        data reference.

   Output:
   1. Declare a new ptr to vector_type, and have it point to the base of the
      data reference (initial address accessed by the data reference).
      For example, for vector of type V8HI, the following code is generated:

      v8hi *ap;
      ap = (v8hi *)initial_address;

      if OFFSET is not supplied:
         initial_address = &a[init];
      if OFFSET is supplied:
         initial_address = &a[init + OFFSET];
      if BYTE_OFFSET is supplied:
         initial_address = &a[init] + BYTE_OFFSET;

      Return the initial_address in INITIAL_ADDRESS.

   2. If ONLY_INIT is true, just return the initial pointer.  Otherwise, also
      update the pointer in each iteration of the loop.

      Return the increment stmt that updates the pointer in PTR_INCR.

   3. Set INV_P to true if the access pattern of the data reference in the
      vectorized loop is invariant.  Set it to false otherwise.

   4. Return the pointer.
*/ 4774 4775 tree 4776 vect_create_data_ref_ptr (gimple *stmt, tree aggr_type, struct loop *at_loop, 4777 tree offset, tree *initial_address, 4778 gimple_stmt_iterator *gsi, gimple **ptr_incr, 4779 bool only_init, bool *inv_p, tree byte_offset, 4780 tree iv_step) 4781 { 4782 const char *base_name; 4783 stmt_vec_info stmt_info = vinfo_for_stmt (stmt); 4784 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); 4785 struct loop *loop = NULL; 4786 bool nested_in_vect_loop = false; 4787 struct loop *containing_loop = NULL; 4788 tree aggr_ptr_type; 4789 tree aggr_ptr; 4790 tree new_temp; 4791 gimple_seq new_stmt_list = NULL; 4792 edge pe = NULL; 4793 basic_block new_bb; 4794 tree aggr_ptr_init; 4795 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info); 4796 tree aptr; 4797 gimple_stmt_iterator incr_gsi; 4798 bool insert_after; 4799 tree indx_before_incr, indx_after_incr; 4800 gimple *incr; 4801 tree step; 4802 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info); 4803 4804 gcc_assert (iv_step != NULL_TREE 4805 || TREE_CODE (aggr_type) == ARRAY_TYPE 4806 || TREE_CODE (aggr_type) == VECTOR_TYPE); 4807 4808 if (loop_vinfo) 4809 { 4810 loop = LOOP_VINFO_LOOP (loop_vinfo); 4811 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt); 4812 containing_loop = (gimple_bb (stmt))->loop_father; 4813 pe = loop_preheader_edge (loop); 4814 } 4815 else 4816 { 4817 gcc_assert (bb_vinfo); 4818 only_init = true; 4819 *ptr_incr = NULL; 4820 } 4821 4822 /* Check the step (evolution) of the load in LOOP, and record 4823 whether it's invariant. */ 4824 step = vect_dr_behavior (dr)->step; 4825 if (integer_zerop (step)) 4826 *inv_p = true; 4827 else 4828 *inv_p = false; 4829 4830 /* Create an expression for the first address accessed by this load 4831 in LOOP. */ 4832 base_name = get_name (DR_BASE_ADDRESS (dr)); 4833 4834 if (dump_enabled_p ()) 4835 { 4836 tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr)); 4837 dump_printf_loc (MSG_NOTE, vect_location, 4838 "create %s-pointer variable to type: ", 4839 get_tree_code_name (TREE_CODE (aggr_type))); 4840 dump_generic_expr (MSG_NOTE, TDF_SLIM, aggr_type); 4841 if (TREE_CODE (dr_base_type) == ARRAY_TYPE) 4842 dump_printf (MSG_NOTE, " vectorizing an array ref: "); 4843 else if (TREE_CODE (dr_base_type) == VECTOR_TYPE) 4844 dump_printf (MSG_NOTE, " vectorizing a vector ref: "); 4845 else if (TREE_CODE (dr_base_type) == RECORD_TYPE) 4846 dump_printf (MSG_NOTE, " vectorizing a record based array ref: "); 4847 else 4848 dump_printf (MSG_NOTE, " vectorizing a pointer ref: "); 4849 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_BASE_OBJECT (dr)); 4850 dump_printf (MSG_NOTE, "\n"); 4851 } 4852 4853 /* (1) Create the new aggregate-pointer variable. 4854 Vector and array types inherit the alias set of their component 4855 type by default so we need to use a ref-all pointer if the data 4856 reference does not conflict with the created aggregated data 4857 reference because it is not addressable. */ 4858 bool need_ref_all = false; 4859 if (!alias_sets_conflict_p (get_alias_set (aggr_type), 4860 get_alias_set (DR_REF (dr)))) 4861 need_ref_all = true; 4862 /* Likewise for any of the data references in the stmt group. 
*/ 4863 else if (STMT_VINFO_GROUP_SIZE (stmt_info) > 1) 4864 { 4865 gimple *orig_stmt = STMT_VINFO_GROUP_FIRST_ELEMENT (stmt_info); 4866 do 4867 { 4868 stmt_vec_info sinfo = vinfo_for_stmt (orig_stmt); 4869 struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo); 4870 if (!alias_sets_conflict_p (get_alias_set (aggr_type), 4871 get_alias_set (DR_REF (sdr)))) 4872 { 4873 need_ref_all = true; 4874 break; 4875 } 4876 orig_stmt = STMT_VINFO_GROUP_NEXT_ELEMENT (sinfo); 4877 } 4878 while (orig_stmt); 4879 } 4880 aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode, 4881 need_ref_all); 4882 aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name); 4883 4884 4885 /* Note: If the dataref is in an inner-loop nested in LOOP, and we are 4886 vectorizing LOOP (i.e., outer-loop vectorization), we need to create two 4887 def-use update cycles for the pointer: one relative to the outer-loop 4888 (LOOP), which is what steps (3) and (4) below do. The other is relative 4889 to the inner-loop (which is the inner-most loop containing the dataref), 4890 and this is done be step (5) below. 4891 4892 When vectorizing inner-most loops, the vectorized loop (LOOP) is also the 4893 inner-most loop, and so steps (3),(4) work the same, and step (5) is 4894 redundant. Steps (3),(4) create the following: 4895 4896 vp0 = &base_addr; 4897 LOOP: vp1 = phi(vp0,vp2) 4898 ... 4899 ... 4900 vp2 = vp1 + step 4901 goto LOOP 4902 4903 If there is an inner-loop nested in loop, then step (5) will also be 4904 applied, and an additional update in the inner-loop will be created: 4905 4906 vp0 = &base_addr; 4907 LOOP: vp1 = phi(vp0,vp2) 4908 ... 4909 inner: vp3 = phi(vp1,vp4) 4910 vp4 = vp3 + inner_step 4911 if () goto inner 4912 ... 4913 vp2 = vp1 + step 4914 if () goto LOOP */ 4915 4916 /* (2) Calculate the initial address of the aggregate-pointer, and set 4917 the aggregate-pointer to point to it before the loop. */ 4918 4919 /* Create: (&(base[init_val+offset]+byte_offset) in the loop preheader. */ 4920 4921 new_temp = vect_create_addr_base_for_vector_ref (stmt, &new_stmt_list, 4922 offset, byte_offset); 4923 if (new_stmt_list) 4924 { 4925 if (pe) 4926 { 4927 new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list); 4928 gcc_assert (!new_bb); 4929 } 4930 else 4931 gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT); 4932 } 4933 4934 *initial_address = new_temp; 4935 aggr_ptr_init = new_temp; 4936 4937 /* (3) Handle the updating of the aggregate-pointer inside the loop. 4938 This is needed when ONLY_INIT is false, and also when AT_LOOP is the 4939 inner-loop nested in LOOP (during outer-loop vectorization). */ 4940 4941 /* No update in loop is required. */ 4942 if (only_init && (!loop_vinfo || at_loop == loop)) 4943 aptr = aggr_ptr_init; 4944 else 4945 { 4946 if (iv_step == NULL_TREE) 4947 { 4948 /* The step of the aggregate pointer is the type size. */ 4949 iv_step = TYPE_SIZE_UNIT (aggr_type); 4950 /* One exception to the above is when the scalar step of the load in 4951 LOOP is zero. In this case the step here is also zero. 
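            (This is the invariant-access case recorded in *INV_P above,
            e.g. a load from a[j] where j does not change inside LOOP; the
            aggregate pointer must then stay put rather than advance by
            TYPE_SIZE_UNIT (AGGR_TYPE).)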
*/ 4952 if (*inv_p) 4953 iv_step = size_zero_node; 4954 else if (tree_int_cst_sgn (step) == -1) 4955 iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step); 4956 } 4957 4958 standard_iv_increment_position (loop, &incr_gsi, &insert_after); 4959 4960 create_iv (aggr_ptr_init, 4961 fold_convert (aggr_ptr_type, iv_step), 4962 aggr_ptr, loop, &incr_gsi, insert_after, 4963 &indx_before_incr, &indx_after_incr); 4964 incr = gsi_stmt (incr_gsi); 4965 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo)); 4966 4967 /* Copy the points-to information if it exists. */ 4968 if (DR_PTR_INFO (dr)) 4969 { 4970 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr); 4971 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr); 4972 } 4973 if (ptr_incr) 4974 *ptr_incr = incr; 4975 4976 aptr = indx_before_incr; 4977 } 4978 4979 if (!nested_in_vect_loop || only_init) 4980 return aptr; 4981 4982 4983 /* (4) Handle the updating of the aggregate-pointer inside the inner-loop 4984 nested in LOOP, if exists. */ 4985 4986 gcc_assert (nested_in_vect_loop); 4987 if (!only_init) 4988 { 4989 standard_iv_increment_position (containing_loop, &incr_gsi, 4990 &insert_after); 4991 create_iv (aptr, fold_convert (aggr_ptr_type, DR_STEP (dr)), aggr_ptr, 4992 containing_loop, &incr_gsi, insert_after, &indx_before_incr, 4993 &indx_after_incr); 4994 incr = gsi_stmt (incr_gsi); 4995 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo)); 4996 4997 /* Copy the points-to information if it exists. */ 4998 if (DR_PTR_INFO (dr)) 4999 { 5000 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr); 5001 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr); 5002 } 5003 if (ptr_incr) 5004 *ptr_incr = incr; 5005 5006 return indx_before_incr; 5007 } 5008 else 5009 gcc_unreachable (); 5010 } 5011 5012 5013 /* Function bump_vector_ptr 5014 5015 Increment a pointer (to a vector type) by vector-size. If requested, 5016 i.e. if PTR-INCR is given, then also connect the new increment stmt 5017 to the existing def-use update-chain of the pointer, by modifying 5018 the PTR_INCR as illustrated below: 5019 5020 The pointer def-use update-chain before this function: 5021 DATAREF_PTR = phi (p_0, p_2) 5022 .... 5023 PTR_INCR: p_2 = DATAREF_PTR + step 5024 5025 The pointer def-use update-chain after this function: 5026 DATAREF_PTR = phi (p_0, p_2) 5027 .... 5028 NEW_DATAREF_PTR = DATAREF_PTR + BUMP 5029 .... 5030 PTR_INCR: p_2 = NEW_DATAREF_PTR + step 5031 5032 Input: 5033 DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated 5034 in the loop. 5035 PTR_INCR - optional. The stmt that updates the pointer in each iteration of 5036 the loop. The increment amount across iterations is expected 5037 to be vector_size. 5038 BSI - location where the new update stmt is to be placed. 5039 STMT - the original scalar memory-access stmt that is being vectorized. 5040 BUMP - optional. The offset by which to bump the pointer. If not given, 5041 the offset is assumed to be vector_size. 5042 5043 Output: Return NEW_DATAREF_PTR as illustrated above. 
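   For instance, for a V8HI access with no BUMP supplied, the result is
   DATAREF_PTR + TYPE_SIZE_UNIT (V8HI), i.e. the pointer advances by 16 bytes.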
5044 5045 */ 5046 5047 tree 5048 bump_vector_ptr (tree dataref_ptr, gimple *ptr_incr, gimple_stmt_iterator *gsi, 5049 gimple *stmt, tree bump) 5050 { 5051 stmt_vec_info stmt_info = vinfo_for_stmt (stmt); 5052 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info); 5053 tree vectype = STMT_VINFO_VECTYPE (stmt_info); 5054 tree update = TYPE_SIZE_UNIT (vectype); 5055 gassign *incr_stmt; 5056 ssa_op_iter iter; 5057 use_operand_p use_p; 5058 tree new_dataref_ptr; 5059 5060 if (bump) 5061 update = bump; 5062 5063 if (TREE_CODE (dataref_ptr) == SSA_NAME) 5064 new_dataref_ptr = copy_ssa_name (dataref_ptr); 5065 else 5066 new_dataref_ptr = make_ssa_name (TREE_TYPE (dataref_ptr)); 5067 incr_stmt = gimple_build_assign (new_dataref_ptr, POINTER_PLUS_EXPR, 5068 dataref_ptr, update); 5069 vect_finish_stmt_generation (stmt, incr_stmt, gsi); 5070 5071 /* Copy the points-to information if it exists. */ 5072 if (DR_PTR_INFO (dr)) 5073 { 5074 duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr)); 5075 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr)); 5076 } 5077 5078 if (!ptr_incr) 5079 return new_dataref_ptr; 5080 5081 /* Update the vector-pointer's cross-iteration increment. */ 5082 FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE) 5083 { 5084 tree use = USE_FROM_PTR (use_p); 5085 5086 if (use == dataref_ptr) 5087 SET_USE (use_p, new_dataref_ptr); 5088 else 5089 gcc_assert (operand_equal_p (use, update, 0)); 5090 } 5091 5092 return new_dataref_ptr; 5093 } 5094 5095 5096 /* Copy memory reference info such as base/clique from the SRC reference 5097 to the DEST MEM_REF. */ 5098 5099 void 5100 vect_copy_ref_info (tree dest, tree src) 5101 { 5102 if (TREE_CODE (dest) != MEM_REF) 5103 return; 5104 5105 tree src_base = src; 5106 while (handled_component_p (src_base)) 5107 src_base = TREE_OPERAND (src_base, 0); 5108 if (TREE_CODE (src_base) != MEM_REF 5109 && TREE_CODE (src_base) != TARGET_MEM_REF) 5110 return; 5111 5112 MR_DEPENDENCE_CLIQUE (dest) = MR_DEPENDENCE_CLIQUE (src_base); 5113 MR_DEPENDENCE_BASE (dest) = MR_DEPENDENCE_BASE (src_base); 5114 } 5115 5116 5117 /* Function vect_create_destination_var. 5118 5119 Create a new temporary of type VECTYPE. */ 5120 5121 tree 5122 vect_create_destination_var (tree scalar_dest, tree vectype) 5123 { 5124 tree vec_dest; 5125 const char *name; 5126 char *new_name; 5127 tree type; 5128 enum vect_var_kind kind; 5129 5130 kind = vectype 5131 ? VECTOR_BOOLEAN_TYPE_P (vectype) 5132 ? vect_mask_var 5133 : vect_simple_var 5134 : vect_scalar_var; 5135 type = vectype ? vectype : TREE_TYPE (scalar_dest); 5136 5137 gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME); 5138 5139 name = get_name (scalar_dest); 5140 if (name) 5141 new_name = xasprintf ("%s_%u", name, SSA_NAME_VERSION (scalar_dest)); 5142 else 5143 new_name = xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest)); 5144 vec_dest = vect_get_new_vect_var (type, kind, new_name); 5145 free (new_name); 5146 5147 return vec_dest; 5148 } 5149 5150 /* Function vect_grouped_store_supported. 5151 5152 Returns TRUE if interleave high and interleave low permutations 5153 are supported, and FALSE otherwise. */ 5154 5155 bool 5156 vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count) 5157 { 5158 machine_mode mode = TYPE_MODE (vectype); 5159 5160 /* vect_permute_store_chain requires the group size to be equal to 3 or 5161 be a power of two. 
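     (So interleaved groups of, say, 2, 3, 4 or 8 stores can be handled,
     while a group of 5 or 6 is rejected here.)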
*/ 5162 if (count != 3 && exact_log2 (count) == -1) 5163 { 5164 if (dump_enabled_p ()) 5165 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 5166 "the size of the group of accesses" 5167 " is not a power of 2 or not eqaul to 3\n"); 5168 return false; 5169 } 5170 5171 /* Check that the permutation is supported. */ 5172 if (VECTOR_MODE_P (mode)) 5173 { 5174 unsigned int i; 5175 if (count == 3) 5176 { 5177 unsigned int j0 = 0, j1 = 0, j2 = 0; 5178 unsigned int i, j; 5179 5180 unsigned int nelt; 5181 if (!GET_MODE_NUNITS (mode).is_constant (&nelt)) 5182 { 5183 if (dump_enabled_p ()) 5184 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 5185 "cannot handle groups of 3 stores for" 5186 " variable-length vectors\n"); 5187 return false; 5188 } 5189 5190 vec_perm_builder sel (nelt, nelt, 1); 5191 sel.quick_grow (nelt); 5192 vec_perm_indices indices; 5193 for (j = 0; j < 3; j++) 5194 { 5195 int nelt0 = ((3 - j) * nelt) % 3; 5196 int nelt1 = ((3 - j) * nelt + 1) % 3; 5197 int nelt2 = ((3 - j) * nelt + 2) % 3; 5198 for (i = 0; i < nelt; i++) 5199 { 5200 if (3 * i + nelt0 < nelt) 5201 sel[3 * i + nelt0] = j0++; 5202 if (3 * i + nelt1 < nelt) 5203 sel[3 * i + nelt1] = nelt + j1++; 5204 if (3 * i + nelt2 < nelt) 5205 sel[3 * i + nelt2] = 0; 5206 } 5207 indices.new_vector (sel, 2, nelt); 5208 if (!can_vec_perm_const_p (mode, indices)) 5209 { 5210 if (dump_enabled_p ()) 5211 dump_printf (MSG_MISSED_OPTIMIZATION, 5212 "permutation op not supported by target.\n"); 5213 return false; 5214 } 5215 5216 for (i = 0; i < nelt; i++) 5217 { 5218 if (3 * i + nelt0 < nelt) 5219 sel[3 * i + nelt0] = 3 * i + nelt0; 5220 if (3 * i + nelt1 < nelt) 5221 sel[3 * i + nelt1] = 3 * i + nelt1; 5222 if (3 * i + nelt2 < nelt) 5223 sel[3 * i + nelt2] = nelt + j2++; 5224 } 5225 indices.new_vector (sel, 2, nelt); 5226 if (!can_vec_perm_const_p (mode, indices)) 5227 { 5228 if (dump_enabled_p ()) 5229 dump_printf (MSG_MISSED_OPTIMIZATION, 5230 "permutation op not supported by target.\n"); 5231 return false; 5232 } 5233 } 5234 return true; 5235 } 5236 else 5237 { 5238 /* If length is not equal to 3 then only power of 2 is supported. */ 5239 gcc_assert (pow2p_hwi (count)); 5240 poly_uint64 nelt = GET_MODE_NUNITS (mode); 5241 5242 /* The encoding has 2 interleaved stepped patterns. */ 5243 vec_perm_builder sel (nelt, 2, 3); 5244 sel.quick_grow (6); 5245 for (i = 0; i < 3; i++) 5246 { 5247 sel[i * 2] = i; 5248 sel[i * 2 + 1] = i + nelt; 5249 } 5250 vec_perm_indices indices (sel, 2, nelt); 5251 if (can_vec_perm_const_p (mode, indices)) 5252 { 5253 for (i = 0; i < 6; i++) 5254 sel[i] += exact_div (nelt, 2); 5255 indices.new_vector (sel, 2, nelt); 5256 if (can_vec_perm_const_p (mode, indices)) 5257 return true; 5258 } 5259 } 5260 } 5261 5262 if (dump_enabled_p ()) 5263 dump_printf (MSG_MISSED_OPTIMIZATION, 5264 "permutaion op not supported by target.\n"); 5265 return false; 5266 } 5267 5268 5269 /* Return TRUE if vec_{mask_}store_lanes is available for COUNT vectors of 5270 type VECTYPE. MASKED_P says whether the masked form is needed. */ 5271 5272 bool 5273 vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count, 5274 bool masked_p) 5275 { 5276 if (masked_p) 5277 return vect_lanes_optab_supported_p ("vec_mask_store_lanes", 5278 vec_mask_store_lanes_optab, 5279 vectype, count); 5280 else 5281 return vect_lanes_optab_supported_p ("vec_store_lanes", 5282 vec_store_lanes_optab, 5283 vectype, count); 5284 } 5285 5286 5287 /* Function vect_permute_store_chain. 
5288 
5289    Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
5290    a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
5291    the data correctly for the stores. Return the final references for stores
5292    in RESULT_CHAIN.
5293 
5294    E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5295    The input is 4 vectors each containing 8 elements. We assign a number to
5296    each element, the input sequence is:
5297 
5298    1st vec:   0  1  2  3  4  5  6  7
5299    2nd vec:   8  9 10 11 12 13 14 15
5300    3rd vec:  16 17 18 19 20 21 22 23
5301    4th vec:  24 25 26 27 28 29 30 31
5302 
5303    The output sequence should be:
5304 
5305    1st vec:   0  8 16 24  1  9 17 25
5306    2nd vec:   2 10 18 26  3 11 19 27
5307    3rd vec:   4 12 20 28  5 13 21 29
5308    4th vec:   6 14 22 30  7 15 23 31
5309 
5310    i.e., we interleave the contents of the four vectors in their order.
5311 
5312    We use interleave_high/low instructions to create such output. The input of
5313    each interleave_high/low operation is two vectors:
5314    1st vec    2nd vec
5315    0 1 2 3    4 5 6 7
5316    the even elements of the result vector are obtained left-to-right from the
5317    high/low elements of the first vector. The odd elements of the result are
5318    obtained left-to-right from the high/low elements of the second vector.
5319    The output of interleave_high will be:  0 4 1 5
5320    and of interleave_low:                  2 6 3 7
5321 
5322 
5323    The permutation is done in log LENGTH stages. In each stage interleave_high
5324    and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
5325    where the first argument is taken from the first half of DR_CHAIN and the
5326    second argument from its second half.
5327    In our example,
5328 
5329    I1: interleave_high (1st vec, 3rd vec)
5330    I2: interleave_low (1st vec, 3rd vec)
5331    I3: interleave_high (2nd vec, 4th vec)
5332    I4: interleave_low (2nd vec, 4th vec)
5333 
5334    The output for the first stage is:
5335 
5336    I1:   0 16  1 17  2 18  3 19
5337    I2:   4 20  5 21  6 22  7 23
5338    I3:   8 24  9 25 10 26 11 27
5339    I4:  12 28 13 29 14 30 15 31
5340 
5341    The output of the second stage, i.e. the final result is:
5342 
5343    I1:   0  8 16 24  1  9 17 25
5344    I2:   2 10 18 26  3 11 19 27
5345    I3:   4 12 20 28  5 13 21 29
5346    I4:   6 14 22 30  7 15 23 31. */
5347 
5348 void
5349 vect_permute_store_chain (vec<tree> dr_chain,
5350                           unsigned int length,
5351                           gimple *stmt,
5352                           gimple_stmt_iterator *gsi,
5353                           vec<tree> *result_chain)
5354 {
5355   tree vect1, vect2, high, low;
5356   gimple *perm_stmt;
5357   tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5358   tree perm_mask_low, perm_mask_high;
5359   tree data_ref;
5360   tree perm3_mask_low, perm3_mask_high;
5361   unsigned int i, j, n, log_length = exact_log2 (length);
5362 
5363   result_chain->quick_grow (length);
5364   memcpy (result_chain->address (), dr_chain.address (),
5365           length * sizeof (tree));
5366 
5367   if (length == 3)
5368     {
5369       /* vect_grouped_store_supported ensures that this is constant.
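         (It rejects groups of 3 stores for variable-length vectors, so
         to_constant () cannot fail here.)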
*/ 5370 unsigned int nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant (); 5371 unsigned int j0 = 0, j1 = 0, j2 = 0; 5372 5373 vec_perm_builder sel (nelt, nelt, 1); 5374 sel.quick_grow (nelt); 5375 vec_perm_indices indices; 5376 for (j = 0; j < 3; j++) 5377 { 5378 int nelt0 = ((3 - j) * nelt) % 3; 5379 int nelt1 = ((3 - j) * nelt + 1) % 3; 5380 int nelt2 = ((3 - j) * nelt + 2) % 3; 5381 5382 for (i = 0; i < nelt; i++) 5383 { 5384 if (3 * i + nelt0 < nelt) 5385 sel[3 * i + nelt0] = j0++; 5386 if (3 * i + nelt1 < nelt) 5387 sel[3 * i + nelt1] = nelt + j1++; 5388 if (3 * i + nelt2 < nelt) 5389 sel[3 * i + nelt2] = 0; 5390 } 5391 indices.new_vector (sel, 2, nelt); 5392 perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices); 5393 5394 for (i = 0; i < nelt; i++) 5395 { 5396 if (3 * i + nelt0 < nelt) 5397 sel[3 * i + nelt0] = 3 * i + nelt0; 5398 if (3 * i + nelt1 < nelt) 5399 sel[3 * i + nelt1] = 3 * i + nelt1; 5400 if (3 * i + nelt2 < nelt) 5401 sel[3 * i + nelt2] = nelt + j2++; 5402 } 5403 indices.new_vector (sel, 2, nelt); 5404 perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices); 5405 5406 vect1 = dr_chain[0]; 5407 vect2 = dr_chain[1]; 5408 5409 /* Create interleaving stmt: 5410 low = VEC_PERM_EXPR <vect1, vect2, 5411 {j, nelt, *, j + 1, nelt + j + 1, *, 5412 j + 2, nelt + j + 2, *, ...}> */ 5413 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low"); 5414 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1, 5415 vect2, perm3_mask_low); 5416 vect_finish_stmt_generation (stmt, perm_stmt, gsi); 5417 5418 vect1 = data_ref; 5419 vect2 = dr_chain[2]; 5420 /* Create interleaving stmt: 5421 low = VEC_PERM_EXPR <vect1, vect2, 5422 {0, 1, nelt + j, 3, 4, nelt + j + 1, 5423 6, 7, nelt + j + 2, ...}> */ 5424 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high"); 5425 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1, 5426 vect2, perm3_mask_high); 5427 vect_finish_stmt_generation (stmt, perm_stmt, gsi); 5428 (*result_chain)[j] = data_ref; 5429 } 5430 } 5431 else 5432 { 5433 /* If length is not equal to 3 then only power of 2 is supported. */ 5434 gcc_assert (pow2p_hwi (length)); 5435 5436 /* The encoding has 2 interleaved stepped patterns. 
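         For V8HI, for example, this expands to the high mask
         { 0, 8, 1, 9, 2, 10, 3, 11 }; adding nelt / 2 to each element then
         gives the low mask { 4, 12, 5, 13, 6, 14, 7, 15 }.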
*/ 5437 poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype); 5438 vec_perm_builder sel (nelt, 2, 3); 5439 sel.quick_grow (6); 5440 for (i = 0; i < 3; i++) 5441 { 5442 sel[i * 2] = i; 5443 sel[i * 2 + 1] = i + nelt; 5444 } 5445 vec_perm_indices indices (sel, 2, nelt); 5446 perm_mask_high = vect_gen_perm_mask_checked (vectype, indices); 5447 5448 for (i = 0; i < 6; i++) 5449 sel[i] += exact_div (nelt, 2); 5450 indices.new_vector (sel, 2, nelt); 5451 perm_mask_low = vect_gen_perm_mask_checked (vectype, indices); 5452 5453 for (i = 0, n = log_length; i < n; i++) 5454 { 5455 for (j = 0; j < length/2; j++) 5456 { 5457 vect1 = dr_chain[j]; 5458 vect2 = dr_chain[j+length/2]; 5459 5460 /* Create interleaving stmt: 5461 high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1, 5462 ...}> */ 5463 high = make_temp_ssa_name (vectype, NULL, "vect_inter_high"); 5464 perm_stmt = gimple_build_assign (high, VEC_PERM_EXPR, vect1, 5465 vect2, perm_mask_high); 5466 vect_finish_stmt_generation (stmt, perm_stmt, gsi); 5467 (*result_chain)[2*j] = high; 5468 5469 /* Create interleaving stmt: 5470 low = VEC_PERM_EXPR <vect1, vect2, 5471 {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1, 5472 ...}> */ 5473 low = make_temp_ssa_name (vectype, NULL, "vect_inter_low"); 5474 perm_stmt = gimple_build_assign (low, VEC_PERM_EXPR, vect1, 5475 vect2, perm_mask_low); 5476 vect_finish_stmt_generation (stmt, perm_stmt, gsi); 5477 (*result_chain)[2*j+1] = low; 5478 } 5479 memcpy (dr_chain.address (), result_chain->address (), 5480 length * sizeof (tree)); 5481 } 5482 } 5483 } 5484 5485 /* Function vect_setup_realignment 5486 5487 This function is called when vectorizing an unaligned load using 5488 the dr_explicit_realign[_optimized] scheme. 5489 This function generates the following code at the loop prolog: 5490 5491 p = initial_addr; 5492 x msq_init = *(floor(p)); # prolog load 5493 realignment_token = call target_builtin; 5494 loop: 5495 x msq = phi (msq_init, ---) 5496 5497 The stmts marked with x are generated only for the case of 5498 dr_explicit_realign_optimized. 5499 5500 The code above sets up a new (vector) pointer, pointing to the first 5501 location accessed by STMT, and a "floor-aligned" load using that pointer. 5502 It also generates code to compute the "realignment-token" (if the relevant 5503 target hook was defined), and creates a phi-node at the loop-header bb 5504 whose arguments are the result of the prolog-load (created by this 5505 function) and the result of a load that takes place in the loop (to be 5506 created by the caller to this function). 5507 5508 For the case of dr_explicit_realign_optimized: 5509 The caller to this function uses the phi-result (msq) to create the 5510 realignment code inside the loop, and sets up the missing phi argument, 5511 as follows: 5512 loop: 5513 msq = phi (msq_init, lsq) 5514 lsq = *(floor(p')); # load in loop 5515 result = realign_load (msq, lsq, realignment_token); 5516 5517 For the case of dr_explicit_realign: 5518 loop: 5519 msq = *(floor(p)); # load in loop 5520 p' = p + (VS-1); 5521 lsq = *(floor(p')); # load in loop 5522 result = realign_load (msq, lsq, realignment_token); 5523 5524 Input: 5525 STMT - (scalar) load stmt to be vectorized. This load accesses 5526 a memory location that may be unaligned. 5527 BSI - place where new code is to be inserted. 5528 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes 5529 is used. 5530 5531 Output: 5532 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load 5533 target hook, if defined. 
5534 Return value - the result of the loop-header phi node. */ 5535 5536 tree 5537 vect_setup_realignment (gimple *stmt, gimple_stmt_iterator *gsi, 5538 tree *realignment_token, 5539 enum dr_alignment_support alignment_support_scheme, 5540 tree init_addr, 5541 struct loop **at_loop) 5542 { 5543 stmt_vec_info stmt_info = vinfo_for_stmt (stmt); 5544 tree vectype = STMT_VINFO_VECTYPE (stmt_info); 5545 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); 5546 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info); 5547 struct loop *loop = NULL; 5548 edge pe = NULL; 5549 tree scalar_dest = gimple_assign_lhs (stmt); 5550 tree vec_dest; 5551 gimple *inc; 5552 tree ptr; 5553 tree data_ref; 5554 basic_block new_bb; 5555 tree msq_init = NULL_TREE; 5556 tree new_temp; 5557 gphi *phi_stmt; 5558 tree msq = NULL_TREE; 5559 gimple_seq stmts = NULL; 5560 bool inv_p; 5561 bool compute_in_loop = false; 5562 bool nested_in_vect_loop = false; 5563 struct loop *containing_loop = (gimple_bb (stmt))->loop_father; 5564 struct loop *loop_for_initial_load = NULL; 5565 5566 if (loop_vinfo) 5567 { 5568 loop = LOOP_VINFO_LOOP (loop_vinfo); 5569 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt); 5570 } 5571 5572 gcc_assert (alignment_support_scheme == dr_explicit_realign 5573 || alignment_support_scheme == dr_explicit_realign_optimized); 5574 5575 /* We need to generate three things: 5576 1. the misalignment computation 5577 2. the extra vector load (for the optimized realignment scheme). 5578 3. the phi node for the two vectors from which the realignment is 5579 done (for the optimized realignment scheme). */ 5580 5581 /* 1. Determine where to generate the misalignment computation. 5582 5583 If INIT_ADDR is NULL_TREE, this indicates that the misalignment 5584 calculation will be generated by this function, outside the loop (in the 5585 preheader). Otherwise, INIT_ADDR had already been computed for us by the 5586 caller, inside the loop. 5587 5588 Background: If the misalignment remains fixed throughout the iterations of 5589 the loop, then both realignment schemes are applicable, and also the 5590 misalignment computation can be done outside LOOP. This is because we are 5591 vectorizing LOOP, and so the memory accesses in LOOP advance in steps that 5592 are a multiple of VS (the Vector Size), and therefore the misalignment in 5593 different vectorized LOOP iterations is always the same. 5594 The problem arises only if the memory access is in an inner-loop nested 5595 inside LOOP, which is now being vectorized using outer-loop vectorization. 5596 This is the only case when the misalignment of the memory access may not 5597 remain fixed throughout the iterations of the inner-loop (as explained in 5598 detail in vect_supportable_dr_alignment). In this case, not only is the 5599 optimized realignment scheme not applicable, but also the misalignment 5600 computation (and generation of the realignment token that is passed to 5601 REALIGN_LOAD) have to be done inside the loop. 5602 5603 In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode 5604 or not, which in turn determines if the misalignment is computed inside 5605 the inner-loop, or outside LOOP. */ 5606 5607 if (init_addr != NULL_TREE || !loop_vinfo) 5608 { 5609 compute_in_loop = true; 5610 gcc_assert (alignment_support_scheme == dr_explicit_realign); 5611 } 5612 5613 5614 /* 2. Determine where to generate the extra vector load. 
5615 
5616    For the optimized realignment scheme, instead of generating two vector
5617    loads in each iteration, we generate a single extra vector load in the
5618    preheader of the loop, and in each iteration reuse the result of the
5619    vector load from the previous iteration. In case the memory access is in
5620    an inner-loop nested inside LOOP, which is now being vectorized using
5621    outer-loop vectorization, we need to determine whether this initial vector
5622    load should be generated at the preheader of the inner-loop, or can be
5623    generated at the preheader of LOOP. If the memory access has no evolution
5624    in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
5625    to be generated inside LOOP (in the preheader of the inner-loop). */
5626 
5627   if (nested_in_vect_loop)
5628     {
5629       tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
5630       bool invariant_in_outerloop =
5631         (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
5632       loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
5633     }
5634   else
5635     loop_for_initial_load = loop;
5636   if (at_loop)
5637     *at_loop = loop_for_initial_load;
5638 
5639   if (loop_for_initial_load)
5640     pe = loop_preheader_edge (loop_for_initial_load);
5641 
5642   /* 3. For the case of the optimized realignment, create the first vector
5643      load at the loop preheader. */
5644 
5645   if (alignment_support_scheme == dr_explicit_realign_optimized)
5646     {
5647       /* Create msq_init = *(floor(p1)) in the loop preheader */
5648       gassign *new_stmt;
5649 
5650       gcc_assert (!compute_in_loop);
5651       vec_dest = vect_create_destination_var (scalar_dest, vectype);
5652       ptr = vect_create_data_ref_ptr (stmt, vectype, loop_for_initial_load,
5653                                       NULL_TREE, &init_addr, NULL, &inc,
5654                                       true, &inv_p);
5655       if (TREE_CODE (ptr) == SSA_NAME)
5656         new_temp = copy_ssa_name (ptr);
5657       else
5658         new_temp = make_ssa_name (TREE_TYPE (ptr));
5659       unsigned int align = DR_TARGET_ALIGNMENT (dr);
5660       new_stmt = gimple_build_assign
5661                    (new_temp, BIT_AND_EXPR, ptr,
5662                     build_int_cst (TREE_TYPE (ptr), -(HOST_WIDE_INT) align));
5663       new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5664       gcc_assert (!new_bb);
5665       data_ref
5666         = build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
5667                   build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
5668       vect_copy_ref_info (data_ref, DR_REF (dr));
5669       new_stmt = gimple_build_assign (vec_dest, data_ref);
5670       new_temp = make_ssa_name (vec_dest, new_stmt);
5671       gimple_assign_set_lhs (new_stmt, new_temp);
5672       if (pe)
5673         {
5674           new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5675           gcc_assert (!new_bb);
5676         }
5677       else
5678         gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5679 
5680       msq_init = gimple_assign_lhs (new_stmt);
5681     }
5682 
5683   /* 4. Create realignment token using a target builtin, if available.
5684      It is done either inside the containing loop, or before LOOP (as
5685      determined above). */
5686 
5687   if (targetm.vectorize.builtin_mask_for_load)
5688     {
5689       gcall *new_stmt;
5690       tree builtin_decl;
5691 
5692       /* Compute INIT_ADDR - the initial address accessed by this memref. */
5693       if (!init_addr)
5694         {
5695           /* Generate the INIT_ADDR computation outside LOOP.
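             When there is no containing loop, it is emitted immediately
             before GSI instead.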
*/ 5696 init_addr = vect_create_addr_base_for_vector_ref (stmt, &stmts, 5697 NULL_TREE); 5698 if (loop) 5699 { 5700 pe = loop_preheader_edge (loop); 5701 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts); 5702 gcc_assert (!new_bb); 5703 } 5704 else 5705 gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT); 5706 } 5707 5708 builtin_decl = targetm.vectorize.builtin_mask_for_load (); 5709 new_stmt = gimple_build_call (builtin_decl, 1, init_addr); 5710 vec_dest = 5711 vect_create_destination_var (scalar_dest, 5712 gimple_call_return_type (new_stmt)); 5713 new_temp = make_ssa_name (vec_dest, new_stmt); 5714 gimple_call_set_lhs (new_stmt, new_temp); 5715 5716 if (compute_in_loop) 5717 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT); 5718 else 5719 { 5720 /* Generate the misalignment computation outside LOOP. */ 5721 pe = loop_preheader_edge (loop); 5722 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt); 5723 gcc_assert (!new_bb); 5724 } 5725 5726 *realignment_token = gimple_call_lhs (new_stmt); 5727 5728 /* The result of the CALL_EXPR to this builtin is determined from 5729 the value of the parameter and no global variables are touched 5730 which makes the builtin a "const" function. Requiring the 5731 builtin to have the "const" attribute makes it unnecessary 5732 to call mark_call_clobbered. */ 5733 gcc_assert (TREE_READONLY (builtin_decl)); 5734 } 5735 5736 if (alignment_support_scheme == dr_explicit_realign) 5737 return msq; 5738 5739 gcc_assert (!compute_in_loop); 5740 gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized); 5741 5742 5743 /* 5. Create msq = phi <msq_init, lsq> in loop */ 5744 5745 pe = loop_preheader_edge (containing_loop); 5746 vec_dest = vect_create_destination_var (scalar_dest, vectype); 5747 msq = make_ssa_name (vec_dest); 5748 phi_stmt = create_phi_node (msq, containing_loop->header); 5749 add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION); 5750 5751 return msq; 5752 } 5753 5754 5755 /* Function vect_grouped_load_supported. 5756 5757 COUNT is the size of the load group (the number of statements plus the 5758 number of gaps). SINGLE_ELEMENT_P is true if there is actually 5759 only one statement, with a gap of COUNT - 1. 5760 5761 Returns true if a suitable permute exists. */ 5762 5763 bool 5764 vect_grouped_load_supported (tree vectype, bool single_element_p, 5765 unsigned HOST_WIDE_INT count) 5766 { 5767 machine_mode mode = TYPE_MODE (vectype); 5768 5769 /* If this is single-element interleaving with an element distance 5770 that leaves unused vector loads around punt - we at least create 5771 very sub-optimal code in that case (and blow up memory, 5772 see PR65518). */ 5773 if (single_element_p && maybe_gt (count, TYPE_VECTOR_SUBPARTS (vectype))) 5774 { 5775 if (dump_enabled_p ()) 5776 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 5777 "single-element interleaving not supported " 5778 "for not adjacent vector loads\n"); 5779 return false; 5780 } 5781 5782 /* vect_permute_load_chain requires the group size to be equal to 3 or 5783 be a power of two. */ 5784 if (count != 3 && exact_log2 (count) == -1) 5785 { 5786 if (dump_enabled_p ()) 5787 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 5788 "the size of the group of accesses" 5789 " is not a power of 2 or not equal to 3\n"); 5790 return false; 5791 } 5792 5793 /* Check that the permutation is supported. 
*/ 5794 if (VECTOR_MODE_P (mode)) 5795 { 5796 unsigned int i, j; 5797 if (count == 3) 5798 { 5799 unsigned int nelt; 5800 if (!GET_MODE_NUNITS (mode).is_constant (&nelt)) 5801 { 5802 if (dump_enabled_p ()) 5803 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 5804 "cannot handle groups of 3 loads for" 5805 " variable-length vectors\n"); 5806 return false; 5807 } 5808 5809 vec_perm_builder sel (nelt, nelt, 1); 5810 sel.quick_grow (nelt); 5811 vec_perm_indices indices; 5812 unsigned int k; 5813 for (k = 0; k < 3; k++) 5814 { 5815 for (i = 0; i < nelt; i++) 5816 if (3 * i + k < 2 * nelt) 5817 sel[i] = 3 * i + k; 5818 else 5819 sel[i] = 0; 5820 indices.new_vector (sel, 2, nelt); 5821 if (!can_vec_perm_const_p (mode, indices)) 5822 { 5823 if (dump_enabled_p ()) 5824 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 5825 "shuffle of 3 loads is not supported by" 5826 " target\n"); 5827 return false; 5828 } 5829 for (i = 0, j = 0; i < nelt; i++) 5830 if (3 * i + k < 2 * nelt) 5831 sel[i] = i; 5832 else 5833 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++); 5834 indices.new_vector (sel, 2, nelt); 5835 if (!can_vec_perm_const_p (mode, indices)) 5836 { 5837 if (dump_enabled_p ()) 5838 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 5839 "shuffle of 3 loads is not supported by" 5840 " target\n"); 5841 return false; 5842 } 5843 } 5844 return true; 5845 } 5846 else 5847 { 5848 /* If length is not equal to 3 then only power of 2 is supported. */ 5849 gcc_assert (pow2p_hwi (count)); 5850 poly_uint64 nelt = GET_MODE_NUNITS (mode); 5851 5852 /* The encoding has a single stepped pattern. */ 5853 vec_perm_builder sel (nelt, 1, 3); 5854 sel.quick_grow (3); 5855 for (i = 0; i < 3; i++) 5856 sel[i] = i * 2; 5857 vec_perm_indices indices (sel, 2, nelt); 5858 if (can_vec_perm_const_p (mode, indices)) 5859 { 5860 for (i = 0; i < 3; i++) 5861 sel[i] = i * 2 + 1; 5862 indices.new_vector (sel, 2, nelt); 5863 if (can_vec_perm_const_p (mode, indices)) 5864 return true; 5865 } 5866 } 5867 } 5868 5869 if (dump_enabled_p ()) 5870 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 5871 "extract even/odd not supported by target\n"); 5872 return false; 5873 } 5874 5875 /* Return TRUE if vec_{masked_}load_lanes is available for COUNT vectors of 5876 type VECTYPE. MASKED_P says whether the masked form is needed. */ 5877 5878 bool 5879 vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count, 5880 bool masked_p) 5881 { 5882 if (masked_p) 5883 return vect_lanes_optab_supported_p ("vec_mask_load_lanes", 5884 vec_mask_load_lanes_optab, 5885 vectype, count); 5886 else 5887 return vect_lanes_optab_supported_p ("vec_load_lanes", 5888 vec_load_lanes_optab, 5889 vectype, count); 5890 } 5891 5892 /* Function vect_permute_load_chain. 5893 5894 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be 5895 a power of 2 or equal to 3, generate extract_even/odd stmts to reorder 5896 the input data correctly. Return the final references for loads in 5897 RESULT_CHAIN. 5898 5899 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8. 5900 The input is 4 vectors each containing 8 elements. 
We assign a number to each 5901 element, the input sequence is: 5902 5903 1st vec: 0 1 2 3 4 5 6 7 5904 2nd vec: 8 9 10 11 12 13 14 15 5905 3rd vec: 16 17 18 19 20 21 22 23 5906 4th vec: 24 25 26 27 28 29 30 31 5907 5908 The output sequence should be: 5909 5910 1st vec: 0 4 8 12 16 20 24 28 5911 2nd vec: 1 5 9 13 17 21 25 29 5912 3rd vec: 2 6 10 14 18 22 26 30 5913 4th vec: 3 7 11 15 19 23 27 31 5914 5915 i.e., the first output vector should contain the first elements of each 5916 interleaving group, etc. 5917 5918 We use extract_even/odd instructions to create such output. The input of 5919 each extract_even/odd operation is two vectors 5920 1st vec 2nd vec 5921 0 1 2 3 4 5 6 7 5922 5923 and the output is the vector of extracted even/odd elements. The output of 5924 extract_even will be: 0 2 4 6 5925 and of extract_odd: 1 3 5 7 5926 5927 5928 The permutation is done in log LENGTH stages. In each stage extract_even 5929 and extract_odd stmts are created for each pair of vectors in DR_CHAIN in 5930 their order. In our example, 5931 5932 E1: extract_even (1st vec, 2nd vec) 5933 E2: extract_odd (1st vec, 2nd vec) 5934 E3: extract_even (3rd vec, 4th vec) 5935 E4: extract_odd (3rd vec, 4th vec) 5936 5937 The output for the first stage will be: 5938 5939 E1: 0 2 4 6 8 10 12 14 5940 E2: 1 3 5 7 9 11 13 15 5941 E3: 16 18 20 22 24 26 28 30 5942 E4: 17 19 21 23 25 27 29 31 5943 5944 In order to proceed and create the correct sequence for the next stage (or 5945 for the correct output, if the second stage is the last one, as in our 5946 example), we first put the output of extract_even operation and then the 5947 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN). 5948 The input for the second stage is: 5949 5950 1st vec (E1): 0 2 4 6 8 10 12 14 5951 2nd vec (E3): 16 18 20 22 24 26 28 30 5952 3rd vec (E2): 1 3 5 7 9 11 13 15 5953 4th vec (E4): 17 19 21 23 25 27 29 31 5954 5955 The output of the second stage: 5956 5957 E1: 0 4 8 12 16 20 24 28 5958 E2: 2 6 10 14 18 22 26 30 5959 E3: 1 5 9 13 17 21 25 29 5960 E4: 3 7 11 15 19 23 27 31 5961 5962 And RESULT_CHAIN after reordering: 5963 5964 1st vec (E1): 0 4 8 12 16 20 24 28 5965 2nd vec (E3): 1 5 9 13 17 21 25 29 5966 3rd vec (E2): 2 6 10 14 18 22 26 30 5967 4th vec (E4): 3 7 11 15 19 23 27 31. */ 5968 5969 static void 5970 vect_permute_load_chain (vec<tree> dr_chain, 5971 unsigned int length, 5972 gimple *stmt, 5973 gimple_stmt_iterator *gsi, 5974 vec<tree> *result_chain) 5975 { 5976 tree data_ref, first_vect, second_vect; 5977 tree perm_mask_even, perm_mask_odd; 5978 tree perm3_mask_low, perm3_mask_high; 5979 gimple *perm_stmt; 5980 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt)); 5981 unsigned int i, j, log_length = exact_log2 (length); 5982 5983 result_chain->quick_grow (length); 5984 memcpy (result_chain->address (), dr_chain.address (), 5985 length * sizeof (tree)); 5986 5987 if (length == 3) 5988 { 5989 /* vect_grouped_load_supported ensures that this is constant. 
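         (Groups of 3 loads are rejected there for variable-length vectors.)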
*/ 5990 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant (); 5991 unsigned int k; 5992 5993 vec_perm_builder sel (nelt, nelt, 1); 5994 sel.quick_grow (nelt); 5995 vec_perm_indices indices; 5996 for (k = 0; k < 3; k++) 5997 { 5998 for (i = 0; i < nelt; i++) 5999 if (3 * i + k < 2 * nelt) 6000 sel[i] = 3 * i + k; 6001 else 6002 sel[i] = 0; 6003 indices.new_vector (sel, 2, nelt); 6004 perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices); 6005 6006 for (i = 0, j = 0; i < nelt; i++) 6007 if (3 * i + k < 2 * nelt) 6008 sel[i] = i; 6009 else 6010 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++); 6011 indices.new_vector (sel, 2, nelt); 6012 perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices); 6013 6014 first_vect = dr_chain[0]; 6015 second_vect = dr_chain[1]; 6016 6017 /* Create interleaving stmt (low part of): 6018 low = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k, 6019 ...}> */ 6020 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low"); 6021 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect, 6022 second_vect, perm3_mask_low); 6023 vect_finish_stmt_generation (stmt, perm_stmt, gsi); 6024 6025 /* Create interleaving stmt (high part of): 6026 high = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k, 6027 ...}> */ 6028 first_vect = data_ref; 6029 second_vect = dr_chain[2]; 6030 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high"); 6031 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect, 6032 second_vect, perm3_mask_high); 6033 vect_finish_stmt_generation (stmt, perm_stmt, gsi); 6034 (*result_chain)[k] = data_ref; 6035 } 6036 } 6037 else 6038 { 6039 /* If length is not equal to 3 then only power of 2 is supported. */ 6040 gcc_assert (pow2p_hwi (length)); 6041 6042 /* The encoding has a single stepped pattern. */ 6043 poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype); 6044 vec_perm_builder sel (nelt, 1, 3); 6045 sel.quick_grow (3); 6046 for (i = 0; i < 3; ++i) 6047 sel[i] = i * 2; 6048 vec_perm_indices indices (sel, 2, nelt); 6049 perm_mask_even = vect_gen_perm_mask_checked (vectype, indices); 6050 6051 for (i = 0; i < 3; ++i) 6052 sel[i] = i * 2 + 1; 6053 indices.new_vector (sel, 2, nelt); 6054 perm_mask_odd = vect_gen_perm_mask_checked (vectype, indices); 6055 6056 for (i = 0; i < log_length; i++) 6057 { 6058 for (j = 0; j < length; j += 2) 6059 { 6060 first_vect = dr_chain[j]; 6061 second_vect = dr_chain[j+1]; 6062 6063 /* data_ref = permute_even (first_data_ref, second_data_ref); */ 6064 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even"); 6065 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, 6066 first_vect, second_vect, 6067 perm_mask_even); 6068 vect_finish_stmt_generation (stmt, perm_stmt, gsi); 6069 (*result_chain)[j/2] = data_ref; 6070 6071 /* data_ref = permute_odd (first_data_ref, second_data_ref); */ 6072 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd"); 6073 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, 6074 first_vect, second_vect, 6075 perm_mask_odd); 6076 vect_finish_stmt_generation (stmt, perm_stmt, gsi); 6077 (*result_chain)[j/2+length/2] = data_ref; 6078 } 6079 memcpy (dr_chain.address (), result_chain->address (), 6080 length * sizeof (tree)); 6081 } 6082 } 6083 } 6084 6085 /* Function vect_shift_permute_load_chain. 6086 6087 Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate 6088 sequence of stmts to reorder the input data accordingly. 6089 Return the final references for loads in RESULT_CHAIN. 
6090    Return true if successful, false otherwise.
6091 
6092    E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
6093    The input is 3 vectors each containing 8 elements. We assign a
6094    number to each element, the input sequence is:
6095 
6096    1st vec:   0  1  2  3  4  5  6  7
6097    2nd vec:   8  9 10 11 12 13 14 15
6098    3rd vec:  16 17 18 19 20 21 22 23
6099 
6100    The output sequence should be:
6101 
6102    1st vec:  0 3 6  9 12 15 18 21
6103    2nd vec:  1 4 7 10 13 16 19 22
6104    3rd vec:  2 5 8 11 14 17 20 23
6105 
6106    We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.
6107 
6108    First we shuffle all 3 vectors to get correct elements order:
6109 
6110    1st vec:  ( 0  3  6) ( 1  4  7) ( 2  5)
6111    2nd vec:  ( 8 11 14) ( 9 12 15) (10 13)
6112    3rd vec:  (16 19 22) (17 20 23) (18 21)
6113 
6114    Next we unite and shift vector 3 times:
6115 
6116    1st step:
6117      shift right by 6 the concatenation of:
6118      "1st vec" and "2nd vec"
6119        ( 0  3  6) ( 1  4  7) |( 2  5) _ ( 8 11 14) ( 9 12 15)| (10 13)
6120      "2nd vec" and "3rd vec"
6121        ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
6122      "3rd vec" and "1st vec"
6123        (16 19 22) (17 20 23) |(18 21) _ ( 0  3  6) ( 1  4  7)| ( 2  5)
6124                              | New vectors                   |
6125 
6126    So that now new vectors are:
6127 
6128    1st vec:  ( 2  5) ( 8 11 14) ( 9 12 15)
6129    2nd vec:  (10 13) (16 19 22) (17 20 23)
6130    3rd vec:  (18 21) ( 0  3  6) ( 1  4  7)
6131 
6132    2nd step:
6133      shift right by 5 the concatenation of:
6134      "1st vec" and "3rd vec"
6135        ( 2  5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0  3  6)| ( 1  4  7)
6136      "2nd vec" and "1st vec"
6137        (10 13) (16 19 22) |(17 20 23) _ ( 2  5) ( 8 11 14)| ( 9 12 15)
6138      "3rd vec" and "2nd vec"
6139        (18 21) ( 0  3  6) |( 1  4  7) _ (10 13) (16 19 22)| (17 20 23)
6140                           | New vectors                    |
6141 
6142    So that now new vectors are:
6143 
6144    1st vec:  ( 9 12 15) (18 21) ( 0  3  6)
6145    2nd vec:  (17 20 23) ( 2  5) ( 8 11 14)
6146    3rd vec:  ( 1  4  7) (10 13) (16 19 22) READY
6147 
6148    3rd step:
6149      shift right by 5 the concatenation of:
6150      "1st vec" and "1st vec"
6151        ( 9 12 15) (18 21) |( 0  3  6) _ ( 9 12 15) (18 21)| ( 0  3  6)
6152      shift right by 3 the concatenation of:
6153      "2nd vec" and "2nd vec"
6154                (17 20 23) |( 2  5) ( 8 11 14) _ (17 20 23)| ( 2  5) ( 8 11 14)
6155                           | New vectors                   |
6156 
6157    So that now all vectors are READY:
6158    1st vec:  ( 0  3  6) ( 9 12 15) (18 21)
6159    2nd vec:  ( 2  5) ( 8 11 14) (17 20 23)
6160    3rd vec:  ( 1  4  7) (10 13) (16 19 22)
6161 
6162    This algorithm is faster than the one in vect_permute_load_chain if:
6163    1. "shift of a concatenation" is faster than general permutation.
6164       This is usually so.
6165    2. The TARGET machine can't execute vector instructions in parallel.
6166       This is because each step of the algorithm depends on the previous one.
6167       The algorithm in vect_permute_load_chain is much more parallel.
6168 
6169    The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.
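   In addition, the code below requires the number of vector elements and
   VF to be compile-time constants, with VF > 4 for the power-of-two case
   and VF > 2 for the length-3 case.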
6170 */ 6171 6172 static bool 6173 vect_shift_permute_load_chain (vec<tree> dr_chain, 6174 unsigned int length, 6175 gimple *stmt, 6176 gimple_stmt_iterator *gsi, 6177 vec<tree> *result_chain) 6178 { 6179 tree vect[3], vect_shift[3], data_ref, first_vect, second_vect; 6180 tree perm2_mask1, perm2_mask2, perm3_mask; 6181 tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask; 6182 gimple *perm_stmt; 6183 6184 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt)); 6185 unsigned int i; 6186 stmt_vec_info stmt_info = vinfo_for_stmt (stmt); 6187 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); 6188 6189 unsigned HOST_WIDE_INT nelt, vf; 6190 if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nelt) 6191 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&vf)) 6192 /* Not supported for variable-length vectors. */ 6193 return false; 6194 6195 vec_perm_builder sel (nelt, nelt, 1); 6196 sel.quick_grow (nelt); 6197 6198 result_chain->quick_grow (length); 6199 memcpy (result_chain->address (), dr_chain.address (), 6200 length * sizeof (tree)); 6201 6202 if (pow2p_hwi (length) && vf > 4) 6203 { 6204 unsigned int j, log_length = exact_log2 (length); 6205 for (i = 0; i < nelt / 2; ++i) 6206 sel[i] = i * 2; 6207 for (i = 0; i < nelt / 2; ++i) 6208 sel[nelt / 2 + i] = i * 2 + 1; 6209 vec_perm_indices indices (sel, 2, nelt); 6210 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices)) 6211 { 6212 if (dump_enabled_p ()) 6213 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 6214 "shuffle of 2 fields structure is not \ 6215 supported by target\n"); 6216 return false; 6217 } 6218 perm2_mask1 = vect_gen_perm_mask_checked (vectype, indices); 6219 6220 for (i = 0; i < nelt / 2; ++i) 6221 sel[i] = i * 2 + 1; 6222 for (i = 0; i < nelt / 2; ++i) 6223 sel[nelt / 2 + i] = i * 2; 6224 indices.new_vector (sel, 2, nelt); 6225 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices)) 6226 { 6227 if (dump_enabled_p ()) 6228 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 6229 "shuffle of 2 fields structure is not \ 6230 supported by target\n"); 6231 return false; 6232 } 6233 perm2_mask2 = vect_gen_perm_mask_checked (vectype, indices); 6234 6235 /* Generating permutation constant to shift all elements. 6236 For vector length 8 it is {4 5 6 7 8 9 10 11}. */ 6237 for (i = 0; i < nelt; i++) 6238 sel[i] = nelt / 2 + i; 6239 indices.new_vector (sel, 2, nelt); 6240 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices)) 6241 { 6242 if (dump_enabled_p ()) 6243 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 6244 "shift permutation is not supported by target\n"); 6245 return false; 6246 } 6247 shift1_mask = vect_gen_perm_mask_checked (vectype, indices); 6248 6249 /* Generating permutation constant to select vector from 2. 6250 For vector length 8 it is {0 1 2 3 12 13 14 15}. 
*/
6251       for (i = 0; i < nelt / 2; i++)
6252         sel[i] = i;
6253       for (i = nelt / 2; i < nelt; i++)
6254         sel[i] = nelt + i;
6255       indices.new_vector (sel, 2, nelt);
6256       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6257         {
6258           if (dump_enabled_p ())
6259             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6260                              "select is not supported by target\n");
6261           return false;
6262         }
6263       select_mask = vect_gen_perm_mask_checked (vectype, indices);
6264 
6265       for (i = 0; i < log_length; i++)
6266         {
6267           for (j = 0; j < length; j += 2)
6268             {
6269               first_vect = dr_chain[j];
6270               second_vect = dr_chain[j + 1];
6271 
6272               data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
6273               perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6274                                                first_vect, first_vect,
6275                                                perm2_mask1);
6276               vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6277               vect[0] = data_ref;
6278 
6279               data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
6280               perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6281                                                second_vect, second_vect,
6282                                                perm2_mask2);
6283               vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6284               vect[1] = data_ref;
6285 
6286               data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift");
6287               perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6288                                                vect[0], vect[1], shift1_mask);
6289               vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6290               (*result_chain)[j/2 + length/2] = data_ref;
6291 
6292               data_ref = make_temp_ssa_name (vectype, NULL, "vect_select");
6293               perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6294                                                vect[0], vect[1], select_mask);
6295               vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6296               (*result_chain)[j/2] = data_ref;
6297             }
6298           memcpy (dr_chain.address (), result_chain->address (),
6299                   length * sizeof (tree));
6300         }
6301       return true;
6302     }
6303   if (length == 3 && vf > 2)
6304     {
6305       unsigned int k = 0, l = 0;
6306 
6307       /* Generating permutation constant to get all elements in right order.
6308          For vector length 8 it is {0 3 6 1 4 7 2 5}. */
6309       for (i = 0; i < nelt; i++)
6310         {
6311           if (3 * k + (l % 3) >= nelt)
6312             {
6313               k = 0;
6314               l += (3 - (nelt % 3));
6315             }
6316           sel[i] = 3 * k + (l % 3);
6317           k++;
6318         }
6319       vec_perm_indices indices (sel, 2, nelt);
6320       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6321         {
6322           if (dump_enabled_p ())
6323             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6324                              "shuffle of 3 fields structure is not \
6325                              supported by target\n");
6326           return false;
6327         }
6328       perm3_mask = vect_gen_perm_mask_checked (vectype, indices);
6329 
6330       /* Generating permutation constant to shift all elements.
6331          For vector length 8 it is {6 7 8 9 10 11 12 13}. */
6332       for (i = 0; i < nelt; i++)
6333         sel[i] = 2 * (nelt / 3) + (nelt % 3) + i;
6334       indices.new_vector (sel, 2, nelt);
6335       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6336         {
6337           if (dump_enabled_p ())
6338             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6339                              "shift permutation is not supported by target\n");
6340           return false;
6341         }
6342       shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
6343 
6344       /* Generating permutation constant to shift all elements.
6345          For vector length 8 it is {5 6 7 8 9 10 11 12}.
*/ 6346 for (i = 0; i < nelt; i++) 6347 sel[i] = 2 * (nelt / 3) + 1 + i; 6348 indices.new_vector (sel, 2, nelt); 6349 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices)) 6350 { 6351 if (dump_enabled_p ()) 6352 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 6353 "shift permutation is not supported by target\n"); 6354 return false; 6355 } 6356 shift2_mask = vect_gen_perm_mask_checked (vectype, indices); 6357 6358 /* Generating permutation constant to shift all elements. 6359 For vector length 8 it is {3 4 5 6 7 8 9 10}. */ 6360 for (i = 0; i < nelt; i++) 6361 sel[i] = (nelt / 3) + (nelt % 3) / 2 + i; 6362 indices.new_vector (sel, 2, nelt); 6363 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices)) 6364 { 6365 if (dump_enabled_p ()) 6366 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 6367 "shift permutation is not supported by target\n"); 6368 return false; 6369 } 6370 shift3_mask = vect_gen_perm_mask_checked (vectype, indices); 6371 6372 /* Generating permutation constant to shift all elements. 6373 For vector length 8 it is {5 6 7 8 9 10 11 12}. */ 6374 for (i = 0; i < nelt; i++) 6375 sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i; 6376 indices.new_vector (sel, 2, nelt); 6377 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices)) 6378 { 6379 if (dump_enabled_p ()) 6380 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 6381 "shift permutation is not supported by target\n"); 6382 return false; 6383 } 6384 shift4_mask = vect_gen_perm_mask_checked (vectype, indices); 6385 6386 for (k = 0; k < 3; k++) 6387 { 6388 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3"); 6389 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, 6390 dr_chain[k], dr_chain[k], 6391 perm3_mask); 6392 vect_finish_stmt_generation (stmt, perm_stmt, gsi); 6393 vect[k] = data_ref; 6394 } 6395 6396 for (k = 0; k < 3; k++) 6397 { 6398 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1"); 6399 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, 6400 vect[k % 3], vect[(k + 1) % 3], 6401 shift1_mask); 6402 vect_finish_stmt_generation (stmt, perm_stmt, gsi); 6403 vect_shift[k] = data_ref; 6404 } 6405 6406 for (k = 0; k < 3; k++) 6407 { 6408 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2"); 6409 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, 6410 vect_shift[(4 - k) % 3], 6411 vect_shift[(3 - k) % 3], 6412 shift2_mask); 6413 vect_finish_stmt_generation (stmt, perm_stmt, gsi); 6414 vect[k] = data_ref; 6415 } 6416 6417 (*result_chain)[3 - (nelt % 3)] = vect[2]; 6418 6419 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3"); 6420 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[0], 6421 vect[0], shift3_mask); 6422 vect_finish_stmt_generation (stmt, perm_stmt, gsi); 6423 (*result_chain)[nelt % 3] = data_ref; 6424 6425 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4"); 6426 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[1], 6427 vect[1], shift4_mask); 6428 vect_finish_stmt_generation (stmt, perm_stmt, gsi); 6429 (*result_chain)[0] = data_ref; 6430 return true; 6431 } 6432 return false; 6433 } 6434 6435 /* Function vect_transform_grouped_load. 6436 6437 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements 6438 to perform their permutation and ascribe the result vectorized statements to 6439 the scalar statements. 
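   The permutation itself is done either by vect_permute_load_chain or, where
   the target is expected to benefit from it, by
   vect_shift_permute_load_chain.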
6440 */
6441 
6442 void
6443 vect_transform_grouped_load (gimple *stmt, vec<tree> dr_chain, int size,
6444                              gimple_stmt_iterator *gsi)
6445 {
6446   machine_mode mode;
6447   vec<tree> result_chain = vNULL;
6448 
6449   /* DR_CHAIN contains input data-refs that are a part of the interleaving.
6450      RESULT_CHAIN is the output of vect_permute_load_chain; it contains permuted
6451      vectors that are ready for vector computation. */
6452   result_chain.create (size);
6453 
6454   /* If the reassociation width for the vector type is 2 or greater, the target
6455      machine can execute 2 or more vector instructions in parallel. Otherwise
6456      try to get the chain for the load group using vect_shift_permute_load_chain. */
6457   mode = TYPE_MODE (STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt)));
6458   if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1
6459       || pow2p_hwi (size)
6460       || !vect_shift_permute_load_chain (dr_chain, size, stmt,
6461                                          gsi, &result_chain))
6462     vect_permute_load_chain (dr_chain, size, stmt, gsi, &result_chain);
6463   vect_record_grouped_load_vectors (stmt, result_chain);
6464   result_chain.release ();
6465 }
6466 
6467 /* RESULT_CHAIN contains the output of a group of grouped loads that were
6468    generated as part of the vectorization of STMT. Assign the statement
6469    for each vector to the associated scalar statement. */
6470 
6471 void
6472 vect_record_grouped_load_vectors (gimple *stmt, vec<tree> result_chain)
6473 {
6474   gimple *first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt));
6475   gimple *next_stmt, *new_stmt;
6476   unsigned int i, gap_count;
6477   tree tmp_data_ref;
6478 
6479   /* Put a permuted data-ref in the VECTORIZED_STMT field.
6480      Since we scan the chain starting from its first node, their order
6481      corresponds to the order of data-refs in RESULT_CHAIN. */
6482   next_stmt = first_stmt;
6483   gap_count = 1;
6484   FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
6485     {
6486       if (!next_stmt)
6487         break;
6488 
6489       /* Skip the gaps. Loads created for the gaps will be removed by dead
6490          code elimination pass later. No need to check for the first stmt in
6491          the group, since it always exists.
6492          GROUP_GAP is the number of steps in elements from the previous
6493          access (if there is no gap GROUP_GAP is 1). We skip loads that
6494          correspond to the gaps. */
6495       if (next_stmt != first_stmt
6496           && gap_count < GROUP_GAP (vinfo_for_stmt (next_stmt)))
6497         {
6498           gap_count++;
6499           continue;
6500         }
6501 
6502       while (next_stmt)
6503         {
6504           new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
6505           /* We assume that if VEC_STMT is not NULL, this is a case of multiple
6506              copies, and we put the new vector statement in the first available
6507              RELATED_STMT.
*/
6508           if (!STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)))
6509             STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)) = new_stmt;
6510           else
6511             {
6512               if (!GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
6513                 {
6514                   gimple *prev_stmt =
6515                     STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt));
6516                   gimple *rel_stmt =
6517                     STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt));
6518                   while (rel_stmt)
6519                     {
6520                       prev_stmt = rel_stmt;
6521                       rel_stmt =
6522                         STMT_VINFO_RELATED_STMT (vinfo_for_stmt (rel_stmt));
6523                     }
6524 
6525                   STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt)) =
6526                     new_stmt;
6527                 }
6528             }
6529 
6530           next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
6531           gap_count = 1;
6532           /* If NEXT_STMT accesses the same DR as the previous statement,
6533              put the same TMP_DATA_REF as its vectorized statement; otherwise
6534              get the next data-ref from RESULT_CHAIN. */
6535           if (!next_stmt || !GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
6536             break;
6537         }
6538     }
6539 }
6540 
6541 /* Function vect_can_force_dr_alignment_p.
6542 
6543    Returns whether the alignment of a DECL can be forced to be aligned
6544    on ALIGNMENT bit boundary. */
6545 
6546 bool
6547 vect_can_force_dr_alignment_p (const_tree decl, unsigned int alignment)
6548 {
6549   if (!VAR_P (decl))
6550     return false;
6551 
6552   if (decl_in_symtab_p (decl)
6553       && !symtab_node::get (decl)->can_increase_alignment_p ())
6554     return false;
6555 
6556   if (TREE_STATIC (decl))
6557     return (alignment <= MAX_OFILE_ALIGNMENT);
6558   else
6559     return (alignment <= MAX_STACK_ALIGNMENT);
6560 }
6561 
6562 
6563 /* Return whether the data reference DR is supported with respect to its
6564    alignment.
6565    If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even
6566    if it is aligned, i.e., check if it is possible to vectorize it with
6567    different alignment. */
6568 
6569 enum dr_alignment_support
6570 vect_supportable_dr_alignment (struct data_reference *dr,
6571                                bool check_aligned_accesses)
6572 {
6573   gimple *stmt = DR_STMT (dr);
6574   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
6575   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6576   machine_mode mode = TYPE_MODE (vectype);
6577   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6578   struct loop *vect_loop = NULL;
6579   bool nested_in_vect_loop = false;
6580 
6581   if (aligned_access_p (dr) && !check_aligned_accesses)
6582     return dr_aligned;
6583 
6584   /* For now assume all conditional loads/stores support unaligned
6585      access without any special code. */
6586   if (is_gimple_call (stmt)
6587       && gimple_call_internal_p (stmt)
6588       && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
6589           || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
6590     return dr_unaligned_supported;
6591 
6592   if (loop_vinfo)
6593     {
6594       vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
6595       nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt);
6596     }
6597 
6598   /* Possibly unaligned access. */
6599 
6600   /* We can choose between using the implicit realignment scheme (generating
6601      a misaligned_move stmt) and the explicit realignment scheme (generating
6602      aligned loads with a REALIGN_LOAD). There are two variants to the
6603      explicit realignment scheme: optimized, and unoptimized.
6604      We can optimize the realignment only if the step between consecutive
6605      vector loads is equal to the vector size. Since the vector memory
6606      accesses advance in steps of VS (Vector Size) in the vectorized loop, it
6607      is guaranteed that the misalignment amount remains the same throughout the
6608      execution of the vectorized loop. Therefore, we can create the
6609      "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
6610      at the loop preheader.
6611 
6612      However, in the case of outer-loop vectorization, when vectorizing a
6613      memory access in the inner-loop nested within the LOOP that is now being
6614      vectorized, while it is guaranteed that the misalignment of the
6615      vectorized memory access will remain the same in different outer-loop
6616      iterations, it is *not* guaranteed that it will remain the same throughout
6617      the execution of the inner-loop. This is because the inner-loop advances
6618      with the original scalar step (and not in steps of VS). If the inner-loop
6619      step happens to be a multiple of VS, then the misalignment remains fixed
6620      and we can use the optimized realignment scheme. For example:
6621 
6622       for (i=0; i<N; i++)
6623         for (j=0; j<M; j++)
6624           s += a[i+j];
6625 
6626      When vectorizing the i-loop in the above example, the step between
6627      consecutive vector loads is 1, and so the misalignment does not remain
6628      fixed across the execution of the inner-loop, and the realignment cannot
6629      be optimized (as illustrated in the following pseudo vectorized loop):
6630 
6631       for (i=0; i<N; i+=4)
6632         for (j=0; j<M; j++){
6633           vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
6634                          // when j is {0,1,2,3,4,5,6,7,...} respectively.
6635                          // (assuming that we start from an aligned address).
6636           }
6637 
6638      We therefore have to use the unoptimized realignment scheme:
6639 
6640       for (i=0; i<N; i+=4)
6641         for (j=k; j<M; j+=4)
6642           vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
6643                          // that the misalignment of the initial address is
6644                          // 0).
6645 
6646      The loop can then be vectorized as follows:
6647 
6648       for (k=0; k<4; k++){
6649         rt = get_realignment_token (&vp[k]);
6650         for (i=0; i<N; i+=4){
6651           v1 = vp[i+k];
6652           for (j=k; j<M; j+=4){
6653             v2 = vp[i+j+VS-1];
6654             va = REALIGN_LOAD <v1,v2,rt>;
6655             vs += va;
6656             v1 = v2;
6657           }
6658         }
6659       } */
6660 
6661   if (DR_IS_READ (dr))
6662     {
6663       bool is_packed = false;
6664       tree type = (TREE_TYPE (DR_REF (dr)));
6665 
6666       if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
6667           && (!targetm.vectorize.builtin_mask_for_load
6668               || targetm.vectorize.builtin_mask_for_load ()))
6669         {
6670           tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6671 
6672           /* If we are doing SLP then the accesses need not have the
6673              same alignment, instead it depends on the SLP group size. */
6674           if (loop_vinfo
6675               && STMT_SLP_TYPE (stmt_info)
6676               && !multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
6677                               * GROUP_SIZE (vinfo_for_stmt
6678                                             (GROUP_FIRST_ELEMENT (stmt_info))),
6679                               TYPE_VECTOR_SUBPARTS (vectype)))
6680             ;
6681           else if (!loop_vinfo
6682                    || (nested_in_vect_loop
6683                        && maybe_ne (TREE_INT_CST_LOW (DR_STEP (dr)),
6684                                     GET_MODE_SIZE (TYPE_MODE (vectype)))))
6685             return dr_explicit_realign;
6686           else
6687             return dr_explicit_realign_optimized;
6688         }
6689       if (!known_alignment_for_access_p (dr))
6690         is_packed = not_size_aligned (DR_REF (dr));
6691 
6692       if (targetm.vectorize.support_vector_misalignment
6693           (mode, type, DR_MISALIGNMENT (dr), is_packed))
6694         /* Can't software pipeline the loads, but can at least do them.
*/ 6695 return dr_unaligned_supported; 6696 } 6697 else 6698 { 6699 bool is_packed = false; 6700 tree type = (TREE_TYPE (DR_REF (dr))); 6701 6702 if (!known_alignment_for_access_p (dr)) 6703 is_packed = not_size_aligned (DR_REF (dr)); 6704 6705 if (targetm.vectorize.support_vector_misalignment 6706 (mode, type, DR_MISALIGNMENT (dr), is_packed)) 6707 return dr_unaligned_supported; 6708 } 6709 6710 /* Unsupported. */ 6711 return dr_unaligned_unsupported; 6712 } 6713