1 /* Data References Analysis and Manipulation Utilities for Vectorization. 2 Copyright (C) 2003-2018 Free Software Foundation, Inc. 3 Contributed by Dorit Naishlos <dorit@il.ibm.com> 4 and Ira Rosen <irar@il.ibm.com> 5 6 This file is part of GCC. 7 8 GCC is free software; you can redistribute it and/or modify it under 9 the terms of the GNU General Public License as published by the Free 10 Software Foundation; either version 3, or (at your option) any later 11 version. 12 13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY 14 WARRANTY; without even the implied warranty of MERCHANTABILITY or 15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 16 for more details. 17 18 You should have received a copy of the GNU General Public License 19 along with GCC; see the file COPYING3. If not see 20 <http://www.gnu.org/licenses/>. */ 21 22 #include "config.h" 23 #include "system.h" 24 #include "coretypes.h" 25 #include "backend.h" 26 #include "target.h" 27 #include "rtl.h" 28 #include "tree.h" 29 #include "gimple.h" 30 #include "predict.h" 31 #include "memmodel.h" 32 #include "tm_p.h" 33 #include "ssa.h" 34 #include "optabs-tree.h" 35 #include "cgraph.h" 36 #include "dumpfile.h" 37 #include "alias.h" 38 #include "fold-const.h" 39 #include "stor-layout.h" 40 #include "tree-eh.h" 41 #include "gimplify.h" 42 #include "gimple-iterator.h" 43 #include "gimplify-me.h" 44 #include "tree-ssa-loop-ivopts.h" 45 #include "tree-ssa-loop-manip.h" 46 #include "tree-ssa-loop.h" 47 #include "cfgloop.h" 48 #include "tree-scalar-evolution.h" 49 #include "tree-vectorizer.h" 50 #include "expr.h" 51 #include "builtins.h" 52 #include "params.h" 53 #include "tree-cfg.h" 54 #include "tree-hash-traits.h" 55 #include "vec-perm-indices.h" 56 #include "internal-fn.h" 57 58 /* Return true if load- or store-lanes optab OPTAB is implemented for 59 COUNT vectors of type VECTYPE. NAME is the name of OPTAB. */ 60 61 static bool 62 vect_lanes_optab_supported_p (const char *name, convert_optab optab, 63 tree vectype, unsigned HOST_WIDE_INT count) 64 { 65 machine_mode mode, array_mode; 66 bool limit_p; 67 68 mode = TYPE_MODE (vectype); 69 if (!targetm.array_mode (mode, count).exists (&array_mode)) 70 { 71 poly_uint64 bits = count * GET_MODE_BITSIZE (mode); 72 limit_p = !targetm.array_mode_supported_p (mode, count); 73 if (!int_mode_for_size (bits, limit_p).exists (&array_mode)) 74 { 75 if (dump_enabled_p ()) 76 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 77 "no array mode for %s[" 78 HOST_WIDE_INT_PRINT_DEC "]\n", 79 GET_MODE_NAME (mode), count); 80 return false; 81 } 82 } 83 84 if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing) 85 { 86 if (dump_enabled_p ()) 87 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 88 "cannot use %s<%s><%s>\n", name, 89 GET_MODE_NAME (array_mode), GET_MODE_NAME (mode)); 90 return false; 91 } 92 93 if (dump_enabled_p ()) 94 dump_printf_loc (MSG_NOTE, vect_location, 95 "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode), 96 GET_MODE_NAME (mode)); 97 98 return true; 99 } 100 101 102 /* Return the smallest scalar part of STMT. 103 This is used to determine the vectype of the stmt. We generally set the 104 vectype according to the type of the result (lhs). For stmts whose 105 result-type is different than the type of the arguments (e.g., demotion, 106 promotion), vectype will be reset appropriately (later). Note that we have 107 to visit the smallest datatype in this function, because that determines the 108 VF. 
If the smallest datatype in the loop is present only as the rhs of a 109 promotion operation - we'd miss it. 110 Such a case, where a variable of this datatype does not appear in the lhs 111 anywhere in the loop, can only occur if it's an invariant: e.g.: 112 'int_x = (int) short_inv', which we'd expect to have been optimized away by 113 invariant motion. However, we cannot rely on invariant motion to always 114 take invariants out of the loop, and so in the case of promotion we also 115 have to check the rhs. 116 LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding 117 types. */ 118 119 tree 120 vect_get_smallest_scalar_type (gimple *stmt, HOST_WIDE_INT *lhs_size_unit, 121 HOST_WIDE_INT *rhs_size_unit) 122 { 123 tree scalar_type = gimple_expr_type (stmt); 124 HOST_WIDE_INT lhs, rhs; 125 126 /* During the analysis phase, this function is called on arbitrary 127 statements that might not have scalar results. */ 128 if (!tree_fits_uhwi_p (TYPE_SIZE_UNIT (scalar_type))) 129 return scalar_type; 130 131 lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type)); 132 133 if (is_gimple_assign (stmt) 134 && (gimple_assign_cast_p (stmt) 135 || gimple_assign_rhs_code (stmt) == DOT_PROD_EXPR 136 || gimple_assign_rhs_code (stmt) == WIDEN_SUM_EXPR 137 || gimple_assign_rhs_code (stmt) == WIDEN_MULT_EXPR 138 || gimple_assign_rhs_code (stmt) == WIDEN_LSHIFT_EXPR 139 || gimple_assign_rhs_code (stmt) == FLOAT_EXPR)) 140 { 141 tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (stmt)); 142 143 rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type)); 144 if (rhs < lhs) 145 scalar_type = rhs_type; 146 } 147 148 *lhs_size_unit = lhs; 149 *rhs_size_unit = rhs; 150 return scalar_type; 151 } 152 153 154 /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be 155 tested at run-time. Return TRUE if DDR was successfully inserted. 156 Return false if versioning is not supported. */ 157 158 static bool 159 vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo) 160 { 161 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); 162 163 if ((unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS) == 0) 164 return false; 165 166 if (!runtime_alias_check_p (ddr, loop, 167 optimize_loop_nest_for_speed_p (loop))) 168 return false; 169 170 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr); 171 return true; 172 } 173 174 /* Record that loop LOOP_VINFO needs to check that VALUE is nonzero. */ 175 176 static void 177 vect_check_nonzero_value (loop_vec_info loop_vinfo, tree value) 178 { 179 vec<tree> checks = LOOP_VINFO_CHECK_NONZERO (loop_vinfo); 180 for (unsigned int i = 0; i < checks.length(); ++i) 181 if (checks[i] == value) 182 return; 183 184 if (dump_enabled_p ()) 185 { 186 dump_printf_loc (MSG_NOTE, vect_location, "need run-time check that "); 187 dump_generic_expr (MSG_NOTE, TDF_SLIM, value); 188 dump_printf (MSG_NOTE, " is nonzero\n"); 189 } 190 LOOP_VINFO_CHECK_NONZERO (loop_vinfo).safe_push (value); 191 } 192 193 /* Return true if we know that the order of vectorized STMT_A and 194 vectorized STMT_B will be the same as the order of STMT_A and STMT_B. 195 At least one of the statements is a write. */ 196 197 static bool 198 vect_preserves_scalar_order_p (gimple *stmt_a, gimple *stmt_b) 199 { 200 stmt_vec_info stmtinfo_a = vinfo_for_stmt (stmt_a); 201 stmt_vec_info stmtinfo_b = vinfo_for_stmt (stmt_b); 202 203 /* Single statements are always kept in their original order. 
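   Grouped accesses, by contrast, are emitted at the position of the last
   scalar access in their group, so that position has to be computed below
   before the orders can be compared.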
*/ 204 if (!STMT_VINFO_GROUPED_ACCESS (stmtinfo_a) 205 && !STMT_VINFO_GROUPED_ACCESS (stmtinfo_b)) 206 return true; 207 208 /* STMT_A and STMT_B belong to overlapping groups. All loads in a 209 group are emitted at the position of the last scalar load and all 210 stores in a group are emitted at the position of the last scalar store. 211 Compute that position and check whether the resulting order matches 212 the current one. */ 213 gimple *last_a = GROUP_FIRST_ELEMENT (stmtinfo_a); 214 if (last_a) 215 for (gimple *s = GROUP_NEXT_ELEMENT (vinfo_for_stmt (last_a)); s; 216 s = GROUP_NEXT_ELEMENT (vinfo_for_stmt (s))) 217 last_a = get_later_stmt (last_a, s); 218 else 219 last_a = stmt_a; 220 gimple *last_b = GROUP_FIRST_ELEMENT (stmtinfo_b); 221 if (last_b) 222 for (gimple *s = GROUP_NEXT_ELEMENT (vinfo_for_stmt (last_b)); s; 223 s = GROUP_NEXT_ELEMENT (vinfo_for_stmt (s))) 224 last_b = get_later_stmt (last_b, s); 225 else 226 last_b = stmt_b; 227 return ((get_later_stmt (last_a, last_b) == last_a) 228 == (get_later_stmt (stmt_a, stmt_b) == stmt_a)); 229 } 230 231 /* A subroutine of vect_analyze_data_ref_dependence. Handle 232 DDR_COULD_BE_INDEPENDENT_P ddr DDR that has a known set of dependence 233 distances. These distances are conservatively correct but they don't 234 reflect a guaranteed dependence. 235 236 Return true if this function does all the work necessary to avoid 237 an alias or false if the caller should use the dependence distances 238 to limit the vectorization factor in the usual way. LOOP_DEPTH is 239 the depth of the loop described by LOOP_VINFO and the other arguments 240 are as for vect_analyze_data_ref_dependence. */ 241 242 static bool 243 vect_analyze_possibly_independent_ddr (data_dependence_relation *ddr, 244 loop_vec_info loop_vinfo, 245 int loop_depth, unsigned int *max_vf) 246 { 247 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); 248 lambda_vector dist_v; 249 unsigned int i; 250 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v) 251 { 252 int dist = dist_v[loop_depth]; 253 if (dist != 0 && !(dist > 0 && DDR_REVERSED_P (ddr))) 254 { 255 /* If the user asserted safelen >= DIST consecutive iterations 256 can be executed concurrently, assume independence. 257 258 ??? An alternative would be to add the alias check even 259 in this case, and vectorize the fallback loop with the 260 maximum VF set to safelen. However, if the user has 261 explicitly given a length, it's less likely that that 262 would be a win. */ 263 if (loop->safelen >= 2 && abs_hwi (dist) <= loop->safelen) 264 { 265 if ((unsigned int) loop->safelen < *max_vf) 266 *max_vf = loop->safelen; 267 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false; 268 continue; 269 } 270 271 /* For dependence distances of 2 or more, we have the option 272 of limiting VF or checking for an alias at runtime. 273 Prefer to check at runtime if we can, to avoid limiting 274 the VF unnecessarily when the bases are in fact independent. 275 276 Note that the alias checks will be removed if the VF ends up 277 being small enough. */ 278 return vect_mark_for_runtime_alias_test (ddr, loop_vinfo); 279 } 280 } 281 return true; 282 } 283 284 285 /* Function vect_analyze_data_ref_dependence. 286 287 Return TRUE if there (might) exist a dependence between a memory-reference 288 DRA and a memory-reference DRB. When versioning for alias may check a 289 dependence at run-time, return FALSE. Adjust *MAX_VF according to 290 the data dependence. 
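   For example, a known dependence distance of 2 in the loop being
   vectorized limits *MAX_VF to 2, because a larger vectorization factor
   would combine dependent scalar iterations into a single vector
   iteration.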
*/ 291 292 static bool 293 vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr, 294 loop_vec_info loop_vinfo, 295 unsigned int *max_vf) 296 { 297 unsigned int i; 298 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); 299 struct data_reference *dra = DDR_A (ddr); 300 struct data_reference *drb = DDR_B (ddr); 301 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra)); 302 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb)); 303 lambda_vector dist_v; 304 unsigned int loop_depth; 305 306 /* In loop analysis all data references should be vectorizable. */ 307 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a) 308 || !STMT_VINFO_VECTORIZABLE (stmtinfo_b)) 309 gcc_unreachable (); 310 311 /* Independent data accesses. */ 312 if (DDR_ARE_DEPENDENT (ddr) == chrec_known) 313 return false; 314 315 if (dra == drb 316 || (DR_IS_READ (dra) && DR_IS_READ (drb))) 317 return false; 318 319 /* We do not have to consider dependences between accesses that belong 320 to the same group, unless the stride could be smaller than the 321 group size. */ 322 if (GROUP_FIRST_ELEMENT (stmtinfo_a) 323 && GROUP_FIRST_ELEMENT (stmtinfo_a) == GROUP_FIRST_ELEMENT (stmtinfo_b) 324 && !STMT_VINFO_STRIDED_P (stmtinfo_a)) 325 return false; 326 327 /* Even if we have an anti-dependence then, as the vectorized loop covers at 328 least two scalar iterations, there is always also a true dependence. 329 As the vectorizer does not re-order loads and stores we can ignore 330 the anti-dependence if TBAA can disambiguate both DRs similar to the 331 case with known negative distance anti-dependences (positive 332 distance anti-dependences would violate TBAA constraints). */ 333 if (((DR_IS_READ (dra) && DR_IS_WRITE (drb)) 334 || (DR_IS_WRITE (dra) && DR_IS_READ (drb))) 335 && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)), 336 get_alias_set (DR_REF (drb)))) 337 return false; 338 339 /* Unknown data dependence. */ 340 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know) 341 { 342 /* If user asserted safelen consecutive iterations can be 343 executed concurrently, assume independence. */ 344 if (loop->safelen >= 2) 345 { 346 if ((unsigned int) loop->safelen < *max_vf) 347 *max_vf = loop->safelen; 348 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false; 349 return false; 350 } 351 352 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a) 353 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b)) 354 { 355 if (dump_enabled_p ()) 356 { 357 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 358 "versioning for alias not supported for: " 359 "can't determine dependence between "); 360 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, 361 DR_REF (dra)); 362 dump_printf (MSG_MISSED_OPTIMIZATION, " and "); 363 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, 364 DR_REF (drb)); 365 dump_printf (MSG_MISSED_OPTIMIZATION, "\n"); 366 } 367 return true; 368 } 369 370 if (dump_enabled_p ()) 371 { 372 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 373 "versioning for alias required: " 374 "can't determine dependence between "); 375 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, 376 DR_REF (dra)); 377 dump_printf (MSG_MISSED_OPTIMIZATION, " and "); 378 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, 379 DR_REF (drb)); 380 dump_printf (MSG_MISSED_OPTIMIZATION, "\n"); 381 } 382 383 /* Add to list of ddrs that need to be tested at run-time. */ 384 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo); 385 } 386 387 /* Known data dependence. 
*/ 388 if (DDR_NUM_DIST_VECTS (ddr) == 0) 389 { 390 /* If user asserted safelen consecutive iterations can be 391 executed concurrently, assume independence. */ 392 if (loop->safelen >= 2) 393 { 394 if ((unsigned int) loop->safelen < *max_vf) 395 *max_vf = loop->safelen; 396 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false; 397 return false; 398 } 399 400 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a) 401 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b)) 402 { 403 if (dump_enabled_p ()) 404 { 405 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 406 "versioning for alias not supported for: " 407 "bad dist vector for "); 408 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, 409 DR_REF (dra)); 410 dump_printf (MSG_MISSED_OPTIMIZATION, " and "); 411 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, 412 DR_REF (drb)); 413 dump_printf (MSG_MISSED_OPTIMIZATION, "\n"); 414 } 415 return true; 416 } 417 418 if (dump_enabled_p ()) 419 { 420 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 421 "versioning for alias required: " 422 "bad dist vector for "); 423 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra)); 424 dump_printf (MSG_MISSED_OPTIMIZATION, " and "); 425 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb)); 426 dump_printf (MSG_MISSED_OPTIMIZATION, "\n"); 427 } 428 /* Add to list of ddrs that need to be tested at run-time. */ 429 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo); 430 } 431 432 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr)); 433 434 if (DDR_COULD_BE_INDEPENDENT_P (ddr) 435 && vect_analyze_possibly_independent_ddr (ddr, loop_vinfo, 436 loop_depth, max_vf)) 437 return false; 438 439 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v) 440 { 441 int dist = dist_v[loop_depth]; 442 443 if (dump_enabled_p ()) 444 dump_printf_loc (MSG_NOTE, vect_location, 445 "dependence distance = %d.\n", dist); 446 447 if (dist == 0) 448 { 449 if (dump_enabled_p ()) 450 { 451 dump_printf_loc (MSG_NOTE, vect_location, 452 "dependence distance == 0 between "); 453 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra)); 454 dump_printf (MSG_NOTE, " and "); 455 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb)); 456 dump_printf (MSG_MISSED_OPTIMIZATION, "\n"); 457 } 458 459 /* When we perform grouped accesses and perform implicit CSE 460 by detecting equal accesses and doing disambiguation with 461 runtime alias tests like for 462 .. = a[i]; 463 .. = a[i+1]; 464 a[i] = ..; 465 a[i+1] = ..; 466 *p = ..; 467 .. = a[i]; 468 .. = a[i+1]; 469 where we will end up loading { a[i], a[i+1] } once, make 470 sure that inserting group loads before the first load and 471 stores after the last store will do the right thing. 472 Similar for groups like 473 a[i] = ...; 474 ... = a[i]; 475 a[i+1] = ...; 476 where loads from the group interleave with the store. 
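   vect_preserves_scalar_order_p below checks that emitting each group at
   the position of its last scalar access keeps the two statements in
   their original relative order.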
*/ 477 if (!vect_preserves_scalar_order_p (DR_STMT (dra), DR_STMT (drb))) 478 { 479 if (dump_enabled_p ()) 480 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 481 "READ_WRITE dependence in interleaving.\n"); 482 return true; 483 } 484 485 if (loop->safelen < 2) 486 { 487 tree indicator = dr_zero_step_indicator (dra); 488 if (TREE_CODE (indicator) != INTEGER_CST) 489 vect_check_nonzero_value (loop_vinfo, indicator); 490 else if (integer_zerop (indicator)) 491 { 492 if (dump_enabled_p ()) 493 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 494 "access also has a zero step\n"); 495 return true; 496 } 497 } 498 continue; 499 } 500 501 if (dist > 0 && DDR_REVERSED_P (ddr)) 502 { 503 /* If DDR_REVERSED_P the order of the data-refs in DDR was 504 reversed (to make distance vector positive), and the actual 505 distance is negative. */ 506 if (dump_enabled_p ()) 507 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 508 "dependence distance negative.\n"); 509 /* Record a negative dependence distance to later limit the 510 amount of stmt copying / unrolling we can perform. 511 Only need to handle read-after-write dependence. */ 512 if (DR_IS_READ (drb) 513 && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0 514 || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist)) 515 STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist; 516 continue; 517 } 518 519 unsigned int abs_dist = abs (dist); 520 if (abs_dist >= 2 && abs_dist < *max_vf) 521 { 522 /* The dependence distance requires reduction of the maximal 523 vectorization factor. */ 524 *max_vf = abs (dist); 525 if (dump_enabled_p ()) 526 dump_printf_loc (MSG_NOTE, vect_location, 527 "adjusting maximal vectorization factor to %i\n", 528 *max_vf); 529 } 530 531 if (abs_dist >= *max_vf) 532 { 533 /* Dependence distance does not create dependence, as far as 534 vectorization is concerned, in this case. */ 535 if (dump_enabled_p ()) 536 dump_printf_loc (MSG_NOTE, vect_location, 537 "dependence distance >= VF.\n"); 538 continue; 539 } 540 541 if (dump_enabled_p ()) 542 { 543 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 544 "not vectorized, possible dependence " 545 "between data-refs "); 546 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra)); 547 dump_printf (MSG_NOTE, " and "); 548 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb)); 549 dump_printf (MSG_NOTE, "\n"); 550 } 551 552 return true; 553 } 554 555 return false; 556 } 557 558 /* Function vect_analyze_data_ref_dependences. 559 560 Examine all the data references in the loop, and make sure there do not 561 exist any data dependences between them. Set *MAX_VF according to 562 the maximum vectorization factor the data dependences allow. */ 563 564 bool 565 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo, 566 unsigned int *max_vf) 567 { 568 unsigned int i; 569 struct data_dependence_relation *ddr; 570 571 if (dump_enabled_p ()) 572 dump_printf_loc (MSG_NOTE, vect_location, 573 "=== vect_analyze_data_ref_dependences ===\n"); 574 575 LOOP_VINFO_DDRS (loop_vinfo) 576 .create (LOOP_VINFO_DATAREFS (loop_vinfo).length () 577 * LOOP_VINFO_DATAREFS (loop_vinfo).length ()); 578 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true; 579 /* We need read-read dependences to compute STMT_VINFO_SAME_ALIGN_REFS. */ 580 if (!compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo), 581 &LOOP_VINFO_DDRS (loop_vinfo), 582 LOOP_VINFO_LOOP_NEST (loop_vinfo), true)) 583 return false; 584 585 /* For epilogues we either have no aliases or alias versioning 586 was applied to original loop. 
Therefore we may just get max_vf 587 using VF of original loop. */ 588 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo)) 589 *max_vf = LOOP_VINFO_ORIG_MAX_VECT_FACTOR (loop_vinfo); 590 else 591 FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr) 592 if (vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf)) 593 return false; 594 595 return true; 596 } 597 598 599 /* Function vect_slp_analyze_data_ref_dependence. 600 601 Return TRUE if there (might) exist a dependence between a memory-reference 602 DRA and a memory-reference DRB. When versioning for alias may check a 603 dependence at run-time, return FALSE. Adjust *MAX_VF according to 604 the data dependence. */ 605 606 static bool 607 vect_slp_analyze_data_ref_dependence (struct data_dependence_relation *ddr) 608 { 609 struct data_reference *dra = DDR_A (ddr); 610 struct data_reference *drb = DDR_B (ddr); 611 612 /* We need to check dependences of statements marked as unvectorizable 613 as well, they still can prohibit vectorization. */ 614 615 /* Independent data accesses. */ 616 if (DDR_ARE_DEPENDENT (ddr) == chrec_known) 617 return false; 618 619 if (dra == drb) 620 return false; 621 622 /* Read-read is OK. */ 623 if (DR_IS_READ (dra) && DR_IS_READ (drb)) 624 return false; 625 626 /* If dra and drb are part of the same interleaving chain consider 627 them independent. */ 628 if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (DR_STMT (dra))) 629 && (GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dra))) 630 == GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (drb))))) 631 return false; 632 633 /* Unknown data dependence. */ 634 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know) 635 { 636 if (dump_enabled_p ()) 637 { 638 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 639 "can't determine dependence between "); 640 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra)); 641 dump_printf (MSG_MISSED_OPTIMIZATION, " and "); 642 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb)); 643 dump_printf (MSG_MISSED_OPTIMIZATION, "\n"); 644 } 645 } 646 else if (dump_enabled_p ()) 647 { 648 dump_printf_loc (MSG_NOTE, vect_location, 649 "determined dependence between "); 650 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra)); 651 dump_printf (MSG_NOTE, " and "); 652 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb)); 653 dump_printf (MSG_NOTE, "\n"); 654 } 655 656 return true; 657 } 658 659 660 /* Analyze dependences involved in the transform of SLP NODE. STORES 661 contain the vector of scalar stores of this instance if we are 662 disambiguating the loads. */ 663 664 static bool 665 vect_slp_analyze_node_dependences (slp_instance instance, slp_tree node, 666 vec<gimple *> stores, gimple *last_store) 667 { 668 /* This walks over all stmts involved in the SLP load/store done 669 in NODE verifying we can sink them up to the last stmt in the 670 group. */ 671 gimple *last_access = vect_find_last_scalar_stmt_in_slp (node); 672 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k) 673 { 674 gimple *access = SLP_TREE_SCALAR_STMTS (node)[k]; 675 if (access == last_access) 676 continue; 677 data_reference *dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (access)); 678 for (gimple_stmt_iterator gsi = gsi_for_stmt (access); 679 gsi_stmt (gsi) != last_access; gsi_next (&gsi)) 680 { 681 gimple *stmt = gsi_stmt (gsi); 682 if (! gimple_vuse (stmt) 683 || (DR_IS_READ (dr_a) && ! gimple_vdef (stmt))) 684 continue; 685 686 /* If we couldn't record a (single) data reference for this 687 stmt we have to give up. */ 688 /* ??? 
Here and below if dependence analysis fails we can resort
             to the alias oracle which can handle more kinds of stmts.  */
          data_reference *dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt));
          if (!dr_b)
            return false;

          bool dependent = false;
          /* If we run into a store of this same instance (we've just
             marked those) then delay dependence checking until we run
             into the last store because this is where it will have
             been sunk to (and we verify if we can do that as well).  */
          if (gimple_visited_p (stmt))
            {
              if (stmt != last_store)
                continue;
              unsigned i;
              gimple *store;
              FOR_EACH_VEC_ELT (stores, i, store)
                {
                  data_reference *store_dr
                    = STMT_VINFO_DATA_REF (vinfo_for_stmt (store));
                  ddr_p ddr = initialize_data_dependence_relation
                                (dr_a, store_dr, vNULL);
                  dependent = vect_slp_analyze_data_ref_dependence (ddr);
                  free_dependence_relation (ddr);
                  if (dependent)
                    break;
                }
            }
          else
            {
              ddr_p ddr = initialize_data_dependence_relation (dr_a,
                                                               dr_b, vNULL);
              dependent = vect_slp_analyze_data_ref_dependence (ddr);
              free_dependence_relation (ddr);
            }
          if (dependent)
            return false;
        }
    }
  return true;
}


/* Function vect_slp_analyze_instance_dependence.

   Examine all the data references in the SLP instance INSTANCE and make
   sure there do not exist any data dependences that would prevent sinking
   the loads and stores to the positions where the vectorized statements
   will be inserted.  */

bool
vect_slp_analyze_instance_dependence (slp_instance instance)
{
  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
                     "=== vect_slp_analyze_instance_dependence ===\n");

  /* The stores of this instance are at the root of the SLP tree.  */
  slp_tree store = SLP_INSTANCE_TREE (instance);
  if (! STMT_VINFO_DATA_REF (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (store)[0])))
    store = NULL;

  /* Verify we can sink stores to the vectorized stmt insert location.  */
  gimple *last_store = NULL;
  if (store)
    {
      if (! vect_slp_analyze_node_dependences (instance, store, vNULL, NULL))
        return false;

      /* Mark stores in this instance and remember the last one.  */
      last_store = vect_find_last_scalar_stmt_in_slp (store);
      for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
        gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k], true);
    }

  bool res = true;

  /* Verify we can sink loads to the vectorized stmt insert location,
     special-casing stores of this instance.  */
  slp_tree load;
  unsigned int i;
  FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load)
    if (! vect_slp_analyze_node_dependences (instance, load,
                                             store
                                             ? SLP_TREE_SCALAR_STMTS (store)
                                             : vNULL, last_store))
      {
        res = false;
        break;
      }

  /* Unset the visited flag.  */
  if (store)
    for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
      gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k], false);

  return res;
}

/* Record in VINFO the base alignment guarantee given by DRB.  STMT is
   the statement that contains DRB, which is useful for recording in the
   dump file.
*/ 790 791 static void 792 vect_record_base_alignment (vec_info *vinfo, gimple *stmt, 793 innermost_loop_behavior *drb) 794 { 795 bool existed; 796 innermost_loop_behavior *&entry 797 = vinfo->base_alignments.get_or_insert (drb->base_address, &existed); 798 if (!existed || entry->base_alignment < drb->base_alignment) 799 { 800 entry = drb; 801 if (dump_enabled_p ()) 802 { 803 dump_printf_loc (MSG_NOTE, vect_location, 804 "recording new base alignment for "); 805 dump_generic_expr (MSG_NOTE, TDF_SLIM, drb->base_address); 806 dump_printf (MSG_NOTE, "\n"); 807 dump_printf_loc (MSG_NOTE, vect_location, 808 " alignment: %d\n", drb->base_alignment); 809 dump_printf_loc (MSG_NOTE, vect_location, 810 " misalignment: %d\n", drb->base_misalignment); 811 dump_printf_loc (MSG_NOTE, vect_location, 812 " based on: "); 813 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0); 814 } 815 } 816 } 817 818 /* If the region we're going to vectorize is reached, all unconditional 819 data references occur at least once. We can therefore pool the base 820 alignment guarantees from each unconditional reference. Do this by 821 going through all the data references in VINFO and checking whether 822 the containing statement makes the reference unconditionally. If so, 823 record the alignment of the base address in VINFO so that it can be 824 used for all other references with the same base. */ 825 826 void 827 vect_record_base_alignments (vec_info *vinfo) 828 { 829 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo); 830 struct loop *loop = loop_vinfo ? LOOP_VINFO_LOOP (loop_vinfo) : NULL; 831 data_reference *dr; 832 unsigned int i; 833 FOR_EACH_VEC_ELT (vinfo->datarefs, i, dr) 834 if (!DR_IS_CONDITIONAL_IN_STMT (dr)) 835 { 836 gimple *stmt = DR_STMT (dr); 837 vect_record_base_alignment (vinfo, stmt, &DR_INNERMOST (dr)); 838 839 /* If DR is nested in the loop that is being vectorized, we can also 840 record the alignment of the base wrt the outer loop. */ 841 if (loop && nested_in_vect_loop_p (loop, stmt)) 842 { 843 stmt_vec_info stmt_info = vinfo_for_stmt (stmt); 844 vect_record_base_alignment 845 (vinfo, stmt, &STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info)); 846 } 847 } 848 } 849 850 /* Return the target alignment for the vectorized form of DR. */ 851 852 static unsigned int 853 vect_calculate_target_alignment (struct data_reference *dr) 854 { 855 gimple *stmt = DR_STMT (dr); 856 stmt_vec_info stmt_info = vinfo_for_stmt (stmt); 857 tree vectype = STMT_VINFO_VECTYPE (stmt_info); 858 return targetm.vectorize.preferred_vector_alignment (vectype); 859 } 860 861 /* Function vect_compute_data_ref_alignment 862 863 Compute the misalignment of the data reference DR. 864 865 Output: 866 1. If during the misalignment computation it is found that the data reference 867 cannot be vectorized then false is returned. 868 2. DR_MISALIGNMENT (DR) is defined. 869 870 FOR NOW: No analysis is actually performed. Misalignment is calculated 871 only for trivial cases. TODO. 
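   The misalignment is recorded in bytes relative to the target alignment
   chosen for the vectorized access: e.g. with a 16-byte target alignment,
   an access that is 4 bytes past a 16-byte boundary gets DR_MISALIGNMENT 4,
   while DR_MISALIGNMENT_UNKNOWN is used when no compile-time value can be
   determined.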
*/ 872 873 bool 874 vect_compute_data_ref_alignment (struct data_reference *dr) 875 { 876 gimple *stmt = DR_STMT (dr); 877 stmt_vec_info stmt_info = vinfo_for_stmt (stmt); 878 vec_base_alignments *base_alignments = &stmt_info->vinfo->base_alignments; 879 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); 880 struct loop *loop = NULL; 881 tree ref = DR_REF (dr); 882 tree vectype = STMT_VINFO_VECTYPE (stmt_info); 883 884 if (dump_enabled_p ()) 885 dump_printf_loc (MSG_NOTE, vect_location, 886 "vect_compute_data_ref_alignment:\n"); 887 888 if (loop_vinfo) 889 loop = LOOP_VINFO_LOOP (loop_vinfo); 890 891 /* Initialize misalignment to unknown. */ 892 SET_DR_MISALIGNMENT (dr, DR_MISALIGNMENT_UNKNOWN); 893 894 innermost_loop_behavior *drb = vect_dr_behavior (dr); 895 bool step_preserves_misalignment_p; 896 897 unsigned HOST_WIDE_INT vector_alignment 898 = vect_calculate_target_alignment (dr) / BITS_PER_UNIT; 899 DR_TARGET_ALIGNMENT (dr) = vector_alignment; 900 901 /* No step for BB vectorization. */ 902 if (!loop) 903 { 904 gcc_assert (integer_zerop (drb->step)); 905 step_preserves_misalignment_p = true; 906 } 907 908 /* In case the dataref is in an inner-loop of the loop that is being 909 vectorized (LOOP), we use the base and misalignment information 910 relative to the outer-loop (LOOP). This is ok only if the misalignment 911 stays the same throughout the execution of the inner-loop, which is why 912 we have to check that the stride of the dataref in the inner-loop evenly 913 divides by the vector alignment. */ 914 else if (nested_in_vect_loop_p (loop, stmt)) 915 { 916 step_preserves_misalignment_p 917 = (DR_STEP_ALIGNMENT (dr) % vector_alignment) == 0; 918 919 if (dump_enabled_p ()) 920 { 921 if (step_preserves_misalignment_p) 922 dump_printf_loc (MSG_NOTE, vect_location, 923 "inner step divides the vector alignment.\n"); 924 else 925 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 926 "inner step doesn't divide the vector" 927 " alignment.\n"); 928 } 929 } 930 931 /* Similarly we can only use base and misalignment information relative to 932 an innermost loop if the misalignment stays the same throughout the 933 execution of the loop. As above, this is the case if the stride of 934 the dataref evenly divides by the alignment. */ 935 else 936 { 937 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo); 938 step_preserves_misalignment_p 939 = multiple_p (DR_STEP_ALIGNMENT (dr) * vf, vector_alignment); 940 941 if (!step_preserves_misalignment_p && dump_enabled_p ()) 942 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 943 "step doesn't divide the vector alignment.\n"); 944 } 945 946 unsigned int base_alignment = drb->base_alignment; 947 unsigned int base_misalignment = drb->base_misalignment; 948 949 /* Calculate the maximum of the pooled base address alignment and the 950 alignment that we can compute for DR itself. */ 951 innermost_loop_behavior **entry = base_alignments->get (drb->base_address); 952 if (entry && base_alignment < (*entry)->base_alignment) 953 { 954 base_alignment = (*entry)->base_alignment; 955 base_misalignment = (*entry)->base_misalignment; 956 } 957 958 if (drb->offset_alignment < vector_alignment 959 || !step_preserves_misalignment_p 960 /* We need to know whether the step wrt the vectorized loop is 961 negative when computing the starting misalignment below. 
  */
      || TREE_CODE (drb->step) != INTEGER_CST)
    {
      if (dump_enabled_p ())
        {
          dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                           "Unknown alignment for access: ");
          dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
          dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
        }
      return true;
    }

  if (base_alignment < vector_alignment)
    {
      unsigned int max_alignment;
      tree base = get_base_for_alignment (drb->base_address, &max_alignment);
      if (max_alignment < vector_alignment
          || !vect_can_force_dr_alignment_p (base,
                                             vector_alignment * BITS_PER_UNIT))
        {
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_NOTE, vect_location,
                               "can't force alignment of ref: ");
              dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
              dump_printf (MSG_NOTE, "\n");
            }
          return true;
        }

      /* Force the alignment of the decl.
         NOTE: This is the only change to the code we make during
         the analysis phase, before deciding to vectorize the loop.  */
      if (dump_enabled_p ())
        {
          dump_printf_loc (MSG_NOTE, vect_location, "force alignment of ");
          dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
          dump_printf (MSG_NOTE, "\n");
        }

      DR_VECT_AUX (dr)->base_decl = base;
      DR_VECT_AUX (dr)->base_misaligned = true;
      base_misalignment = 0;
    }
  poly_int64 misalignment
    = base_misalignment + wi::to_poly_offset (drb->init).force_shwi ();

  /* If this is a backward running DR then the first access in the larger
     vectype actually is N-1 elements before the address in the DR.
     Adjust misalign accordingly.  */
  if (tree_int_cst_sgn (drb->step) < 0)
    /* PLUS because STEP is negative.  */
    misalignment += ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
                     * TREE_INT_CST_LOW (drb->step));

  unsigned int const_misalignment;
  if (!known_misalignment (misalignment, vector_alignment,
                           &const_misalignment))
    {
      if (dump_enabled_p ())
        {
          dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                           "Non-constant misalignment for access: ");
          dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
          dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
        }
      return true;
    }

  SET_DR_MISALIGNMENT (dr, const_misalignment);

  if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                       "misalign = %d bytes of ref ", DR_MISALIGNMENT (dr));
      dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
      dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
    }

  return true;
}

/* Function vect_update_misalignment_for_peel.
   Sets DR's misalignment
   - to 0 if it has the same alignment as DR_PEEL,
   - to the misalignment computed using NPEEL if DR's alignment is known,
   - to -1 (unknown) otherwise.

   DR - the data reference whose misalignment is to be adjusted.
   DR_PEEL - the data reference whose misalignment is being made
             zero in the vector loop by the peel.
   NPEEL - the number of iterations in the peel loop if the misalignment
           of DR_PEEL is known at compile time.
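   For example, with a positive step, a 16-byte target alignment, a 4-byte
   element size and DR_MISALIGNMENT (DR) == 8, peeling NPEEL == 2 iterations
   gives the new misalignment (8 + 2 * 4) & 15 == 0.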
*/ 1055 1056 static void 1057 vect_update_misalignment_for_peel (struct data_reference *dr, 1058 struct data_reference *dr_peel, int npeel) 1059 { 1060 unsigned int i; 1061 vec<dr_p> same_aligned_drs; 1062 struct data_reference *current_dr; 1063 int dr_size = vect_get_scalar_dr_size (dr); 1064 int dr_peel_size = vect_get_scalar_dr_size (dr_peel); 1065 stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr)); 1066 stmt_vec_info peel_stmt_info = vinfo_for_stmt (DR_STMT (dr_peel)); 1067 1068 /* For interleaved data accesses the step in the loop must be multiplied by 1069 the size of the interleaving group. */ 1070 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)) 1071 dr_size *= GROUP_SIZE (vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info))); 1072 if (STMT_VINFO_GROUPED_ACCESS (peel_stmt_info)) 1073 dr_peel_size *= GROUP_SIZE (peel_stmt_info); 1074 1075 /* It can be assumed that the data refs with the same alignment as dr_peel 1076 are aligned in the vector loop. */ 1077 same_aligned_drs 1078 = STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (DR_STMT (dr_peel))); 1079 FOR_EACH_VEC_ELT (same_aligned_drs, i, current_dr) 1080 { 1081 if (current_dr != dr) 1082 continue; 1083 gcc_assert (!known_alignment_for_access_p (dr) 1084 || !known_alignment_for_access_p (dr_peel) 1085 || (DR_MISALIGNMENT (dr) / dr_size 1086 == DR_MISALIGNMENT (dr_peel) / dr_peel_size)); 1087 SET_DR_MISALIGNMENT (dr, 0); 1088 return; 1089 } 1090 1091 if (known_alignment_for_access_p (dr) 1092 && known_alignment_for_access_p (dr_peel)) 1093 { 1094 bool negative = tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0; 1095 int misal = DR_MISALIGNMENT (dr); 1096 misal += negative ? -npeel * dr_size : npeel * dr_size; 1097 misal &= DR_TARGET_ALIGNMENT (dr) - 1; 1098 SET_DR_MISALIGNMENT (dr, misal); 1099 return; 1100 } 1101 1102 if (dump_enabled_p ()) 1103 dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment " \ 1104 "to unknown (-1).\n"); 1105 SET_DR_MISALIGNMENT (dr, DR_MISALIGNMENT_UNKNOWN); 1106 } 1107 1108 1109 /* Function verify_data_ref_alignment 1110 1111 Return TRUE if DR can be handled with respect to alignment. */ 1112 1113 static bool 1114 verify_data_ref_alignment (data_reference_p dr) 1115 { 1116 enum dr_alignment_support supportable_dr_alignment 1117 = vect_supportable_dr_alignment (dr, false); 1118 if (!supportable_dr_alignment) 1119 { 1120 if (dump_enabled_p ()) 1121 { 1122 if (DR_IS_READ (dr)) 1123 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 1124 "not vectorized: unsupported unaligned load."); 1125 else 1126 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 1127 "not vectorized: unsupported unaligned " 1128 "store."); 1129 1130 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, 1131 DR_REF (dr)); 1132 dump_printf (MSG_MISSED_OPTIMIZATION, "\n"); 1133 } 1134 return false; 1135 } 1136 1137 if (supportable_dr_alignment != dr_aligned && dump_enabled_p ()) 1138 dump_printf_loc (MSG_NOTE, vect_location, 1139 "Vectorizing an unaligned access.\n"); 1140 1141 return true; 1142 } 1143 1144 /* Function vect_verify_datarefs_alignment 1145 1146 Return TRUE if all data references in the loop can be 1147 handled with respect to alignment. 
*/

bool
vect_verify_datarefs_alignment (loop_vec_info vinfo)
{
  vec<data_reference_p> datarefs = vinfo->datarefs;
  struct data_reference *dr;
  unsigned int i;

  FOR_EACH_VEC_ELT (datarefs, i, dr)
    {
      gimple *stmt = DR_STMT (dr);
      stmt_vec_info stmt_info = vinfo_for_stmt (stmt);

      if (!STMT_VINFO_RELEVANT_P (stmt_info))
        continue;

      /* For interleaving, only the alignment of the first access matters.  */
      if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
          && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
        continue;

      /* Strided accesses perform only component accesses, alignment is
         irrelevant for them.  */
      if (STMT_VINFO_STRIDED_P (stmt_info)
          && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
        continue;

      if (! verify_data_ref_alignment (dr))
        return false;
    }

  return true;
}

/* Given a memory reference EXP return whether its alignment is less
   than its size.  */

static bool
not_size_aligned (tree exp)
{
  if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp))))
    return true;

  return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp)))
          > get_object_alignment (exp));
}

/* Function vector_alignment_reachable_p

   Return true if vector alignment for DR is reachable by peeling
   a few loop iterations.  Return false otherwise.  */

static bool
vector_alignment_reachable_p (struct data_reference *dr)
{
  gimple *stmt = DR_STMT (dr);
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  tree vectype = STMT_VINFO_VECTYPE (stmt_info);

  if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
    {
      /* For interleaved access we peel only if the number of iterations in
         the prolog loop ({VF - misalignment}) is a multiple of the
         number of the interleaved accesses.  */
      int elem_size, mis_in_elements;

      /* FORNOW: handle only known alignment.  */
      if (!known_alignment_for_access_p (dr))
        return false;

      poly_uint64 nelements = TYPE_VECTOR_SUBPARTS (vectype);
      poly_uint64 vector_size = GET_MODE_SIZE (TYPE_MODE (vectype));
      elem_size = vector_element_size (vector_size, nelements);
      mis_in_elements = DR_MISALIGNMENT (dr) / elem_size;

      if (!multiple_p (nelements - mis_in_elements, GROUP_SIZE (stmt_info)))
        return false;
    }

  /* If misalignment is known at compile time then allow peeling
     only if natural alignment is reachable through peeling.  */
  if (known_alignment_for_access_p (dr) && !aligned_access_p (dr))
    {
      HOST_WIDE_INT elmsize =
                int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
      if (dump_enabled_p ())
        {
          dump_printf_loc (MSG_NOTE, vect_location,
                           "data size =" HOST_WIDE_INT_PRINT_DEC, elmsize);
          dump_printf (MSG_NOTE,
                       ". misalignment = %d.\n", DR_MISALIGNMENT (dr));
        }
      if (DR_MISALIGNMENT (dr) % elmsize)
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "data size does not divide the misalignment.\n");
          return false;
        }
    }

  if (!known_alignment_for_access_p (dr))
    {
      tree type = TREE_TYPE (DR_REF (dr));
      bool is_packed = not_size_aligned (DR_REF (dr));
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "Unknown misalignment, %snaturally aligned\n",
                         is_packed ?
"not " : ""); 1257 return targetm.vectorize.vector_alignment_reachable (type, is_packed); 1258 } 1259 1260 return true; 1261 } 1262 1263 1264 /* Calculate the cost of the memory access represented by DR. */ 1265 1266 static void 1267 vect_get_data_access_cost (struct data_reference *dr, 1268 unsigned int *inside_cost, 1269 unsigned int *outside_cost, 1270 stmt_vector_for_cost *body_cost_vec) 1271 { 1272 gimple *stmt = DR_STMT (dr); 1273 stmt_vec_info stmt_info = vinfo_for_stmt (stmt); 1274 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); 1275 int ncopies; 1276 1277 if (PURE_SLP_STMT (stmt_info)) 1278 ncopies = 1; 1279 else 1280 ncopies = vect_get_num_copies (loop_vinfo, STMT_VINFO_VECTYPE (stmt_info)); 1281 1282 if (DR_IS_READ (dr)) 1283 vect_get_load_cost (dr, ncopies, true, inside_cost, outside_cost, 1284 NULL, body_cost_vec, false); 1285 else 1286 vect_get_store_cost (dr, ncopies, inside_cost, body_cost_vec); 1287 1288 if (dump_enabled_p ()) 1289 dump_printf_loc (MSG_NOTE, vect_location, 1290 "vect_get_data_access_cost: inside_cost = %d, " 1291 "outside_cost = %d.\n", *inside_cost, *outside_cost); 1292 } 1293 1294 1295 typedef struct _vect_peel_info 1296 { 1297 struct data_reference *dr; 1298 int npeel; 1299 unsigned int count; 1300 } *vect_peel_info; 1301 1302 typedef struct _vect_peel_extended_info 1303 { 1304 struct _vect_peel_info peel_info; 1305 unsigned int inside_cost; 1306 unsigned int outside_cost; 1307 } *vect_peel_extended_info; 1308 1309 1310 /* Peeling hashtable helpers. */ 1311 1312 struct peel_info_hasher : free_ptr_hash <_vect_peel_info> 1313 { 1314 static inline hashval_t hash (const _vect_peel_info *); 1315 static inline bool equal (const _vect_peel_info *, const _vect_peel_info *); 1316 }; 1317 1318 inline hashval_t 1319 peel_info_hasher::hash (const _vect_peel_info *peel_info) 1320 { 1321 return (hashval_t) peel_info->npeel; 1322 } 1323 1324 inline bool 1325 peel_info_hasher::equal (const _vect_peel_info *a, const _vect_peel_info *b) 1326 { 1327 return (a->npeel == b->npeel); 1328 } 1329 1330 1331 /* Insert DR into peeling hash table with NPEEL as key. */ 1332 1333 static void 1334 vect_peeling_hash_insert (hash_table<peel_info_hasher> *peeling_htab, 1335 loop_vec_info loop_vinfo, struct data_reference *dr, 1336 int npeel) 1337 { 1338 struct _vect_peel_info elem, *slot; 1339 _vect_peel_info **new_slot; 1340 bool supportable_dr_alignment = vect_supportable_dr_alignment (dr, true); 1341 1342 elem.npeel = npeel; 1343 slot = peeling_htab->find (&elem); 1344 if (slot) 1345 slot->count++; 1346 else 1347 { 1348 slot = XNEW (struct _vect_peel_info); 1349 slot->npeel = npeel; 1350 slot->dr = dr; 1351 slot->count = 1; 1352 new_slot = peeling_htab->find_slot (slot, INSERT); 1353 *new_slot = slot; 1354 } 1355 1356 if (!supportable_dr_alignment 1357 && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo))) 1358 slot->count += VECT_MAX_COST; 1359 } 1360 1361 1362 /* Traverse peeling hash table to find peeling option that aligns maximum 1363 number of data accesses. 
*/ 1364 1365 int 1366 vect_peeling_hash_get_most_frequent (_vect_peel_info **slot, 1367 _vect_peel_extended_info *max) 1368 { 1369 vect_peel_info elem = *slot; 1370 1371 if (elem->count > max->peel_info.count 1372 || (elem->count == max->peel_info.count 1373 && max->peel_info.npeel > elem->npeel)) 1374 { 1375 max->peel_info.npeel = elem->npeel; 1376 max->peel_info.count = elem->count; 1377 max->peel_info.dr = elem->dr; 1378 } 1379 1380 return 1; 1381 } 1382 1383 /* Get the costs of peeling NPEEL iterations checking data access costs 1384 for all data refs. If UNKNOWN_MISALIGNMENT is true, we assume DR0's 1385 misalignment will be zero after peeling. */ 1386 1387 static void 1388 vect_get_peeling_costs_all_drs (vec<data_reference_p> datarefs, 1389 struct data_reference *dr0, 1390 unsigned int *inside_cost, 1391 unsigned int *outside_cost, 1392 stmt_vector_for_cost *body_cost_vec, 1393 unsigned int npeel, 1394 bool unknown_misalignment) 1395 { 1396 unsigned i; 1397 data_reference *dr; 1398 1399 FOR_EACH_VEC_ELT (datarefs, i, dr) 1400 { 1401 gimple *stmt = DR_STMT (dr); 1402 stmt_vec_info stmt_info = vinfo_for_stmt (stmt); 1403 if (!STMT_VINFO_RELEVANT_P (stmt_info)) 1404 continue; 1405 1406 /* For interleaving, only the alignment of the first access 1407 matters. */ 1408 if (STMT_VINFO_GROUPED_ACCESS (stmt_info) 1409 && GROUP_FIRST_ELEMENT (stmt_info) != stmt) 1410 continue; 1411 1412 /* Strided accesses perform only component accesses, alignment is 1413 irrelevant for them. */ 1414 if (STMT_VINFO_STRIDED_P (stmt_info) 1415 && !STMT_VINFO_GROUPED_ACCESS (stmt_info)) 1416 continue; 1417 1418 int save_misalignment; 1419 save_misalignment = DR_MISALIGNMENT (dr); 1420 if (npeel == 0) 1421 ; 1422 else if (unknown_misalignment && dr == dr0) 1423 SET_DR_MISALIGNMENT (dr, 0); 1424 else 1425 vect_update_misalignment_for_peel (dr, dr0, npeel); 1426 vect_get_data_access_cost (dr, inside_cost, outside_cost, 1427 body_cost_vec); 1428 SET_DR_MISALIGNMENT (dr, save_misalignment); 1429 } 1430 } 1431 1432 /* Traverse peeling hash table and calculate cost for each peeling option. 1433 Find the one with the lowest cost. */ 1434 1435 int 1436 vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot, 1437 _vect_peel_extended_info *min) 1438 { 1439 vect_peel_info elem = *slot; 1440 int dummy; 1441 unsigned int inside_cost = 0, outside_cost = 0; 1442 gimple *stmt = DR_STMT (elem->dr); 1443 stmt_vec_info stmt_info = vinfo_for_stmt (stmt); 1444 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); 1445 stmt_vector_for_cost prologue_cost_vec, body_cost_vec, 1446 epilogue_cost_vec; 1447 1448 prologue_cost_vec.create (2); 1449 body_cost_vec.create (2); 1450 epilogue_cost_vec.create (2); 1451 1452 vect_get_peeling_costs_all_drs (LOOP_VINFO_DATAREFS (loop_vinfo), 1453 elem->dr, &inside_cost, &outside_cost, 1454 &body_cost_vec, elem->npeel, false); 1455 1456 body_cost_vec.release (); 1457 1458 outside_cost += vect_get_known_peeling_cost 1459 (loop_vinfo, elem->npeel, &dummy, 1460 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), 1461 &prologue_cost_vec, &epilogue_cost_vec); 1462 1463 /* Prologue and epilogue costs are added to the target model later. 1464 These costs depend only on the scalar iteration cost, the 1465 number of peeling iterations finally chosen, and the number of 1466 misaligned statements. So discard the information found here. 
*/ 1467 prologue_cost_vec.release (); 1468 epilogue_cost_vec.release (); 1469 1470 if (inside_cost < min->inside_cost 1471 || (inside_cost == min->inside_cost 1472 && outside_cost < min->outside_cost)) 1473 { 1474 min->inside_cost = inside_cost; 1475 min->outside_cost = outside_cost; 1476 min->peel_info.dr = elem->dr; 1477 min->peel_info.npeel = elem->npeel; 1478 min->peel_info.count = elem->count; 1479 } 1480 1481 return 1; 1482 } 1483 1484 1485 /* Choose best peeling option by traversing peeling hash table and either 1486 choosing an option with the lowest cost (if cost model is enabled) or the 1487 option that aligns as many accesses as possible. */ 1488 1489 static struct _vect_peel_extended_info 1490 vect_peeling_hash_choose_best_peeling (hash_table<peel_info_hasher> *peeling_htab, 1491 loop_vec_info loop_vinfo) 1492 { 1493 struct _vect_peel_extended_info res; 1494 1495 res.peel_info.dr = NULL; 1496 1497 if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo))) 1498 { 1499 res.inside_cost = INT_MAX; 1500 res.outside_cost = INT_MAX; 1501 peeling_htab->traverse <_vect_peel_extended_info *, 1502 vect_peeling_hash_get_lowest_cost> (&res); 1503 } 1504 else 1505 { 1506 res.peel_info.count = 0; 1507 peeling_htab->traverse <_vect_peel_extended_info *, 1508 vect_peeling_hash_get_most_frequent> (&res); 1509 res.inside_cost = 0; 1510 res.outside_cost = 0; 1511 } 1512 1513 return res; 1514 } 1515 1516 /* Return true if the new peeling NPEEL is supported. */ 1517 1518 static bool 1519 vect_peeling_supportable (loop_vec_info loop_vinfo, struct data_reference *dr0, 1520 unsigned npeel) 1521 { 1522 unsigned i; 1523 struct data_reference *dr = NULL; 1524 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo); 1525 gimple *stmt; 1526 stmt_vec_info stmt_info; 1527 enum dr_alignment_support supportable_dr_alignment; 1528 1529 /* Ensure that all data refs can be vectorized after the peel. */ 1530 FOR_EACH_VEC_ELT (datarefs, i, dr) 1531 { 1532 int save_misalignment; 1533 1534 if (dr == dr0) 1535 continue; 1536 1537 stmt = DR_STMT (dr); 1538 stmt_info = vinfo_for_stmt (stmt); 1539 /* For interleaving, only the alignment of the first access 1540 matters. */ 1541 if (STMT_VINFO_GROUPED_ACCESS (stmt_info) 1542 && GROUP_FIRST_ELEMENT (stmt_info) != stmt) 1543 continue; 1544 1545 /* Strided accesses perform only component accesses, alignment is 1546 irrelevant for them. */ 1547 if (STMT_VINFO_STRIDED_P (stmt_info) 1548 && !STMT_VINFO_GROUPED_ACCESS (stmt_info)) 1549 continue; 1550 1551 save_misalignment = DR_MISALIGNMENT (dr); 1552 vect_update_misalignment_for_peel (dr, dr0, npeel); 1553 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false); 1554 SET_DR_MISALIGNMENT (dr, save_misalignment); 1555 1556 if (!supportable_dr_alignment) 1557 return false; 1558 } 1559 1560 return true; 1561 } 1562 1563 /* Function vect_enhance_data_refs_alignment 1564 1565 This pass will use loop versioning and loop peeling in order to enhance 1566 the alignment of data references in the loop. 1567 1568 FOR NOW: we assume that whatever versioning/peeling takes place, only the 1569 original loop is to be vectorized. Any other loops that are created by 1570 the transformations performed in this pass - are not supposed to be 1571 vectorized. This restriction will be relaxed. 1572 1573 This pass will require a cost model to guide it whether to apply peeling 1574 or versioning or a combination of the two. 
For example, the scheme that
   Intel uses when given a loop with several memory accesses is as follows:
   choose one memory access ('p') whose alignment you want to force by
   peeling.  Then, either (1) generate a loop in which 'p' is aligned and all
   other accesses are not necessarily aligned, or (2) use loop versioning to
   generate one loop in which all accesses are aligned, and another loop in
   which only 'p' is necessarily aligned.

   ("Automatic Intra-Register Vectorization for the Intel Architecture",
   Aart J.C. Bik, Milind Girkar, Paul M. Grey and Xinmin Tian, International
   Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)

   Devising a cost model is the most critical aspect of this work.  It will
   guide us on which access to peel for, whether to use loop versioning, how
   many versions to create, etc.  The cost model will probably consist of
   generic considerations as well as target specific considerations (on
   powerpc for example, misaligned stores are more painful than misaligned
   loads).

   Here are the general steps involved in alignment enhancements:

   -- original loop, before alignment analysis:
        for (i=0; i<N; i++){
          x = q[i];                     # DR_MISALIGNMENT(q) = unknown
          p[i] = y;                     # DR_MISALIGNMENT(p) = unknown
        }

   -- After vect_compute_data_refs_alignment:
        for (i=0; i<N; i++){
          x = q[i];                     # DR_MISALIGNMENT(q) = 3
          p[i] = y;                     # DR_MISALIGNMENT(p) = unknown
        }

   -- Possibility 1: we do loop versioning:
   if (p is aligned) {
        for (i=0; i<N; i++){            # loop 1A
          x = q[i];                     # DR_MISALIGNMENT(q) = 3
          p[i] = y;                     # DR_MISALIGNMENT(p) = 0
        }
   }
   else {
        for (i=0; i<N; i++){            # loop 1B
          x = q[i];                     # DR_MISALIGNMENT(q) = 3
          p[i] = y;                     # DR_MISALIGNMENT(p) = unaligned
        }
   }

   -- Possibility 2: we do loop peeling:
   for (i = 0; i < 3; i++){             # (scalar loop, not to be vectorized).
        x = q[i];
        p[i] = y;
   }
   for (i = 3; i < N; i++){             # loop 2A
        x = q[i];                       # DR_MISALIGNMENT(q) = 0
        p[i] = y;                       # DR_MISALIGNMENT(p) = unknown
   }

   -- Possibility 3: combination of loop peeling and versioning:
   for (i = 0; i < 3; i++){             # (scalar loop, not to be vectorized).
        x = q[i];
        p[i] = y;
   }
   if (p is aligned) {
        for (i = 3; i<N; i++){          # loop 3A
          x = q[i];                     # DR_MISALIGNMENT(q) = 0
          p[i] = y;                     # DR_MISALIGNMENT(p) = 0
        }
   }
   else {
        for (i = 3; i<N; i++){          # loop 3B
          x = q[i];                     # DR_MISALIGNMENT(q) = 0
          p[i] = y;                     # DR_MISALIGNMENT(p) = unaligned
        }
   }

   These loops are later passed to loop_transform to be vectorized.  The
   vectorizer will use the alignment information to guide the transformation
   (whether to generate regular loads/stores, or with special handling for
   misalignment).
*/ 1653 1654 bool 1655 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo) 1656 { 1657 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo); 1658 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); 1659 enum dr_alignment_support supportable_dr_alignment; 1660 struct data_reference *dr0 = NULL, *first_store = NULL; 1661 struct data_reference *dr; 1662 unsigned int i, j; 1663 bool do_peeling = false; 1664 bool do_versioning = false; 1665 bool stat; 1666 gimple *stmt; 1667 stmt_vec_info stmt_info; 1668 unsigned int npeel = 0; 1669 bool one_misalignment_known = false; 1670 bool one_misalignment_unknown = false; 1671 bool one_dr_unsupportable = false; 1672 struct data_reference *unsupportable_dr = NULL; 1673 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo); 1674 unsigned possible_npeel_number = 1; 1675 tree vectype; 1676 unsigned int mis, same_align_drs_max = 0; 1677 hash_table<peel_info_hasher> peeling_htab (1); 1678 1679 if (dump_enabled_p ()) 1680 dump_printf_loc (MSG_NOTE, vect_location, 1681 "=== vect_enhance_data_refs_alignment ===\n"); 1682 1683 /* Reset data so we can safely be called multiple times. */ 1684 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0); 1685 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = 0; 1686 1687 /* While cost model enhancements are expected in the future, the high level 1688 view of the code at this time is as follows: 1689 1690 A) If there is a misaligned access then see if peeling to align 1691 this access can make all data references satisfy 1692 vect_supportable_dr_alignment. If so, update data structures 1693 as needed and return true. 1694 1695 B) If peeling wasn't possible and there is a data reference with an 1696 unknown misalignment that does not satisfy vect_supportable_dr_alignment 1697 then see if loop versioning checks can be used to make all data 1698 references satisfy vect_supportable_dr_alignment. If so, update 1699 data structures as needed and return true. 1700 1701 C) If neither peeling nor versioning were successful then return false if 1702 any data reference does not satisfy vect_supportable_dr_alignment. 1703 1704 D) Return true (all data references satisfy vect_supportable_dr_alignment). 1705 1706 Note, Possibility 3 above (which is peeling and versioning together) is not 1707 being done at this time. */ 1708 1709 /* (1) Peeling to force alignment. */ 1710 1711 /* (1.1) Decide whether to perform peeling, and how many iterations to peel: 1712 Considerations: 1713 + How many accesses will become aligned due to the peeling 1714 - How many accesses will become unaligned due to the peeling, 1715 and the cost of misaligned accesses. 1716 - The cost of peeling (the extra runtime checks, the increase 1717 in code size). */ 1718 1719 FOR_EACH_VEC_ELT (datarefs, i, dr) 1720 { 1721 stmt = DR_STMT (dr); 1722 stmt_info = vinfo_for_stmt (stmt); 1723 1724 if (!STMT_VINFO_RELEVANT_P (stmt_info)) 1725 continue; 1726 1727 /* For interleaving, only the alignment of the first access 1728 matters. */ 1729 if (STMT_VINFO_GROUPED_ACCESS (stmt_info) 1730 && GROUP_FIRST_ELEMENT (stmt_info) != stmt) 1731 continue; 1732 1733 /* For invariant accesses there is nothing to enhance. */ 1734 if (integer_zerop (DR_STEP (dr))) 1735 continue; 1736 1737 /* Strided accesses perform only component accesses, alignment is 1738 irrelevant for them. 
*/ 1739 if (STMT_VINFO_STRIDED_P (stmt_info) 1740 && !STMT_VINFO_GROUPED_ACCESS (stmt_info)) 1741 continue; 1742 1743 supportable_dr_alignment = vect_supportable_dr_alignment (dr, true); 1744 do_peeling = vector_alignment_reachable_p (dr); 1745 if (do_peeling) 1746 { 1747 if (known_alignment_for_access_p (dr)) 1748 { 1749 unsigned int npeel_tmp = 0; 1750 bool negative = tree_int_cst_compare (DR_STEP (dr), 1751 size_zero_node) < 0; 1752 1753 vectype = STMT_VINFO_VECTYPE (stmt_info); 1754 unsigned int target_align = DR_TARGET_ALIGNMENT (dr); 1755 unsigned int dr_size = vect_get_scalar_dr_size (dr); 1756 mis = (negative ? DR_MISALIGNMENT (dr) : -DR_MISALIGNMENT (dr)); 1757 if (DR_MISALIGNMENT (dr) != 0) 1758 npeel_tmp = (mis & (target_align - 1)) / dr_size; 1759 1760 /* For multiple types, it is possible that the bigger type access 1761 will have more than one peeling option. E.g., a loop with two 1762 types: one of size (vector size / 4), and the other one of 1763 size (vector size / 8). Vectorization factor will 8. If both 1764 accesses are misaligned by 3, the first one needs one scalar 1765 iteration to be aligned, and the second one needs 5. But the 1766 first one will be aligned also by peeling 5 scalar 1767 iterations, and in that case both accesses will be aligned. 1768 Hence, except for the immediate peeling amount, we also want 1769 to try to add full vector size, while we don't exceed 1770 vectorization factor. 1771 We do this automatically for cost model, since we calculate 1772 cost for every peeling option. */ 1773 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo))) 1774 { 1775 poly_uint64 nscalars = (STMT_SLP_TYPE (stmt_info) 1776 ? vf * GROUP_SIZE (stmt_info) : vf); 1777 possible_npeel_number 1778 = vect_get_num_vectors (nscalars, vectype); 1779 1780 /* NPEEL_TMP is 0 when there is no misalignment, but also 1781 allow peeling NELEMENTS. */ 1782 if (DR_MISALIGNMENT (dr) == 0) 1783 possible_npeel_number++; 1784 } 1785 1786 /* Save info about DR in the hash table. Also include peeling 1787 amounts according to the explanation above. */ 1788 for (j = 0; j < possible_npeel_number; j++) 1789 { 1790 vect_peeling_hash_insert (&peeling_htab, loop_vinfo, 1791 dr, npeel_tmp); 1792 npeel_tmp += target_align / dr_size; 1793 } 1794 1795 one_misalignment_known = true; 1796 } 1797 else 1798 { 1799 /* If we don't know any misalignment values, we prefer 1800 peeling for data-ref that has the maximum number of data-refs 1801 with the same alignment, unless the target prefers to align 1802 stores over load. */ 1803 unsigned same_align_drs 1804 = STMT_VINFO_SAME_ALIGN_REFS (stmt_info).length (); 1805 if (!dr0 1806 || same_align_drs_max < same_align_drs) 1807 { 1808 same_align_drs_max = same_align_drs; 1809 dr0 = dr; 1810 } 1811 /* For data-refs with the same number of related 1812 accesses prefer the one where the misalign 1813 computation will be invariant in the outermost loop. */ 1814 else if (same_align_drs_max == same_align_drs) 1815 { 1816 struct loop *ivloop0, *ivloop; 1817 ivloop0 = outermost_invariant_loop_for_expr 1818 (loop, DR_BASE_ADDRESS (dr0)); 1819 ivloop = outermost_invariant_loop_for_expr 1820 (loop, DR_BASE_ADDRESS (dr)); 1821 if ((ivloop && !ivloop0) 1822 || (ivloop && ivloop0 1823 && flow_loop_nested_p (ivloop, ivloop0))) 1824 dr0 = dr; 1825 } 1826 1827 one_misalignment_unknown = true; 1828 1829 /* Check for data refs with unsupportable alignment that 1830 can be peeled. 
*/ 1831 if (!supportable_dr_alignment) 1832 { 1833 one_dr_unsupportable = true; 1834 unsupportable_dr = dr; 1835 } 1836 1837 if (!first_store && DR_IS_WRITE (dr)) 1838 first_store = dr; 1839 } 1840 } 1841 else 1842 { 1843 if (!aligned_access_p (dr)) 1844 { 1845 if (dump_enabled_p ()) 1846 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 1847 "vector alignment may not be reachable\n"); 1848 break; 1849 } 1850 } 1851 } 1852 1853 /* Check if we can possibly peel the loop. */ 1854 if (!vect_can_advance_ivs_p (loop_vinfo) 1855 || !slpeel_can_duplicate_loop_p (loop, single_exit (loop)) 1856 || loop->inner) 1857 do_peeling = false; 1858 1859 struct _vect_peel_extended_info peel_for_known_alignment; 1860 struct _vect_peel_extended_info peel_for_unknown_alignment; 1861 struct _vect_peel_extended_info best_peel; 1862 1863 peel_for_unknown_alignment.inside_cost = INT_MAX; 1864 peel_for_unknown_alignment.outside_cost = INT_MAX; 1865 peel_for_unknown_alignment.peel_info.count = 0; 1866 1867 if (do_peeling 1868 && one_misalignment_unknown) 1869 { 1870 /* Check if the target requires to prefer stores over loads, i.e., if 1871 misaligned stores are more expensive than misaligned loads (taking 1872 drs with same alignment into account). */ 1873 unsigned int load_inside_cost = 0; 1874 unsigned int load_outside_cost = 0; 1875 unsigned int store_inside_cost = 0; 1876 unsigned int store_outside_cost = 0; 1877 unsigned int estimated_npeels = vect_vf_for_cost (loop_vinfo) / 2; 1878 1879 stmt_vector_for_cost dummy; 1880 dummy.create (2); 1881 vect_get_peeling_costs_all_drs (datarefs, dr0, 1882 &load_inside_cost, 1883 &load_outside_cost, 1884 &dummy, estimated_npeels, true); 1885 dummy.release (); 1886 1887 if (first_store) 1888 { 1889 dummy.create (2); 1890 vect_get_peeling_costs_all_drs (datarefs, first_store, 1891 &store_inside_cost, 1892 &store_outside_cost, 1893 &dummy, estimated_npeels, true); 1894 dummy.release (); 1895 } 1896 else 1897 { 1898 store_inside_cost = INT_MAX; 1899 store_outside_cost = INT_MAX; 1900 } 1901 1902 if (load_inside_cost > store_inside_cost 1903 || (load_inside_cost == store_inside_cost 1904 && load_outside_cost > store_outside_cost)) 1905 { 1906 dr0 = first_store; 1907 peel_for_unknown_alignment.inside_cost = store_inside_cost; 1908 peel_for_unknown_alignment.outside_cost = store_outside_cost; 1909 } 1910 else 1911 { 1912 peel_for_unknown_alignment.inside_cost = load_inside_cost; 1913 peel_for_unknown_alignment.outside_cost = load_outside_cost; 1914 } 1915 1916 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec; 1917 prologue_cost_vec.create (2); 1918 epilogue_cost_vec.create (2); 1919 1920 int dummy2; 1921 peel_for_unknown_alignment.outside_cost += vect_get_known_peeling_cost 1922 (loop_vinfo, estimated_npeels, &dummy2, 1923 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), 1924 &prologue_cost_vec, &epilogue_cost_vec); 1925 1926 prologue_cost_vec.release (); 1927 epilogue_cost_vec.release (); 1928 1929 peel_for_unknown_alignment.peel_info.count = 1 1930 + STMT_VINFO_SAME_ALIGN_REFS 1931 (vinfo_for_stmt (DR_STMT (dr0))).length (); 1932 } 1933 1934 peel_for_unknown_alignment.peel_info.npeel = 0; 1935 peel_for_unknown_alignment.peel_info.dr = dr0; 1936 1937 best_peel = peel_for_unknown_alignment; 1938 1939 peel_for_known_alignment.inside_cost = INT_MAX; 1940 peel_for_known_alignment.outside_cost = INT_MAX; 1941 peel_for_known_alignment.peel_info.count = 0; 1942 peel_for_known_alignment.peel_info.dr = NULL; 1943 1944 if (do_peeling && one_misalignment_known) 1945 { 1946 /* 
Peeling is possible, but there is no data access that is not supported 1947 unless aligned. So we try to choose the best possible peeling from 1948 the hash table. */ 1949 peel_for_known_alignment = vect_peeling_hash_choose_best_peeling 1950 (&peeling_htab, loop_vinfo); 1951 } 1952 1953 /* Compare costs of peeling for known and unknown alignment. */ 1954 if (peel_for_known_alignment.peel_info.dr != NULL 1955 && peel_for_unknown_alignment.inside_cost 1956 >= peel_for_known_alignment.inside_cost) 1957 { 1958 best_peel = peel_for_known_alignment; 1959 1960 /* If the best peeling for known alignment has NPEEL == 0, perform no 1961 peeling at all except if there is an unsupportable dr that we can 1962 align. */ 1963 if (best_peel.peel_info.npeel == 0 && !one_dr_unsupportable) 1964 do_peeling = false; 1965 } 1966 1967 /* If there is an unsupportable data ref, prefer this over all choices so far 1968 since we'd have to discard a chosen peeling except when it accidentally 1969 aligned the unsupportable data ref. */ 1970 if (one_dr_unsupportable) 1971 dr0 = unsupportable_dr; 1972 else if (do_peeling) 1973 { 1974 /* Calculate the penalty for no peeling, i.e. leaving everything as-is. 1975 TODO: Use nopeel_outside_cost or get rid of it? */ 1976 unsigned nopeel_inside_cost = 0; 1977 unsigned nopeel_outside_cost = 0; 1978 1979 stmt_vector_for_cost dummy; 1980 dummy.create (2); 1981 vect_get_peeling_costs_all_drs (datarefs, NULL, &nopeel_inside_cost, 1982 &nopeel_outside_cost, &dummy, 0, false); 1983 dummy.release (); 1984 1985 /* Add epilogue costs. As we do not peel for alignment here, no prologue 1986 costs will be recorded. */ 1987 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec; 1988 prologue_cost_vec.create (2); 1989 epilogue_cost_vec.create (2); 1990 1991 int dummy2; 1992 nopeel_outside_cost += vect_get_known_peeling_cost 1993 (loop_vinfo, 0, &dummy2, 1994 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), 1995 &prologue_cost_vec, &epilogue_cost_vec); 1996 1997 prologue_cost_vec.release (); 1998 epilogue_cost_vec.release (); 1999 2000 npeel = best_peel.peel_info.npeel; 2001 dr0 = best_peel.peel_info.dr; 2002 2003 /* If no peeling is not more expensive than the best peeling we 2004 have so far, don't perform any peeling. */ 2005 if (nopeel_inside_cost <= best_peel.inside_cost) 2006 do_peeling = false; 2007 } 2008 2009 if (do_peeling) 2010 { 2011 stmt = DR_STMT (dr0); 2012 stmt_info = vinfo_for_stmt (stmt); 2013 vectype = STMT_VINFO_VECTYPE (stmt_info); 2014 2015 if (known_alignment_for_access_p (dr0)) 2016 { 2017 bool negative = tree_int_cst_compare (DR_STEP (dr0), 2018 size_zero_node) < 0; 2019 if (!npeel) 2020 { 2021 /* Since it's known at compile time, compute the number of 2022 iterations in the peeled loop (the peeling factor) for use in 2023 updating DR_MISALIGNMENT values. The peeling factor is the 2024 vectorization factor minus the misalignment as an element 2025 count. */ 2026 mis = negative ? DR_MISALIGNMENT (dr0) : -DR_MISALIGNMENT (dr0); 2027 unsigned int target_align = DR_TARGET_ALIGNMENT (dr0); 2028 npeel = ((mis & (target_align - 1)) 2029 / vect_get_scalar_dr_size (dr0)); 2030 } 2031 2032 /* For interleaved data access every iteration accesses all the 2033 members of the group, therefore we divide the number of iterations 2034 by the group size. 
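   For example (sizes chosen only for illustration): with 4-byte elements, a
   16-byte target alignment and a misalignment of 4 bytes, the computation
   above gives npeel = ((-4) & 15) / 4 = 3 elements; if the access is the
   first of an interleaving group of size 3 with no gaps, one scalar iteration
   covers the whole 12-byte group, so 3 / 3 = 1 iteration is peeled.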
*/ 2035 stmt_info = vinfo_for_stmt (DR_STMT (dr0)); 2036 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)) 2037 npeel /= GROUP_SIZE (stmt_info); 2038 2039 if (dump_enabled_p ()) 2040 dump_printf_loc (MSG_NOTE, vect_location, 2041 "Try peeling by %d\n", npeel); 2042 } 2043 2044 /* Ensure that all datarefs can be vectorized after the peel. */ 2045 if (!vect_peeling_supportable (loop_vinfo, dr0, npeel)) 2046 do_peeling = false; 2047 2048 /* Check if all datarefs are supportable and log. */ 2049 if (do_peeling && known_alignment_for_access_p (dr0) && npeel == 0) 2050 { 2051 stat = vect_verify_datarefs_alignment (loop_vinfo); 2052 if (!stat) 2053 do_peeling = false; 2054 else 2055 return stat; 2056 } 2057 2058 /* Cost model #1 - honor --param vect-max-peeling-for-alignment. */ 2059 if (do_peeling) 2060 { 2061 unsigned max_allowed_peel 2062 = PARAM_VALUE (PARAM_VECT_MAX_PEELING_FOR_ALIGNMENT); 2063 if (max_allowed_peel != (unsigned)-1) 2064 { 2065 unsigned max_peel = npeel; 2066 if (max_peel == 0) 2067 { 2068 unsigned int target_align = DR_TARGET_ALIGNMENT (dr0); 2069 max_peel = target_align / vect_get_scalar_dr_size (dr0) - 1; 2070 } 2071 if (max_peel > max_allowed_peel) 2072 { 2073 do_peeling = false; 2074 if (dump_enabled_p ()) 2075 dump_printf_loc (MSG_NOTE, vect_location, 2076 "Disable peeling, max peels reached: %d\n", max_peel); 2077 } 2078 } 2079 } 2080 2081 /* Cost model #2 - if peeling may result in a remaining loop not 2082 iterating enough to be vectorized then do not peel. Since this 2083 is a cost heuristic rather than a correctness decision, use the 2084 most likely runtime value for variable vectorization factors. */ 2085 if (do_peeling 2086 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)) 2087 { 2088 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo); 2089 unsigned int max_peel = npeel == 0 ? assumed_vf - 1 : npeel; 2090 if ((unsigned HOST_WIDE_INT) LOOP_VINFO_INT_NITERS (loop_vinfo) 2091 < assumed_vf + max_peel) 2092 do_peeling = false; 2093 } 2094 2095 if (do_peeling) 2096 { 2097 /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i. 2098 If the misalignment of DR_i is identical to that of dr0 then set 2099 DR_MISALIGNMENT (DR_i) to zero. If the misalignment of DR_i and 2100 dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i) 2101 by the peeling factor times the element size of DR_i (MOD the 2102 vectorization factor times the size). Otherwise, the 2103 misalignment of DR_i must be set to unknown. */ 2104 FOR_EACH_VEC_ELT (datarefs, i, dr) 2105 if (dr != dr0) 2106 { 2107 /* Strided accesses perform only component accesses, alignment 2108 is irrelevant for them. */ 2109 stmt_info = vinfo_for_stmt (DR_STMT (dr)); 2110 if (STMT_VINFO_STRIDED_P (stmt_info) 2111 && !STMT_VINFO_GROUPED_ACCESS (stmt_info)) 2112 continue; 2113 2114 vect_update_misalignment_for_peel (dr, dr0, npeel); 2115 } 2116 2117 LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0; 2118 if (npeel) 2119 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel; 2120 else 2121 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) 2122 = DR_MISALIGNMENT (dr0); 2123 SET_DR_MISALIGNMENT (dr0, 0); 2124 if (dump_enabled_p ()) 2125 { 2126 dump_printf_loc (MSG_NOTE, vect_location, 2127 "Alignment of access forced using peeling.\n"); 2128 dump_printf_loc (MSG_NOTE, vect_location, 2129 "Peeling for alignment will be applied.\n"); 2130 } 2131 2132 /* The inside-loop cost will be accounted for in vectorizable_load 2133 and vectorizable_store correctly with adjusted alignments. 2134 Drop the body_cst_vec on the floor here. 
*/ 2135 stat = vect_verify_datarefs_alignment (loop_vinfo); 2136 gcc_assert (stat); 2137 return stat; 2138 } 2139 } 2140 2141 /* (2) Versioning to force alignment. */ 2142 2143 /* Try versioning if: 2144 1) optimize loop for speed 2145 2) there is at least one unsupported misaligned data ref with an unknown 2146 misalignment, and 2147 3) all misaligned data refs with a known misalignment are supported, and 2148 4) the number of runtime alignment checks is within reason. */ 2149 2150 do_versioning = 2151 optimize_loop_nest_for_speed_p (loop) 2152 && (!loop->inner); /* FORNOW */ 2153 2154 if (do_versioning) 2155 { 2156 FOR_EACH_VEC_ELT (datarefs, i, dr) 2157 { 2158 stmt = DR_STMT (dr); 2159 stmt_info = vinfo_for_stmt (stmt); 2160 2161 /* For interleaving, only the alignment of the first access 2162 matters. */ 2163 if (aligned_access_p (dr) 2164 || (STMT_VINFO_GROUPED_ACCESS (stmt_info) 2165 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)) 2166 continue; 2167 2168 if (STMT_VINFO_STRIDED_P (stmt_info)) 2169 { 2170 /* Strided loads perform only component accesses, alignment is 2171 irrelevant for them. */ 2172 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info)) 2173 continue; 2174 do_versioning = false; 2175 break; 2176 } 2177 2178 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false); 2179 2180 if (!supportable_dr_alignment) 2181 { 2182 gimple *stmt; 2183 int mask; 2184 tree vectype; 2185 2186 if (known_alignment_for_access_p (dr) 2187 || LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length () 2188 >= (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS)) 2189 { 2190 do_versioning = false; 2191 break; 2192 } 2193 2194 stmt = DR_STMT (dr); 2195 vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt)); 2196 gcc_assert (vectype); 2197 2198 /* At present we don't support versioning for alignment 2199 with variable VF, since there's no guarantee that the 2200 VF is a power of two. We could relax this if we added 2201 a way of enforcing a power-of-two size. */ 2202 unsigned HOST_WIDE_INT size; 2203 if (!GET_MODE_SIZE (TYPE_MODE (vectype)).is_constant (&size)) 2204 { 2205 do_versioning = false; 2206 break; 2207 } 2208 2209 /* The rightmost bits of an aligned address must be zeros. 2210 Construct the mask needed for this test. For example, 2211 GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the 2212 mask must be 15 = 0xf. */ 2213 mask = size - 1; 2214 2215 /* FORNOW: use the same mask to test all potentially unaligned 2216 references in the loop. The vectorizer currently supports 2217 a single vector size, see the reference to 2218 GET_MODE_NUNITS (TYPE_MODE (vectype)) where the 2219 vectorization factor is computed. */ 2220 gcc_assert (!LOOP_VINFO_PTR_MASK (loop_vinfo) 2221 || LOOP_VINFO_PTR_MASK (loop_vinfo) == mask); 2222 LOOP_VINFO_PTR_MASK (loop_vinfo) = mask; 2223 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push ( 2224 DR_STMT (dr)); 2225 } 2226 } 2227 2228 /* Versioning requires at least one misaligned data reference. */ 2229 if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)) 2230 do_versioning = false; 2231 else if (!do_versioning) 2232 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0); 2233 } 2234 2235 if (do_versioning) 2236 { 2237 vec<gimple *> may_misalign_stmts 2238 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo); 2239 gimple *stmt; 2240 2241 /* It can now be assumed that the data references in the statements 2242 in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version 2243 of the loop being vectorized. 
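   Roughly speaking (an illustrative sketch, not the exact code), the
   versioning condition built later from LOOP_VINFO_PTR_MASK and this list
   has the form
     if (((addr_1 | addr_2 | ... | addr_n) & mask) == 0)
       ... vectorized loop, all listed accesses aligned ...
     else
       ... fallback loop ...
   where mask is the LOOP_VINFO_PTR_MASK computed above (e.g. 15 for 16-byte
   vectors).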
*/ 2244 FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt) 2245 { 2246 stmt_vec_info stmt_info = vinfo_for_stmt (stmt); 2247 dr = STMT_VINFO_DATA_REF (stmt_info); 2248 SET_DR_MISALIGNMENT (dr, 0); 2249 if (dump_enabled_p ()) 2250 dump_printf_loc (MSG_NOTE, vect_location, 2251 "Alignment of access forced using versioning.\n"); 2252 } 2253 2254 if (dump_enabled_p ()) 2255 dump_printf_loc (MSG_NOTE, vect_location, 2256 "Versioning for alignment will be applied.\n"); 2257 2258 /* Peeling and versioning can't be done together at this time. */ 2259 gcc_assert (! (do_peeling && do_versioning)); 2260 2261 stat = vect_verify_datarefs_alignment (loop_vinfo); 2262 gcc_assert (stat); 2263 return stat; 2264 } 2265 2266 /* This point is reached if neither peeling nor versioning is being done. */ 2267 gcc_assert (! (do_peeling || do_versioning)); 2268 2269 stat = vect_verify_datarefs_alignment (loop_vinfo); 2270 return stat; 2271 } 2272 2273 2274 /* Function vect_find_same_alignment_drs. 2275 2276 Update group and alignment relations according to the chosen 2277 vectorization factor. */ 2278 2279 static void 2280 vect_find_same_alignment_drs (struct data_dependence_relation *ddr) 2281 { 2282 struct data_reference *dra = DDR_A (ddr); 2283 struct data_reference *drb = DDR_B (ddr); 2284 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra)); 2285 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb)); 2286 2287 if (DDR_ARE_DEPENDENT (ddr) == chrec_known) 2288 return; 2289 2290 if (dra == drb) 2291 return; 2292 2293 if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0) 2294 || !operand_equal_p (DR_OFFSET (dra), DR_OFFSET (drb), 0) 2295 || !operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0)) 2296 return; 2297 2298 /* Two references with distance zero have the same alignment. */ 2299 poly_offset_int diff = (wi::to_poly_offset (DR_INIT (dra)) 2300 - wi::to_poly_offset (DR_INIT (drb))); 2301 if (maybe_ne (diff, 0)) 2302 { 2303 /* Get the wider of the two alignments. */ 2304 unsigned int align_a = (vect_calculate_target_alignment (dra) 2305 / BITS_PER_UNIT); 2306 unsigned int align_b = (vect_calculate_target_alignment (drb) 2307 / BITS_PER_UNIT); 2308 unsigned int max_align = MAX (align_a, align_b); 2309 2310 /* Require the gap to be a multiple of the larger vector alignment. */ 2311 if (!multiple_p (diff, max_align)) 2312 return; 2313 } 2314 2315 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_a).safe_push (drb); 2316 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_b).safe_push (dra); 2317 if (dump_enabled_p ()) 2318 { 2319 dump_printf_loc (MSG_NOTE, vect_location, 2320 "accesses have the same alignment: "); 2321 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra)); 2322 dump_printf (MSG_NOTE, " and "); 2323 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb)); 2324 dump_printf (MSG_NOTE, "\n"); 2325 } 2326 } 2327 2328 2329 /* Function vect_analyze_data_refs_alignment 2330 2331 Analyze the alignment of the data-references in the loop. 2332 Return FALSE if a data reference is found that cannot be vectorized. */ 2333 2334 bool 2335 vect_analyze_data_refs_alignment (loop_vec_info vinfo) 2336 { 2337 if (dump_enabled_p ()) 2338 dump_printf_loc (MSG_NOTE, vect_location, 2339 "=== vect_analyze_data_refs_alignment ===\n"); 2340 2341 /* Mark groups of data references with same alignment using 2342 data dependence information. 
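   For example (assuming 4-byte elements and a 16-byte target vector
   alignment), a[i] and a[i+4] always share the same misalignment: they have
   the same base, offset and step, and their 16-byte distance is a multiple
   of the vector alignment.  a[i] and a[i+1] do not, because a 4-byte
   distance is not such a multiple.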
*/ 2343 vec<ddr_p> ddrs = vinfo->ddrs; 2344 struct data_dependence_relation *ddr; 2345 unsigned int i; 2346 2347 FOR_EACH_VEC_ELT (ddrs, i, ddr) 2348 vect_find_same_alignment_drs (ddr); 2349 2350 vec<data_reference_p> datarefs = vinfo->datarefs; 2351 struct data_reference *dr; 2352 2353 vect_record_base_alignments (vinfo); 2354 FOR_EACH_VEC_ELT (datarefs, i, dr) 2355 { 2356 stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr)); 2357 if (STMT_VINFO_VECTORIZABLE (stmt_info) 2358 && !vect_compute_data_ref_alignment (dr)) 2359 { 2360 /* Strided accesses perform only component accesses, misalignment 2361 information is irrelevant for them. */ 2362 if (STMT_VINFO_STRIDED_P (stmt_info) 2363 && !STMT_VINFO_GROUPED_ACCESS (stmt_info)) 2364 continue; 2365 2366 if (dump_enabled_p ()) 2367 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 2368 "not vectorized: can't calculate alignment " 2369 "for data ref.\n"); 2370 2371 return false; 2372 } 2373 } 2374 2375 return true; 2376 } 2377 2378 2379 /* Analyze alignment of DRs of stmts in NODE. */ 2380 2381 static bool 2382 vect_slp_analyze_and_verify_node_alignment (slp_tree node) 2383 { 2384 /* We vectorize from the first scalar stmt in the node unless 2385 the node is permuted in which case we start from the first 2386 element in the group. */ 2387 gimple *first_stmt = SLP_TREE_SCALAR_STMTS (node)[0]; 2388 data_reference_p first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt)); 2389 if (SLP_TREE_LOAD_PERMUTATION (node).exists ()) 2390 first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (first_stmt)); 2391 2392 data_reference_p dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt)); 2393 if (! vect_compute_data_ref_alignment (dr) 2394 /* For creating the data-ref pointer we need alignment of the 2395 first element anyway. */ 2396 || (dr != first_dr 2397 && ! vect_compute_data_ref_alignment (first_dr)) 2398 || ! verify_data_ref_alignment (dr)) 2399 { 2400 if (dump_enabled_p ()) 2401 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 2402 "not vectorized: bad data alignment in basic " 2403 "block.\n"); 2404 return false; 2405 } 2406 2407 return true; 2408 } 2409 2410 /* Function vect_slp_analyze_instance_alignment 2411 2412 Analyze the alignment of the data-references in the SLP instance. 2413 Return FALSE if a data reference is found that cannot be vectorized. */ 2414 2415 bool 2416 vect_slp_analyze_and_verify_instance_alignment (slp_instance instance) 2417 { 2418 if (dump_enabled_p ()) 2419 dump_printf_loc (MSG_NOTE, vect_location, 2420 "=== vect_slp_analyze_and_verify_instance_alignment ===\n"); 2421 2422 slp_tree node; 2423 unsigned i; 2424 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, node) 2425 if (! vect_slp_analyze_and_verify_node_alignment (node)) 2426 return false; 2427 2428 node = SLP_INSTANCE_TREE (instance); 2429 if (STMT_VINFO_DATA_REF (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (node)[0])) 2430 && ! vect_slp_analyze_and_verify_node_alignment 2431 (SLP_INSTANCE_TREE (instance))) 2432 return false; 2433 2434 return true; 2435 } 2436 2437 2438 /* Analyze groups of accesses: check that DR belongs to a group of 2439 accesses of legal size, step, etc. Detect gaps, single element 2440 interleaving, and other special cases. Set grouped access info. 2441 Collect groups of strided stores for further use in SLP analysis. 2442 Worker for vect_analyze_group_access. 
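   For example, in
     for (i=0; i<N; i++){
       a[2*i]   = x;        # DR_STEP = 2 * sizeof (a[0])
       a[2*i+1] = y;
     }
   the two stores form an interleaving group with GROUP_SIZE 2 and no gaps;
   store groups with gaps are rejected below, while gaps are tolerated for
   load groups.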
*/ 2443 2444 static bool 2445 vect_analyze_group_access_1 (struct data_reference *dr) 2446 { 2447 tree step = DR_STEP (dr); 2448 tree scalar_type = TREE_TYPE (DR_REF (dr)); 2449 HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type)); 2450 gimple *stmt = DR_STMT (dr); 2451 stmt_vec_info stmt_info = vinfo_for_stmt (stmt); 2452 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); 2453 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info); 2454 HOST_WIDE_INT dr_step = -1; 2455 HOST_WIDE_INT groupsize, last_accessed_element = 1; 2456 bool slp_impossible = false; 2457 2458 /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the 2459 size of the interleaving group (including gaps). */ 2460 if (tree_fits_shwi_p (step)) 2461 { 2462 dr_step = tree_to_shwi (step); 2463 /* Check that STEP is a multiple of type size. Otherwise there is 2464 a non-element-sized gap at the end of the group which we 2465 cannot represent in GROUP_GAP or GROUP_SIZE. 2466 ??? As we can handle non-constant step fine here we should 2467 simply remove uses of GROUP_GAP between the last and first 2468 element and instead rely on DR_STEP. GROUP_SIZE then would 2469 simply not include that gap. */ 2470 if ((dr_step % type_size) != 0) 2471 { 2472 if (dump_enabled_p ()) 2473 { 2474 dump_printf_loc (MSG_NOTE, vect_location, 2475 "Step "); 2476 dump_generic_expr (MSG_NOTE, TDF_SLIM, step); 2477 dump_printf (MSG_NOTE, 2478 " is not a multiple of the element size for "); 2479 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr)); 2480 dump_printf (MSG_NOTE, "\n"); 2481 } 2482 return false; 2483 } 2484 groupsize = absu_hwi (dr_step) / type_size; 2485 } 2486 else 2487 groupsize = 0; 2488 2489 /* Not consecutive access is possible only if it is a part of interleaving. */ 2490 if (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt))) 2491 { 2492 /* Check if it this DR is a part of interleaving, and is a single 2493 element of the group that is accessed in the loop. */ 2494 2495 /* Gaps are supported only for loads. STEP must be a multiple of the type 2496 size. */ 2497 if (DR_IS_READ (dr) 2498 && (dr_step % type_size) == 0 2499 && groupsize > 0) 2500 { 2501 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = stmt; 2502 GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize; 2503 GROUP_GAP (stmt_info) = groupsize - 1; 2504 if (dump_enabled_p ()) 2505 { 2506 dump_printf_loc (MSG_NOTE, vect_location, 2507 "Detected single element interleaving "); 2508 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr)); 2509 dump_printf (MSG_NOTE, " step "); 2510 dump_generic_expr (MSG_NOTE, TDF_SLIM, step); 2511 dump_printf (MSG_NOTE, "\n"); 2512 } 2513 2514 return true; 2515 } 2516 2517 if (dump_enabled_p ()) 2518 { 2519 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 2520 "not consecutive access "); 2521 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0); 2522 } 2523 2524 if (bb_vinfo) 2525 { 2526 /* Mark the statement as unvectorizable. */ 2527 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false; 2528 return true; 2529 } 2530 2531 dump_printf_loc (MSG_NOTE, vect_location, "using strided accesses\n"); 2532 STMT_VINFO_STRIDED_P (stmt_info) = true; 2533 return true; 2534 } 2535 2536 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) == stmt) 2537 { 2538 /* First stmt in the interleaving chain. Check the chain. 
*/ 2539 gimple *next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt)); 2540 struct data_reference *data_ref = dr; 2541 unsigned int count = 1; 2542 tree prev_init = DR_INIT (data_ref); 2543 gimple *prev = stmt; 2544 HOST_WIDE_INT diff, gaps = 0; 2545 2546 /* By construction, all group members have INTEGER_CST DR_INITs. */ 2547 while (next) 2548 { 2549 /* Skip same data-refs. In case that two or more stmts share 2550 data-ref (supported only for loads), we vectorize only the first 2551 stmt, and the rest get their vectorized loads from the first 2552 one. */ 2553 if (!tree_int_cst_compare (DR_INIT (data_ref), 2554 DR_INIT (STMT_VINFO_DATA_REF ( 2555 vinfo_for_stmt (next))))) 2556 { 2557 if (DR_IS_WRITE (data_ref)) 2558 { 2559 if (dump_enabled_p ()) 2560 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 2561 "Two store stmts share the same dr.\n"); 2562 return false; 2563 } 2564 2565 if (dump_enabled_p ()) 2566 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 2567 "Two or more load stmts share the same dr.\n"); 2568 2569 /* For load use the same data-ref load. */ 2570 GROUP_SAME_DR_STMT (vinfo_for_stmt (next)) = prev; 2571 2572 prev = next; 2573 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next)); 2574 continue; 2575 } 2576 2577 prev = next; 2578 data_ref = STMT_VINFO_DATA_REF (vinfo_for_stmt (next)); 2579 2580 /* All group members have the same STEP by construction. */ 2581 gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0)); 2582 2583 /* Check that the distance between two accesses is equal to the type 2584 size. Otherwise, we have gaps. */ 2585 diff = (TREE_INT_CST_LOW (DR_INIT (data_ref)) 2586 - TREE_INT_CST_LOW (prev_init)) / type_size; 2587 if (diff != 1) 2588 { 2589 /* FORNOW: SLP of accesses with gaps is not supported. */ 2590 slp_impossible = true; 2591 if (DR_IS_WRITE (data_ref)) 2592 { 2593 if (dump_enabled_p ()) 2594 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 2595 "interleaved store with gaps\n"); 2596 return false; 2597 } 2598 2599 gaps += diff - 1; 2600 } 2601 2602 last_accessed_element += diff; 2603 2604 /* Store the gap from the previous member of the group. If there is no 2605 gap in the access, GROUP_GAP is always 1. */ 2606 GROUP_GAP (vinfo_for_stmt (next)) = diff; 2607 2608 prev_init = DR_INIT (data_ref); 2609 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next)); 2610 /* Count the number of data-refs in the chain. */ 2611 count++; 2612 } 2613 2614 if (groupsize == 0) 2615 groupsize = count + gaps; 2616 2617 /* This could be UINT_MAX but as we are generating code in a very 2618 inefficient way we have to cap earlier. See PR78699 for example. */ 2619 if (groupsize > 4096) 2620 { 2621 if (dump_enabled_p ()) 2622 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 2623 "group is too large\n"); 2624 return false; 2625 } 2626 2627 /* Check that the size of the interleaving is equal to count for stores, 2628 i.e., that there are no gaps. */ 2629 if (groupsize != count 2630 && !DR_IS_READ (dr)) 2631 { 2632 if (dump_enabled_p ()) 2633 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 2634 "interleaved store with gaps\n"); 2635 return false; 2636 } 2637 2638 /* If there is a gap after the last load in the group it is the 2639 difference between the groupsize and the last accessed 2640 element. 2641 When there is no gap, this difference should be 0. 
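   For example, a group built from loads of a[3*i] and a[3*i+1] has a group
   size of 3 (the step in elements) but only touches the first two elements
   of each period, so the last accessed element is 2 and a trailing gap of
   3 - 2 = 1 is recorded; for a[3*i] and a[3*i+2] the gap lies inside the
   group instead (the second member gets GROUP_GAP 2) and the trailing gap
   is 0.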
*/ 2642 GROUP_GAP (vinfo_for_stmt (stmt)) = groupsize - last_accessed_element; 2643 2644 GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize; 2645 if (dump_enabled_p ()) 2646 { 2647 dump_printf_loc (MSG_NOTE, vect_location, 2648 "Detected interleaving "); 2649 if (DR_IS_READ (dr)) 2650 dump_printf (MSG_NOTE, "load "); 2651 else 2652 dump_printf (MSG_NOTE, "store "); 2653 dump_printf (MSG_NOTE, "of size %u starting with ", 2654 (unsigned)groupsize); 2655 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0); 2656 if (GROUP_GAP (vinfo_for_stmt (stmt)) != 0) 2657 dump_printf_loc (MSG_NOTE, vect_location, 2658 "There is a gap of %u elements after the group\n", 2659 GROUP_GAP (vinfo_for_stmt (stmt))); 2660 } 2661 2662 /* SLP: create an SLP data structure for every interleaving group of 2663 stores for further analysis in vect_analyse_slp. */ 2664 if (DR_IS_WRITE (dr) && !slp_impossible) 2665 { 2666 if (loop_vinfo) 2667 LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt); 2668 if (bb_vinfo) 2669 BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt); 2670 } 2671 } 2672 2673 return true; 2674 } 2675 2676 /* Analyze groups of accesses: check that DR belongs to a group of 2677 accesses of legal size, step, etc. Detect gaps, single element 2678 interleaving, and other special cases. Set grouped access info. 2679 Collect groups of strided stores for further use in SLP analysis. */ 2680 2681 static bool 2682 vect_analyze_group_access (struct data_reference *dr) 2683 { 2684 if (!vect_analyze_group_access_1 (dr)) 2685 { 2686 /* Dissolve the group if present. */ 2687 gimple *next; 2688 gimple *stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dr))); 2689 while (stmt) 2690 { 2691 stmt_vec_info vinfo = vinfo_for_stmt (stmt); 2692 next = GROUP_NEXT_ELEMENT (vinfo); 2693 GROUP_FIRST_ELEMENT (vinfo) = NULL; 2694 GROUP_NEXT_ELEMENT (vinfo) = NULL; 2695 stmt = next; 2696 } 2697 return false; 2698 } 2699 return true; 2700 } 2701 2702 /* Analyze the access pattern of the data-reference DR. 2703 In case of non-consecutive accesses call vect_analyze_group_access() to 2704 analyze groups of accesses. */ 2705 2706 static bool 2707 vect_analyze_data_ref_access (struct data_reference *dr) 2708 { 2709 tree step = DR_STEP (dr); 2710 tree scalar_type = TREE_TYPE (DR_REF (dr)); 2711 gimple *stmt = DR_STMT (dr); 2712 stmt_vec_info stmt_info = vinfo_for_stmt (stmt); 2713 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); 2714 struct loop *loop = NULL; 2715 2716 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info)) 2717 return true; 2718 2719 if (loop_vinfo) 2720 loop = LOOP_VINFO_LOOP (loop_vinfo); 2721 2722 if (loop_vinfo && !step) 2723 { 2724 if (dump_enabled_p ()) 2725 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 2726 "bad data-ref access in loop\n"); 2727 return false; 2728 } 2729 2730 /* Allow loads with zero step in inner-loop vectorization. */ 2731 if (loop_vinfo && integer_zerop (step)) 2732 { 2733 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL; 2734 if (!nested_in_vect_loop_p (loop, stmt)) 2735 return DR_IS_READ (dr); 2736 /* Allow references with zero step for outer loops marked 2737 with pragma omp simd only - it guarantees absence of 2738 loop-carried dependencies between inner loop iterations. 
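   For example:
     #pragma omp simd
     for (i=0; i<N; i++)
       for (j=0; j<M; j++)
         ... = *p;          # address invariant, zero step in the inner loop
   Here the simd pragma is what sets loop->safelen and thereby allows
   outer-loop vectorization of such a reference; without it we conservatively
   give up.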
*/ 2739 if (loop->safelen < 2) 2740 { 2741 if (dump_enabled_p ()) 2742 dump_printf_loc (MSG_NOTE, vect_location, 2743 "zero step in inner loop of nest\n"); 2744 return false; 2745 } 2746 } 2747 2748 if (loop && nested_in_vect_loop_p (loop, stmt)) 2749 { 2750 /* Interleaved accesses are not yet supported within outer-loop 2751 vectorization for references in the inner-loop. */ 2752 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL; 2753 2754 /* For the rest of the analysis we use the outer-loop step. */ 2755 step = STMT_VINFO_DR_STEP (stmt_info); 2756 if (integer_zerop (step)) 2757 { 2758 if (dump_enabled_p ()) 2759 dump_printf_loc (MSG_NOTE, vect_location, 2760 "zero step in outer loop.\n"); 2761 return DR_IS_READ (dr); 2762 } 2763 } 2764 2765 /* Consecutive? */ 2766 if (TREE_CODE (step) == INTEGER_CST) 2767 { 2768 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step); 2769 if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type)) 2770 || (dr_step < 0 2771 && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step))) 2772 { 2773 /* Mark that it is not interleaving. */ 2774 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL; 2775 return true; 2776 } 2777 } 2778 2779 if (loop && nested_in_vect_loop_p (loop, stmt)) 2780 { 2781 if (dump_enabled_p ()) 2782 dump_printf_loc (MSG_NOTE, vect_location, 2783 "grouped access in outer loop.\n"); 2784 return false; 2785 } 2786 2787 2788 /* Assume this is a DR handled by non-constant strided load case. */ 2789 if (TREE_CODE (step) != INTEGER_CST) 2790 return (STMT_VINFO_STRIDED_P (stmt_info) 2791 && (!STMT_VINFO_GROUPED_ACCESS (stmt_info) 2792 || vect_analyze_group_access (dr))); 2793 2794 /* Not consecutive access - check if it's a part of interleaving group. */ 2795 return vect_analyze_group_access (dr); 2796 } 2797 2798 /* Compare two data-references DRA and DRB to group them into chunks 2799 suitable for grouping. */ 2800 2801 static int 2802 dr_group_sort_cmp (const void *dra_, const void *drb_) 2803 { 2804 data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_); 2805 data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_); 2806 int cmp; 2807 2808 /* Stabilize sort. */ 2809 if (dra == drb) 2810 return 0; 2811 2812 /* DRs in different loops never belong to the same group. */ 2813 loop_p loopa = gimple_bb (DR_STMT (dra))->loop_father; 2814 loop_p loopb = gimple_bb (DR_STMT (drb))->loop_father; 2815 if (loopa != loopb) 2816 return loopa->num < loopb->num ? -1 : 1; 2817 2818 /* Ordering of DRs according to base. */ 2819 cmp = data_ref_compare_tree (DR_BASE_ADDRESS (dra), 2820 DR_BASE_ADDRESS (drb)); 2821 if (cmp != 0) 2822 return cmp; 2823 2824 /* And according to DR_OFFSET. */ 2825 cmp = data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb)); 2826 if (cmp != 0) 2827 return cmp; 2828 2829 /* Put reads before writes. */ 2830 if (DR_IS_READ (dra) != DR_IS_READ (drb)) 2831 return DR_IS_READ (dra) ? -1 : 1; 2832 2833 /* Then sort after access size. */ 2834 cmp = data_ref_compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))), 2835 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)))); 2836 if (cmp != 0) 2837 return cmp; 2838 2839 /* And after step. */ 2840 cmp = data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb)); 2841 if (cmp != 0) 2842 return cmp; 2843 2844 /* Then sort after DR_INIT. In case of identical DRs sort after stmt UID. */ 2845 cmp = data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb)); 2846 if (cmp == 0) 2847 return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? 
-1 : 1; 2848 return cmp; 2849 } 2850 2851 /* If OP is the result of a conversion, return the unconverted value, 2852 otherwise return null. */ 2853 2854 static tree 2855 strip_conversion (tree op) 2856 { 2857 if (TREE_CODE (op) != SSA_NAME) 2858 return NULL_TREE; 2859 gimple *stmt = SSA_NAME_DEF_STMT (op); 2860 if (!is_gimple_assign (stmt) 2861 || !CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (stmt))) 2862 return NULL_TREE; 2863 return gimple_assign_rhs1 (stmt); 2864 } 2865 2866 /* Return true if vectorizable_* routines can handle statements STMT1 2867 and STMT2 being in a single group. */ 2868 2869 static bool 2870 can_group_stmts_p (gimple *stmt1, gimple *stmt2) 2871 { 2872 if (gimple_assign_single_p (stmt1)) 2873 return gimple_assign_single_p (stmt2); 2874 2875 if (is_gimple_call (stmt1) && gimple_call_internal_p (stmt1)) 2876 { 2877 /* Check for two masked loads or two masked stores. */ 2878 if (!is_gimple_call (stmt2) || !gimple_call_internal_p (stmt2)) 2879 return false; 2880 internal_fn ifn = gimple_call_internal_fn (stmt1); 2881 if (ifn != IFN_MASK_LOAD && ifn != IFN_MASK_STORE) 2882 return false; 2883 if (ifn != gimple_call_internal_fn (stmt2)) 2884 return false; 2885 2886 /* Check that the masks are the same. Cope with casts of masks, 2887 like those created by build_mask_conversion. */ 2888 tree mask1 = gimple_call_arg (stmt1, 2); 2889 tree mask2 = gimple_call_arg (stmt2, 2); 2890 if (!operand_equal_p (mask1, mask2, 0)) 2891 { 2892 mask1 = strip_conversion (mask1); 2893 if (!mask1) 2894 return false; 2895 mask2 = strip_conversion (mask2); 2896 if (!mask2) 2897 return false; 2898 if (!operand_equal_p (mask1, mask2, 0)) 2899 return false; 2900 } 2901 return true; 2902 } 2903 2904 return false; 2905 } 2906 2907 /* Function vect_analyze_data_ref_accesses. 2908 2909 Analyze the access pattern of all the data references in the loop. 2910 2911 FORNOW: the only access pattern that is considered vectorizable is a 2912 simple step 1 (consecutive) access. 2913 2914 FORNOW: handle only arrays and pointer accesses. */ 2915 2916 bool 2917 vect_analyze_data_ref_accesses (vec_info *vinfo) 2918 { 2919 unsigned int i; 2920 vec<data_reference_p> datarefs = vinfo->datarefs; 2921 struct data_reference *dr; 2922 2923 if (dump_enabled_p ()) 2924 dump_printf_loc (MSG_NOTE, vect_location, 2925 "=== vect_analyze_data_ref_accesses ===\n"); 2926 2927 if (datarefs.is_empty ()) 2928 return true; 2929 2930 /* Sort the array of datarefs to make building the interleaving chains 2931 linear. Don't modify the original vector's order, it is needed for 2932 determining what dependencies are reversed. */ 2933 vec<data_reference_p> datarefs_copy = datarefs.copy (); 2934 datarefs_copy.qsort (dr_group_sort_cmp); 2935 2936 /* Build the interleaving chains. */ 2937 for (i = 0; i < datarefs_copy.length () - 1;) 2938 { 2939 data_reference_p dra = datarefs_copy[i]; 2940 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra)); 2941 stmt_vec_info lastinfo = NULL; 2942 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a) 2943 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)) 2944 { 2945 ++i; 2946 continue; 2947 } 2948 for (i = i + 1; i < datarefs_copy.length (); ++i) 2949 { 2950 data_reference_p drb = datarefs_copy[i]; 2951 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb)); 2952 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_b) 2953 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b)) 2954 break; 2955 2956 /* ??? 
Imperfect sorting (non-compatible types, non-modulo 2957 accesses, same accesses) can lead to a group to be artificially 2958 split here as we don't just skip over those. If it really 2959 matters we can push those to a worklist and re-iterate 2960 over them. The we can just skip ahead to the next DR here. */ 2961 2962 /* DRs in a different loop should not be put into the same 2963 interleaving group. */ 2964 if (gimple_bb (DR_STMT (dra))->loop_father 2965 != gimple_bb (DR_STMT (drb))->loop_father) 2966 break; 2967 2968 /* Check that the data-refs have same first location (except init) 2969 and they are both either store or load (not load and store, 2970 not masked loads or stores). */ 2971 if (DR_IS_READ (dra) != DR_IS_READ (drb) 2972 || data_ref_compare_tree (DR_BASE_ADDRESS (dra), 2973 DR_BASE_ADDRESS (drb)) != 0 2974 || data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb)) != 0 2975 || !can_group_stmts_p (DR_STMT (dra), DR_STMT (drb))) 2976 break; 2977 2978 /* Check that the data-refs have the same constant size. */ 2979 tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))); 2980 tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))); 2981 if (!tree_fits_uhwi_p (sza) 2982 || !tree_fits_uhwi_p (szb) 2983 || !tree_int_cst_equal (sza, szb)) 2984 break; 2985 2986 /* Check that the data-refs have the same step. */ 2987 if (data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb)) != 0) 2988 break; 2989 2990 /* Check the types are compatible. 2991 ??? We don't distinguish this during sorting. */ 2992 if (!types_compatible_p (TREE_TYPE (DR_REF (dra)), 2993 TREE_TYPE (DR_REF (drb)))) 2994 break; 2995 2996 /* Check that the DR_INITs are compile-time constants. */ 2997 if (TREE_CODE (DR_INIT (dra)) != INTEGER_CST 2998 || TREE_CODE (DR_INIT (drb)) != INTEGER_CST) 2999 break; 3000 3001 /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb). */ 3002 HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra)); 3003 HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb)); 3004 HOST_WIDE_INT init_prev 3005 = TREE_INT_CST_LOW (DR_INIT (datarefs_copy[i-1])); 3006 gcc_assert (init_a <= init_b 3007 && init_a <= init_prev 3008 && init_prev <= init_b); 3009 3010 /* Do not place the same access in the interleaving chain twice. */ 3011 if (init_b == init_prev) 3012 { 3013 gcc_assert (gimple_uid (DR_STMT (datarefs_copy[i-1])) 3014 < gimple_uid (DR_STMT (drb))); 3015 /* ??? For now we simply "drop" the later reference which is 3016 otherwise the same rather than finishing off this group. 3017 In the end we'd want to re-process duplicates forming 3018 multiple groups from the refs, likely by just collecting 3019 all candidates (including duplicates and split points 3020 below) in a vector and then process them together. */ 3021 continue; 3022 } 3023 3024 /* If init_b == init_a + the size of the type * k, we have an 3025 interleaving, and DRA is accessed before DRB. */ 3026 HOST_WIDE_INT type_size_a = tree_to_uhwi (sza); 3027 if (type_size_a == 0 3028 || (init_b - init_a) % type_size_a != 0) 3029 break; 3030 3031 /* If we have a store, the accesses are adjacent. This splits 3032 groups into chunks we support (we don't support vectorization 3033 of stores with gaps). */ 3034 if (!DR_IS_READ (dra) && init_b - init_prev != type_size_a) 3035 break; 3036 3037 /* If the step (if not zero or non-constant) is greater than the 3038 difference between data-refs' inits this splits groups into 3039 suitable sizes. 
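   For example, with 4-byte elements a[i] and a[i+2] (DR_STEP 4, DR_INITs
   8 bytes apart) are not grouped here: the step does not exceed the distance,
   so later iterations of the first reference already cover the elements in
   between and the pair is not an interleaving pattern.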
*/ 3040 if (tree_fits_shwi_p (DR_STEP (dra))) 3041 { 3042 HOST_WIDE_INT step = tree_to_shwi (DR_STEP (dra)); 3043 if (step != 0 && step <= (init_b - init_a)) 3044 break; 3045 } 3046 3047 if (dump_enabled_p ()) 3048 { 3049 dump_printf_loc (MSG_NOTE, vect_location, 3050 "Detected interleaving "); 3051 if (DR_IS_READ (dra)) 3052 dump_printf (MSG_NOTE, "load "); 3053 else 3054 dump_printf (MSG_NOTE, "store "); 3055 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra)); 3056 dump_printf (MSG_NOTE, " and "); 3057 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb)); 3058 dump_printf (MSG_NOTE, "\n"); 3059 } 3060 3061 /* Link the found element into the group list. */ 3062 if (!GROUP_FIRST_ELEMENT (stmtinfo_a)) 3063 { 3064 GROUP_FIRST_ELEMENT (stmtinfo_a) = DR_STMT (dra); 3065 lastinfo = stmtinfo_a; 3066 } 3067 GROUP_FIRST_ELEMENT (stmtinfo_b) = DR_STMT (dra); 3068 GROUP_NEXT_ELEMENT (lastinfo) = DR_STMT (drb); 3069 lastinfo = stmtinfo_b; 3070 } 3071 } 3072 3073 FOR_EACH_VEC_ELT (datarefs_copy, i, dr) 3074 if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) 3075 && !vect_analyze_data_ref_access (dr)) 3076 { 3077 if (dump_enabled_p ()) 3078 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 3079 "not vectorized: complicated access pattern.\n"); 3080 3081 if (is_a <bb_vec_info> (vinfo)) 3082 { 3083 /* Mark the statement as not vectorizable. */ 3084 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false; 3085 continue; 3086 } 3087 else 3088 { 3089 datarefs_copy.release (); 3090 return false; 3091 } 3092 } 3093 3094 datarefs_copy.release (); 3095 return true; 3096 } 3097 3098 /* Function vect_vfa_segment_size. 3099 3100 Input: 3101 DR: The data reference. 3102 LENGTH_FACTOR: segment length to consider. 3103 3104 Return a value suitable for the dr_with_seg_len::seg_len field. 3105 This is the "distance travelled" by the pointer from the first 3106 iteration in the segment to the last. Note that it does not include 3107 the size of the access; in effect it only describes the first byte. */ 3108 3109 static tree 3110 vect_vfa_segment_size (struct data_reference *dr, tree length_factor) 3111 { 3112 length_factor = size_binop (MINUS_EXPR, 3113 fold_convert (sizetype, length_factor), 3114 size_one_node); 3115 return size_binop (MULT_EXPR, fold_convert (sizetype, DR_STEP (dr)), 3116 length_factor); 3117 } 3118 3119 /* Return a value that, when added to abs (vect_vfa_segment_size (dr)), 3120 gives the worst-case number of bytes covered by the segment. */ 3121 3122 static unsigned HOST_WIDE_INT 3123 vect_vfa_access_size (data_reference *dr) 3124 { 3125 stmt_vec_info stmt_vinfo = vinfo_for_stmt (DR_STMT (dr)); 3126 tree ref_type = TREE_TYPE (DR_REF (dr)); 3127 unsigned HOST_WIDE_INT ref_size = tree_to_uhwi (TYPE_SIZE_UNIT (ref_type)); 3128 unsigned HOST_WIDE_INT access_size = ref_size; 3129 if (GROUP_FIRST_ELEMENT (stmt_vinfo)) 3130 { 3131 gcc_assert (GROUP_FIRST_ELEMENT (stmt_vinfo) == DR_STMT (dr)); 3132 access_size *= GROUP_SIZE (stmt_vinfo) - GROUP_GAP (stmt_vinfo); 3133 } 3134 if (STMT_VINFO_VEC_STMT (stmt_vinfo) 3135 && (vect_supportable_dr_alignment (dr, false) 3136 == dr_explicit_realign_optimized)) 3137 { 3138 /* We might access a full vector's worth. */ 3139 tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo); 3140 access_size += tree_to_uhwi (TYPE_SIZE_UNIT (vectype)) - ref_size; 3141 } 3142 return access_size; 3143 } 3144 3145 /* Get the minimum alignment for all the scalar accesses that DR describes. 
*/ 3146 3147 static unsigned int 3148 vect_vfa_align (const data_reference *dr) 3149 { 3150 return TYPE_ALIGN_UNIT (TREE_TYPE (DR_REF (dr))); 3151 } 3152 3153 /* Function vect_compile_time_alias. 3154 3155 Given data references A and B with equal base and offset, see whether 3156 the alias relation can be decided at compilation time. Return 1 if 3157 it can and the references alias, 0 if it can and the references do 3158 not alias, and -1 if we cannot decide at compile time. SEGMENT_LENGTH_A, 3159 SEGMENT_LENGTH_B, ACCESS_SIZE_A and ACCESS_SIZE_B are the equivalent 3160 of dr_with_seg_len::{seg_len,access_size} for A and B. */ 3161 3162 static int 3163 vect_compile_time_alias (struct data_reference *a, struct data_reference *b, 3164 tree segment_length_a, tree segment_length_b, 3165 unsigned HOST_WIDE_INT access_size_a, 3166 unsigned HOST_WIDE_INT access_size_b) 3167 { 3168 poly_offset_int offset_a = wi::to_poly_offset (DR_INIT (a)); 3169 poly_offset_int offset_b = wi::to_poly_offset (DR_INIT (b)); 3170 poly_uint64 const_length_a; 3171 poly_uint64 const_length_b; 3172 3173 /* For negative step, we need to adjust address range by TYPE_SIZE_UNIT 3174 bytes, e.g., int a[3] -> a[1] range is [a+4, a+16) instead of 3175 [a, a+12) */ 3176 if (tree_int_cst_compare (DR_STEP (a), size_zero_node) < 0) 3177 { 3178 const_length_a = (-wi::to_poly_wide (segment_length_a)).force_uhwi (); 3179 offset_a = (offset_a + access_size_a) - const_length_a; 3180 } 3181 else 3182 const_length_a = tree_to_poly_uint64 (segment_length_a); 3183 if (tree_int_cst_compare (DR_STEP (b), size_zero_node) < 0) 3184 { 3185 const_length_b = (-wi::to_poly_wide (segment_length_b)).force_uhwi (); 3186 offset_b = (offset_b + access_size_b) - const_length_b; 3187 } 3188 else 3189 const_length_b = tree_to_poly_uint64 (segment_length_b); 3190 3191 const_length_a += access_size_a; 3192 const_length_b += access_size_b; 3193 3194 if (ranges_known_overlap_p (offset_a, const_length_a, 3195 offset_b, const_length_b)) 3196 return 1; 3197 3198 if (!ranges_maybe_overlap_p (offset_a, const_length_a, 3199 offset_b, const_length_b)) 3200 return 0; 3201 3202 return -1; 3203 } 3204 3205 /* Return true if the minimum nonzero dependence distance for loop LOOP_DEPTH 3206 in DDR is >= VF. */ 3207 3208 static bool 3209 dependence_distance_ge_vf (data_dependence_relation *ddr, 3210 unsigned int loop_depth, poly_uint64 vf) 3211 { 3212 if (DDR_ARE_DEPENDENT (ddr) != NULL_TREE 3213 || DDR_NUM_DIST_VECTS (ddr) == 0) 3214 return false; 3215 3216 /* If the dependence is exact, we should have limited the VF instead. */ 3217 gcc_checking_assert (DDR_COULD_BE_INDEPENDENT_P (ddr)); 3218 3219 unsigned int i; 3220 lambda_vector dist_v; 3221 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v) 3222 { 3223 HOST_WIDE_INT dist = dist_v[loop_depth]; 3224 if (dist != 0 3225 && !(dist > 0 && DDR_REVERSED_P (ddr)) 3226 && maybe_lt ((unsigned HOST_WIDE_INT) abs_hwi (dist), vf)) 3227 return false; 3228 } 3229 3230 if (dump_enabled_p ()) 3231 { 3232 dump_printf_loc (MSG_NOTE, vect_location, 3233 "dependence distance between "); 3234 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_A (ddr))); 3235 dump_printf (MSG_NOTE, " and "); 3236 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_B (ddr))); 3237 dump_printf (MSG_NOTE, " is >= VF\n"); 3238 } 3239 3240 return true; 3241 } 3242 3243 /* Dump LOWER_BOUND using flags DUMP_KIND. Dumps are known to be enabled.
*/ 3244 3245 static void 3246 dump_lower_bound (int dump_kind, const vec_lower_bound &lower_bound) 3247 { 3248 dump_printf (dump_kind, "%s (", lower_bound.unsigned_p ? "unsigned" : "abs"); 3249 dump_generic_expr (dump_kind, TDF_SLIM, lower_bound.expr); 3250 dump_printf (dump_kind, ") >= "); 3251 dump_dec (dump_kind, lower_bound.min_value); 3252 } 3253 3254 /* Record that the vectorized loop requires the vec_lower_bound described 3255 by EXPR, UNSIGNED_P and MIN_VALUE. */ 3256 3257 static void 3258 vect_check_lower_bound (loop_vec_info loop_vinfo, tree expr, bool unsigned_p, 3259 poly_uint64 min_value) 3260 { 3261 vec<vec_lower_bound> lower_bounds = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo); 3262 for (unsigned int i = 0; i < lower_bounds.length (); ++i) 3263 if (operand_equal_p (lower_bounds[i].expr, expr, 0)) 3264 { 3265 unsigned_p &= lower_bounds[i].unsigned_p; 3266 min_value = upper_bound (lower_bounds[i].min_value, min_value); 3267 if (lower_bounds[i].unsigned_p != unsigned_p 3268 || maybe_lt (lower_bounds[i].min_value, min_value)) 3269 { 3270 lower_bounds[i].unsigned_p = unsigned_p; 3271 lower_bounds[i].min_value = min_value; 3272 if (dump_enabled_p ()) 3273 { 3274 dump_printf_loc (MSG_NOTE, vect_location, 3275 "updating run-time check to "); 3276 dump_lower_bound (MSG_NOTE, lower_bounds[i]); 3277 dump_printf (MSG_NOTE, "\n"); 3278 } 3279 } 3280 return; 3281 } 3282 3283 vec_lower_bound lower_bound (expr, unsigned_p, min_value); 3284 if (dump_enabled_p ()) 3285 { 3286 dump_printf_loc (MSG_NOTE, vect_location, "need a run-time check that "); 3287 dump_lower_bound (MSG_NOTE, lower_bound); 3288 dump_printf (MSG_NOTE, "\n"); 3289 } 3290 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).safe_push (lower_bound); 3291 } 3292 3293 /* Return true if it's unlikely that the step of the vectorized form of DR 3294 will span fewer than GAP bytes. */ 3295 3296 static bool 3297 vect_small_gap_p (loop_vec_info loop_vinfo, data_reference *dr, poly_int64 gap) 3298 { 3299 stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr)); 3300 HOST_WIDE_INT count 3301 = estimated_poly_value (LOOP_VINFO_VECT_FACTOR (loop_vinfo)); 3302 if (GROUP_FIRST_ELEMENT (stmt_info)) 3303 count *= GROUP_SIZE (vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info))); 3304 return estimated_poly_value (gap) <= count * vect_get_scalar_dr_size (dr); 3305 } 3306 3307 /* Return true if we know that there is no alias between DR_A and DR_B 3308 when abs (DR_STEP (DR_A)) >= N for some N. When returning true, set 3309 *LOWER_BOUND_OUT to this N. */ 3310 3311 static bool 3312 vectorizable_with_step_bound_p (data_reference *dr_a, data_reference *dr_b, 3313 poly_uint64 *lower_bound_out) 3314 { 3315 /* Check that there is a constant gap of known sign between DR_A 3316 and DR_B. */ 3317 poly_int64 init_a, init_b; 3318 if (!operand_equal_p (DR_BASE_ADDRESS (dr_a), DR_BASE_ADDRESS (dr_b), 0) 3319 || !operand_equal_p (DR_OFFSET (dr_a), DR_OFFSET (dr_b), 0) 3320 || !operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0) 3321 || !poly_int_tree_p (DR_INIT (dr_a), &init_a) 3322 || !poly_int_tree_p (DR_INIT (dr_b), &init_b) 3323 || !ordered_p (init_a, init_b)) 3324 return false; 3325 3326 /* Sort DR_A and DR_B by the address they access. */ 3327 if (maybe_lt (init_b, init_a)) 3328 { 3329 std::swap (init_a, init_b); 3330 std::swap (dr_a, dr_b); 3331 } 3332 3333 /* If the two accesses could be dependent within a scalar iteration, 3334 make sure that we'd retain their order. 
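   For example, if DR_A is a 4-byte store at byte offset 0 and DR_B is a
   4-byte load at byte offset 2, the two accesses overlap within one scalar
   iteration, so relying on a step bound is only valid if vectorization keeps
   their relative order.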
*/ 3335 if (maybe_gt (init_a + vect_get_scalar_dr_size (dr_a), init_b) 3336 && !vect_preserves_scalar_order_p (DR_STMT (dr_a), DR_STMT (dr_b))) 3337 return false; 3338 3339 /* There is no alias if abs (DR_STEP) is greater than or equal to 3340 the bytes spanned by the combination of the two accesses. */ 3341 *lower_bound_out = init_b + vect_get_scalar_dr_size (dr_b) - init_a; 3342 return true; 3343 } 3344 3345 /* Function vect_prune_runtime_alias_test_list. 3346 3347 Prune a list of ddrs to be tested at run-time by versioning for alias. 3348 Merge several alias checks into one if possible. 3349 Return FALSE if resulting list of ddrs is longer then allowed by 3350 PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE. */ 3351 3352 bool 3353 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo) 3354 { 3355 typedef pair_hash <tree_operand_hash, tree_operand_hash> tree_pair_hash; 3356 hash_set <tree_pair_hash> compared_objects; 3357 3358 vec<ddr_p> may_alias_ddrs = LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo); 3359 vec<dr_with_seg_len_pair_t> &comp_alias_ddrs 3360 = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo); 3361 vec<vec_object_pair> &check_unequal_addrs 3362 = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo); 3363 poly_uint64 vect_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo); 3364 tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo); 3365 3366 ddr_p ddr; 3367 unsigned int i; 3368 tree length_factor; 3369 3370 if (dump_enabled_p ()) 3371 dump_printf_loc (MSG_NOTE, vect_location, 3372 "=== vect_prune_runtime_alias_test_list ===\n"); 3373 3374 /* Step values are irrelevant for aliasing if the number of vector 3375 iterations is equal to the number of scalar iterations (which can 3376 happen for fully-SLP loops). */ 3377 bool ignore_step_p = known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U); 3378 3379 if (!ignore_step_p) 3380 { 3381 /* Convert the checks for nonzero steps into bound tests. */ 3382 tree value; 3383 FOR_EACH_VEC_ELT (LOOP_VINFO_CHECK_NONZERO (loop_vinfo), i, value) 3384 vect_check_lower_bound (loop_vinfo, value, true, 1); 3385 } 3386 3387 if (may_alias_ddrs.is_empty ()) 3388 return true; 3389 3390 comp_alias_ddrs.create (may_alias_ddrs.length ()); 3391 3392 unsigned int loop_depth 3393 = index_in_loop_nest (LOOP_VINFO_LOOP (loop_vinfo)->num, 3394 LOOP_VINFO_LOOP_NEST (loop_vinfo)); 3395 3396 /* First, we collect all data ref pairs for aliasing checks. */ 3397 FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr) 3398 { 3399 int comp_res; 3400 poly_uint64 lower_bound; 3401 struct data_reference *dr_a, *dr_b; 3402 gimple *dr_group_first_a, *dr_group_first_b; 3403 tree segment_length_a, segment_length_b; 3404 unsigned HOST_WIDE_INT access_size_a, access_size_b; 3405 unsigned int align_a, align_b; 3406 gimple *stmt_a, *stmt_b; 3407 3408 /* Ignore the alias if the VF we chose ended up being no greater 3409 than the dependence distance. 
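   For example, if every dependence between the two references has distance
   8 iterations and the chosen VF is 4, a vector iteration only combines 4
   consecutive scalar iterations and can never merge the two ends of such a
   dependence, so no runtime alias check is needed for this pair.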
*/ 3410 if (dependence_distance_ge_vf (ddr, loop_depth, vect_factor)) 3411 continue; 3412 3413 if (DDR_OBJECT_A (ddr)) 3414 { 3415 vec_object_pair new_pair (DDR_OBJECT_A (ddr), DDR_OBJECT_B (ddr)); 3416 if (!compared_objects.add (new_pair)) 3417 { 3418 if (dump_enabled_p ()) 3419 { 3420 dump_printf_loc (MSG_NOTE, vect_location, "checking that "); 3421 dump_generic_expr (MSG_NOTE, TDF_SLIM, new_pair.first); 3422 dump_printf (MSG_NOTE, " and "); 3423 dump_generic_expr (MSG_NOTE, TDF_SLIM, new_pair.second); 3424 dump_printf (MSG_NOTE, " have different addresses\n"); 3425 } 3426 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).safe_push (new_pair); 3427 } 3428 continue; 3429 } 3430 3431 dr_a = DDR_A (ddr); 3432 stmt_a = DR_STMT (DDR_A (ddr)); 3433 3434 dr_b = DDR_B (ddr); 3435 stmt_b = DR_STMT (DDR_B (ddr)); 3436 3437 /* Skip the pair if inter-iteration dependencies are irrelevant 3438 and intra-iteration dependencies are guaranteed to be honored. */ 3439 if (ignore_step_p 3440 && (vect_preserves_scalar_order_p (stmt_a, stmt_b) 3441 || vectorizable_with_step_bound_p (dr_a, dr_b, &lower_bound))) 3442 { 3443 if (dump_enabled_p ()) 3444 { 3445 dump_printf_loc (MSG_NOTE, vect_location, 3446 "no need for alias check between "); 3447 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a)); 3448 dump_printf (MSG_NOTE, " and "); 3449 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b)); 3450 dump_printf (MSG_NOTE, " when VF is 1\n"); 3451 } 3452 continue; 3453 } 3454 3455 /* See whether we can handle the alias using a bounds check on 3456 the step, and whether that's likely to be the best approach. 3457 (It might not be, for example, if the minimum step is much larger 3458 than the number of bytes handled by one vector iteration.) */ 3459 if (!ignore_step_p 3460 && TREE_CODE (DR_STEP (dr_a)) != INTEGER_CST 3461 && vectorizable_with_step_bound_p (dr_a, dr_b, &lower_bound) 3462 && (vect_small_gap_p (loop_vinfo, dr_a, lower_bound) 3463 || vect_small_gap_p (loop_vinfo, dr_b, lower_bound))) 3464 { 3465 bool unsigned_p = dr_known_forward_stride_p (dr_a); 3466 if (dump_enabled_p ()) 3467 { 3468 dump_printf_loc (MSG_NOTE, vect_location, "no alias between "); 3469 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a)); 3470 dump_printf (MSG_NOTE, " and "); 3471 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b)); 3472 dump_printf (MSG_NOTE, " when the step "); 3473 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_STEP (dr_a)); 3474 dump_printf (MSG_NOTE, " is outside "); 3475 if (unsigned_p) 3476 dump_printf (MSG_NOTE, "[0"); 3477 else 3478 { 3479 dump_printf (MSG_NOTE, "("); 3480 dump_dec (MSG_NOTE, poly_int64 (-lower_bound)); 3481 } 3482 dump_printf (MSG_NOTE, ", "); 3483 dump_dec (MSG_NOTE, lower_bound); 3484 dump_printf (MSG_NOTE, ")\n"); 3485 } 3486 vect_check_lower_bound (loop_vinfo, DR_STEP (dr_a), unsigned_p, 3487 lower_bound); 3488 continue; 3489 } 3490 3491 dr_group_first_a = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_a)); 3492 if (dr_group_first_a) 3493 { 3494 stmt_a = dr_group_first_a; 3495 dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_a)); 3496 } 3497 3498 dr_group_first_b = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_b)); 3499 if (dr_group_first_b) 3500 { 3501 stmt_b = dr_group_first_b; 3502 dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_b)); 3503 } 3504 3505 if (ignore_step_p) 3506 { 3507 segment_length_a = size_zero_node; 3508 segment_length_b = size_zero_node; 3509 } 3510 else 3511 { 3512 if (!operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0)) 3513 length_factor = scalar_loop_iters; 3514 else 3515 
length_factor = size_int (vect_factor); 3516 segment_length_a = vect_vfa_segment_size (dr_a, length_factor); 3517 segment_length_b = vect_vfa_segment_size (dr_b, length_factor); 3518 } 3519 access_size_a = vect_vfa_access_size (dr_a); 3520 access_size_b = vect_vfa_access_size (dr_b); 3521 align_a = vect_vfa_align (dr_a); 3522 align_b = vect_vfa_align (dr_b); 3523 3524 comp_res = data_ref_compare_tree (DR_BASE_ADDRESS (dr_a), 3525 DR_BASE_ADDRESS (dr_b)); 3526 if (comp_res == 0) 3527 comp_res = data_ref_compare_tree (DR_OFFSET (dr_a), 3528 DR_OFFSET (dr_b)); 3529 3530 /* See whether the alias is known at compilation time. */ 3531 if (comp_res == 0 3532 && TREE_CODE (DR_STEP (dr_a)) == INTEGER_CST 3533 && TREE_CODE (DR_STEP (dr_b)) == INTEGER_CST 3534 && poly_int_tree_p (segment_length_a) 3535 && poly_int_tree_p (segment_length_b)) 3536 { 3537 int res = vect_compile_time_alias (dr_a, dr_b, 3538 segment_length_a, 3539 segment_length_b, 3540 access_size_a, 3541 access_size_b); 3542 if (res >= 0 && dump_enabled_p ()) 3543 { 3544 dump_printf_loc (MSG_NOTE, vect_location, 3545 "can tell at compile time that "); 3546 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a)); 3547 dump_printf (MSG_NOTE, " and "); 3548 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b)); 3549 if (res == 0) 3550 dump_printf (MSG_NOTE, " do not alias\n"); 3551 else 3552 dump_printf (MSG_NOTE, " alias\n"); 3553 } 3554 3555 if (res == 0) 3556 continue; 3557 3558 if (res == 1) 3559 { 3560 if (dump_enabled_p ()) 3561 dump_printf_loc (MSG_NOTE, vect_location, 3562 "not vectorized: compilation time alias.\n"); 3563 return false; 3564 } 3565 } 3566 3567 dr_with_seg_len_pair_t dr_with_seg_len_pair 3568 (dr_with_seg_len (dr_a, segment_length_a, access_size_a, align_a), 3569 dr_with_seg_len (dr_b, segment_length_b, access_size_b, align_b)); 3570 3571 /* Canonicalize pairs by sorting the two DR members. */ 3572 if (comp_res > 0) 3573 std::swap (dr_with_seg_len_pair.first, dr_with_seg_len_pair.second); 3574 3575 comp_alias_ddrs.safe_push (dr_with_seg_len_pair); 3576 } 3577 3578 prune_runtime_alias_test_list (&comp_alias_ddrs, vect_factor); 3579 3580 unsigned int count = (comp_alias_ddrs.length () 3581 + check_unequal_addrs.length ()); 3582 3583 dump_printf_loc (MSG_NOTE, vect_location, 3584 "improved number of alias checks from %d to %d\n", 3585 may_alias_ddrs.length (), count); 3586 if ((int) count > PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS)) 3587 { 3588 if (dump_enabled_p ()) 3589 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 3590 "number of versioning for alias " 3591 "run-time tests exceeds %d " 3592 "(--param vect-max-version-for-alias-checks)\n", 3593 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS)); 3594 return false; 3595 } 3596 3597 return true; 3598 } 3599 3600 /* Check whether we can use an internal function for a gather load 3601 or scatter store. READ_P is true for loads and false for stores. 3602 MASKED_P is true if the load or store is conditional. MEMORY_TYPE is 3603 the type of the memory elements being loaded or stored. OFFSET_BITS 3604 is the number of bits in each scalar offset and OFFSET_SIGN is the 3605 sign of the offset. SCALE is the amount by which the offset should 3606 be multiplied *after* it has been converted to address width. 3607 3608 Return true if the function is supported, storing the function 3609 id in *IFN_OUT and the type of a vector element in *ELEMENT_TYPE_OUT. 
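   As an illustration (hypothetical values, not drawn from a particular
   testcase): for a gather of the form a[b[i]] with 32-bit int elements and
   32-bit int offsets, MEMORY_TYPE would be int, OFFSET_BITS 32, OFFSET_SIGN
   SIGNED and SCALE 4, assuming the caller has already folded the implicit
   multiplication by the element size out of the offset.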
*/ 3610 3611 bool 3612 vect_gather_scatter_fn_p (bool read_p, bool masked_p, tree vectype, 3613 tree memory_type, unsigned int offset_bits, 3614 signop offset_sign, int scale, 3615 internal_fn *ifn_out, tree *element_type_out) 3616 { 3617 unsigned int memory_bits = tree_to_uhwi (TYPE_SIZE (memory_type)); 3618 unsigned int element_bits = tree_to_uhwi (TYPE_SIZE (TREE_TYPE (vectype))); 3619 if (offset_bits > element_bits) 3620 /* Internal functions require the offset to be the same width as 3621 the vector elements. We can extend narrower offsets, but it isn't 3622 safe to truncate wider offsets. */ 3623 return false; 3624 3625 if (element_bits != memory_bits) 3626 /* For now the vector elements must be the same width as the 3627 memory elements. */ 3628 return false; 3629 3630 /* Work out which function we need. */ 3631 internal_fn ifn; 3632 if (read_p) 3633 ifn = masked_p ? IFN_MASK_GATHER_LOAD : IFN_GATHER_LOAD; 3634 else 3635 ifn = masked_p ? IFN_MASK_SCATTER_STORE : IFN_SCATTER_STORE; 3636 3637 /* Test whether the target supports this combination. */ 3638 if (!internal_gather_scatter_fn_supported_p (ifn, vectype, memory_type, 3639 offset_sign, scale)) 3640 return false; 3641 3642 *ifn_out = ifn; 3643 *element_type_out = TREE_TYPE (vectype); 3644 return true; 3645 } 3646 3647 /* CALL is a call to an internal gather load or scatter store function. 3648 Describe the operation in INFO. */ 3649 3650 static void 3651 vect_describe_gather_scatter_call (gcall *call, gather_scatter_info *info) 3652 { 3653 stmt_vec_info stmt_info = vinfo_for_stmt (call); 3654 tree vectype = STMT_VINFO_VECTYPE (stmt_info); 3655 data_reference *dr = STMT_VINFO_DATA_REF (stmt_info); 3656 3657 info->ifn = gimple_call_internal_fn (call); 3658 info->decl = NULL_TREE; 3659 info->base = gimple_call_arg (call, 0); 3660 info->offset = gimple_call_arg (call, 1); 3661 info->offset_dt = vect_unknown_def_type; 3662 info->offset_vectype = NULL_TREE; 3663 info->scale = TREE_INT_CST_LOW (gimple_call_arg (call, 2)); 3664 info->element_type = TREE_TYPE (vectype); 3665 info->memory_type = TREE_TYPE (DR_REF (dr)); 3666 } 3667 3668 /* Return true if a non-affine read or write in STMT is suitable for a 3669 gather load or scatter store. Describe the operation in *INFO if so. */ 3670 3671 bool 3672 vect_check_gather_scatter (gimple *stmt, loop_vec_info loop_vinfo, 3673 gather_scatter_info *info) 3674 { 3675 HOST_WIDE_INT scale = 1; 3676 poly_int64 pbitpos, pbitsize; 3677 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); 3678 stmt_vec_info stmt_info = vinfo_for_stmt (stmt); 3679 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info); 3680 tree offtype = NULL_TREE; 3681 tree decl = NULL_TREE, base, off; 3682 tree vectype = STMT_VINFO_VECTYPE (stmt_info); 3683 tree memory_type = TREE_TYPE (DR_REF (dr)); 3684 machine_mode pmode; 3685 int punsignedp, reversep, pvolatilep = 0; 3686 internal_fn ifn; 3687 tree element_type; 3688 bool masked_p = false; 3689 3690 /* See whether this is already a call to a gather/scatter internal function. 3691 If not, see whether it's a masked load or store. */ 3692 gcall *call = dyn_cast <gcall *> (stmt); 3693 if (call && gimple_call_internal_p (call)) 3694 { 3695 ifn = gimple_call_internal_fn (stmt); 3696 if (internal_gather_scatter_fn_p (ifn)) 3697 { 3698 vect_describe_gather_scatter_call (call, info); 3699 return true; 3700 } 3701 masked_p = (ifn == IFN_MASK_LOAD || ifn == IFN_MASK_STORE); 3702 } 3703 3704 /* True if we should aim to use internal functions rather than 3705 built-in functions. 
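     (I.e. IFN_GATHER_LOAD and friends when the target provides the
     corresponding optabs, as opposed to target-specific builtins such as
     the x86 gather builtins.)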
*/ 3706 bool use_ifn_p = (DR_IS_READ (dr) 3707 ? supports_vec_gather_load_p () 3708 : supports_vec_scatter_store_p ()); 3709 3710 base = DR_REF (dr); 3711 /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF, 3712 see if we can use the def stmt of the address. */ 3713 if (masked_p 3714 && TREE_CODE (base) == MEM_REF 3715 && TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME 3716 && integer_zerop (TREE_OPERAND (base, 1)) 3717 && !expr_invariant_in_loop_p (loop, TREE_OPERAND (base, 0))) 3718 { 3719 gimple *def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (base, 0)); 3720 if (is_gimple_assign (def_stmt) 3721 && gimple_assign_rhs_code (def_stmt) == ADDR_EXPR) 3722 base = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0); 3723 } 3724 3725 /* The gather and scatter builtins need address of the form 3726 loop_invariant + vector * {1, 2, 4, 8} 3727 or 3728 loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }. 3729 Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture 3730 of loop invariants/SSA_NAMEs defined in the loop, with casts, 3731 multiplications and additions in it. To get a vector, we need 3732 a single SSA_NAME that will be defined in the loop and will 3733 contain everything that is not loop invariant and that can be 3734 vectorized. The following code attempts to find such a preexistng 3735 SSA_NAME OFF and put the loop invariants into a tree BASE 3736 that can be gimplified before the loop. */ 3737 base = get_inner_reference (base, &pbitsize, &pbitpos, &off, &pmode, 3738 &punsignedp, &reversep, &pvolatilep); 3739 gcc_assert (base && !reversep); 3740 poly_int64 pbytepos = exact_div (pbitpos, BITS_PER_UNIT); 3741 3742 if (TREE_CODE (base) == MEM_REF) 3743 { 3744 if (!integer_zerop (TREE_OPERAND (base, 1))) 3745 { 3746 if (off == NULL_TREE) 3747 off = wide_int_to_tree (sizetype, mem_ref_offset (base)); 3748 else 3749 off = size_binop (PLUS_EXPR, off, 3750 fold_convert (sizetype, TREE_OPERAND (base, 1))); 3751 } 3752 base = TREE_OPERAND (base, 0); 3753 } 3754 else 3755 base = build_fold_addr_expr (base); 3756 3757 if (off == NULL_TREE) 3758 off = size_zero_node; 3759 3760 /* If base is not loop invariant, either off is 0, then we start with just 3761 the constant offset in the loop invariant BASE and continue with base 3762 as OFF, otherwise give up. 3763 We could handle that case by gimplifying the addition of base + off 3764 into some SSA_NAME and use that as off, but for now punt. */ 3765 if (!expr_invariant_in_loop_p (loop, base)) 3766 { 3767 if (!integer_zerop (off)) 3768 return false; 3769 off = base; 3770 base = size_int (pbytepos); 3771 } 3772 /* Otherwise put base + constant offset into the loop invariant BASE 3773 and continue with OFF. */ 3774 else 3775 { 3776 base = fold_convert (sizetype, base); 3777 base = size_binop (PLUS_EXPR, base, size_int (pbytepos)); 3778 } 3779 3780 /* OFF at this point may be either a SSA_NAME or some tree expression 3781 from get_inner_reference. Try to peel off loop invariants from it 3782 into BASE as long as possible. 
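     As a hypothetical sketch, for an offset tree such as

       (sizetype) n + (sizetype) i * 4     <-- n loop-invariant, i loop-variant

     the walk below would first move n into BASE, then fold the constant 4
     into SCALE (if the target supports that scale), and finally strip the
     widening conversion, leaving the SSA_NAME for i in OFF with OFFTYPE set
     to its narrower type.  The exact shape of the trees depends on how the
     address was folded earlier, so this is only an illustration.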
*/ 3783 STRIP_NOPS (off); 3784 while (offtype == NULL_TREE) 3785 { 3786 enum tree_code code; 3787 tree op0, op1, add = NULL_TREE; 3788 3789 if (TREE_CODE (off) == SSA_NAME) 3790 { 3791 gimple *def_stmt = SSA_NAME_DEF_STMT (off); 3792 3793 if (expr_invariant_in_loop_p (loop, off)) 3794 return false; 3795 3796 if (gimple_code (def_stmt) != GIMPLE_ASSIGN) 3797 break; 3798 3799 op0 = gimple_assign_rhs1 (def_stmt); 3800 code = gimple_assign_rhs_code (def_stmt); 3801 op1 = gimple_assign_rhs2 (def_stmt); 3802 } 3803 else 3804 { 3805 if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS) 3806 return false; 3807 code = TREE_CODE (off); 3808 extract_ops_from_tree (off, &code, &op0, &op1); 3809 } 3810 switch (code) 3811 { 3812 case POINTER_PLUS_EXPR: 3813 case PLUS_EXPR: 3814 if (expr_invariant_in_loop_p (loop, op0)) 3815 { 3816 add = op0; 3817 off = op1; 3818 do_add: 3819 add = fold_convert (sizetype, add); 3820 if (scale != 1) 3821 add = size_binop (MULT_EXPR, add, size_int (scale)); 3822 base = size_binop (PLUS_EXPR, base, add); 3823 continue; 3824 } 3825 if (expr_invariant_in_loop_p (loop, op1)) 3826 { 3827 add = op1; 3828 off = op0; 3829 goto do_add; 3830 } 3831 break; 3832 case MINUS_EXPR: 3833 if (expr_invariant_in_loop_p (loop, op1)) 3834 { 3835 add = fold_convert (sizetype, op1); 3836 add = size_binop (MINUS_EXPR, size_zero_node, add); 3837 off = op0; 3838 goto do_add; 3839 } 3840 break; 3841 case MULT_EXPR: 3842 if (scale == 1 && tree_fits_shwi_p (op1)) 3843 { 3844 int new_scale = tree_to_shwi (op1); 3845 /* Only treat this as a scaling operation if the target 3846 supports it. */ 3847 if (use_ifn_p 3848 && !vect_gather_scatter_fn_p (DR_IS_READ (dr), masked_p, 3849 vectype, memory_type, 1, 3850 TYPE_SIGN (TREE_TYPE (op0)), 3851 new_scale, &ifn, 3852 &element_type)) 3853 break; 3854 scale = new_scale; 3855 off = op0; 3856 continue; 3857 } 3858 break; 3859 case SSA_NAME: 3860 off = op0; 3861 continue; 3862 CASE_CONVERT: 3863 if (!POINTER_TYPE_P (TREE_TYPE (op0)) 3864 && !INTEGRAL_TYPE_P (TREE_TYPE (op0))) 3865 break; 3866 if (TYPE_PRECISION (TREE_TYPE (op0)) 3867 == TYPE_PRECISION (TREE_TYPE (off))) 3868 { 3869 off = op0; 3870 continue; 3871 } 3872 3873 /* The internal functions need the offset to be the same width 3874 as the elements of VECTYPE. Don't include operations that 3875 cast the offset from that width to a different width. */ 3876 if (use_ifn_p 3877 && (int_size_in_bytes (TREE_TYPE (vectype)) 3878 == int_size_in_bytes (TREE_TYPE (off)))) 3879 break; 3880 3881 if (TYPE_PRECISION (TREE_TYPE (op0)) 3882 < TYPE_PRECISION (TREE_TYPE (off))) 3883 { 3884 off = op0; 3885 offtype = TREE_TYPE (off); 3886 STRIP_NOPS (off); 3887 continue; 3888 } 3889 break; 3890 default: 3891 break; 3892 } 3893 break; 3894 } 3895 3896 /* If at the end OFF still isn't a SSA_NAME or isn't 3897 defined in the loop, punt. 
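     (For example, if the loop-variant part of the address was an expression
     the walk above could not decompose, or if the whole offset turned out to
     be loop-invariant after all.)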
*/ 3898 if (TREE_CODE (off) != SSA_NAME 3899 || expr_invariant_in_loop_p (loop, off)) 3900 return false; 3901 3902 if (offtype == NULL_TREE) 3903 offtype = TREE_TYPE (off); 3904 3905 if (use_ifn_p) 3906 { 3907 if (!vect_gather_scatter_fn_p (DR_IS_READ (dr), masked_p, vectype, 3908 memory_type, TYPE_PRECISION (offtype), 3909 TYPE_SIGN (offtype), scale, &ifn, 3910 &element_type)) 3911 return false; 3912 } 3913 else 3914 { 3915 if (DR_IS_READ (dr)) 3916 { 3917 if (targetm.vectorize.builtin_gather) 3918 decl = targetm.vectorize.builtin_gather (vectype, offtype, scale); 3919 } 3920 else 3921 { 3922 if (targetm.vectorize.builtin_scatter) 3923 decl = targetm.vectorize.builtin_scatter (vectype, offtype, scale); 3924 } 3925 3926 if (!decl) 3927 return false; 3928 3929 ifn = IFN_LAST; 3930 element_type = TREE_TYPE (vectype); 3931 } 3932 3933 info->ifn = ifn; 3934 info->decl = decl; 3935 info->base = base; 3936 info->offset = off; 3937 info->offset_dt = vect_unknown_def_type; 3938 info->offset_vectype = NULL_TREE; 3939 info->scale = scale; 3940 info->element_type = element_type; 3941 info->memory_type = memory_type; 3942 return true; 3943 } 3944 3945 /* Function vect_analyze_data_refs. 3946 3947 Find all the data references in the loop or basic block. 3948 3949 The general structure of the analysis of data refs in the vectorizer is as 3950 follows: 3951 1- vect_analyze_data_refs(loop/bb): call 3952 compute_data_dependences_for_loop/bb to find and analyze all data-refs 3953 in the loop/bb and their dependences. 3954 2- vect_analyze_dependences(): apply dependence testing using ddrs. 3955 3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok. 3956 4- vect_analyze_drs_access(): check that ref_stmt.step is ok. 3957 3958 */ 3959 3960 bool 3961 vect_analyze_data_refs (vec_info *vinfo, poly_uint64 *min_vf) 3962 { 3963 struct loop *loop = NULL; 3964 unsigned int i; 3965 struct data_reference *dr; 3966 tree scalar_type; 3967 3968 if (dump_enabled_p ()) 3969 dump_printf_loc (MSG_NOTE, vect_location, 3970 "=== vect_analyze_data_refs ===\n"); 3971 3972 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo)) 3973 loop = LOOP_VINFO_LOOP (loop_vinfo); 3974 3975 /* Go through the data-refs, check that the analysis succeeded. Update 3976 pointer from stmt_vec_info struct to DR and vectype. */ 3977 3978 vec<data_reference_p> datarefs = vinfo->datarefs; 3979 FOR_EACH_VEC_ELT (datarefs, i, dr) 3980 { 3981 gimple *stmt; 3982 stmt_vec_info stmt_info; 3983 tree base, offset, init; 3984 enum { SG_NONE, GATHER, SCATTER } gatherscatter = SG_NONE; 3985 bool simd_lane_access = false; 3986 poly_uint64 vf; 3987 3988 again: 3989 if (!dr || !DR_REF (dr)) 3990 { 3991 if (dump_enabled_p ()) 3992 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 3993 "not vectorized: unhandled data-ref\n"); 3994 return false; 3995 } 3996 3997 stmt = DR_STMT (dr); 3998 stmt_info = vinfo_for_stmt (stmt); 3999 4000 /* Discard clobbers from the dataref vector. We will remove 4001 clobber stmts during vectorization. */ 4002 if (gimple_clobber_p (stmt)) 4003 { 4004 free_data_ref (dr); 4005 if (i == datarefs.length () - 1) 4006 { 4007 datarefs.pop (); 4008 break; 4009 } 4010 datarefs.ordered_remove (i); 4011 dr = datarefs[i]; 4012 goto again; 4013 } 4014 4015 /* Check that analysis of the data-ref succeeded. 
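	 A typical failure is a non-affine address such as a[b[i]], for which
	 DR_BASE_ADDRESS, DR_OFFSET and DR_STEP cannot all be computed; the
	 code below then retries the reference as a gather/scatter or as a
	 SIMD lane access (an illustrative example).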
*/ 4016 if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr) 4017 || !DR_STEP (dr)) 4018 { 4019 bool maybe_gather 4020 = DR_IS_READ (dr) 4021 && !TREE_THIS_VOLATILE (DR_REF (dr)) 4022 && (targetm.vectorize.builtin_gather != NULL 4023 || supports_vec_gather_load_p ()); 4024 bool maybe_scatter 4025 = DR_IS_WRITE (dr) 4026 && !TREE_THIS_VOLATILE (DR_REF (dr)) 4027 && (targetm.vectorize.builtin_scatter != NULL 4028 || supports_vec_scatter_store_p ()); 4029 bool maybe_simd_lane_access 4030 = is_a <loop_vec_info> (vinfo) && loop->simduid; 4031 4032 /* If target supports vector gather loads or scatter stores, or if 4033 this might be a SIMD lane access, see if they can't be used. */ 4034 if (is_a <loop_vec_info> (vinfo) 4035 && (maybe_gather || maybe_scatter || maybe_simd_lane_access) 4036 && !nested_in_vect_loop_p (loop, stmt)) 4037 { 4038 struct data_reference *newdr 4039 = create_data_ref (NULL, loop_containing_stmt (stmt), 4040 DR_REF (dr), stmt, !maybe_scatter, 4041 DR_IS_CONDITIONAL_IN_STMT (dr)); 4042 gcc_assert (newdr != NULL && DR_REF (newdr)); 4043 if (DR_BASE_ADDRESS (newdr) 4044 && DR_OFFSET (newdr) 4045 && DR_INIT (newdr) 4046 && DR_STEP (newdr) 4047 && integer_zerop (DR_STEP (newdr))) 4048 { 4049 if (maybe_simd_lane_access) 4050 { 4051 tree off = DR_OFFSET (newdr); 4052 STRIP_NOPS (off); 4053 if (TREE_CODE (DR_INIT (newdr)) == INTEGER_CST 4054 && TREE_CODE (off) == MULT_EXPR 4055 && tree_fits_uhwi_p (TREE_OPERAND (off, 1))) 4056 { 4057 tree step = TREE_OPERAND (off, 1); 4058 off = TREE_OPERAND (off, 0); 4059 STRIP_NOPS (off); 4060 if (CONVERT_EXPR_P (off) 4061 && TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off, 4062 0))) 4063 < TYPE_PRECISION (TREE_TYPE (off))) 4064 off = TREE_OPERAND (off, 0); 4065 if (TREE_CODE (off) == SSA_NAME) 4066 { 4067 gimple *def = SSA_NAME_DEF_STMT (off); 4068 tree reft = TREE_TYPE (DR_REF (newdr)); 4069 if (is_gimple_call (def) 4070 && gimple_call_internal_p (def) 4071 && (gimple_call_internal_fn (def) 4072 == IFN_GOMP_SIMD_LANE)) 4073 { 4074 tree arg = gimple_call_arg (def, 0); 4075 gcc_assert (TREE_CODE (arg) == SSA_NAME); 4076 arg = SSA_NAME_VAR (arg); 4077 if (arg == loop->simduid 4078 /* For now. 
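				    We only handle the case where the step
				    equals the element size of the reference,
				    i.e. plain a[lane]-style accesses.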
*/ 4079 && tree_int_cst_equal 4080 (TYPE_SIZE_UNIT (reft), 4081 step)) 4082 { 4083 DR_OFFSET (newdr) = ssize_int (0); 4084 DR_STEP (newdr) = step; 4085 DR_OFFSET_ALIGNMENT (newdr) 4086 = BIGGEST_ALIGNMENT; 4087 DR_STEP_ALIGNMENT (newdr) 4088 = highest_pow2_factor (step); 4089 dr = newdr; 4090 simd_lane_access = true; 4091 } 4092 } 4093 } 4094 } 4095 } 4096 if (!simd_lane_access && (maybe_gather || maybe_scatter)) 4097 { 4098 dr = newdr; 4099 if (maybe_gather) 4100 gatherscatter = GATHER; 4101 else 4102 gatherscatter = SCATTER; 4103 } 4104 } 4105 if (gatherscatter == SG_NONE && !simd_lane_access) 4106 free_data_ref (newdr); 4107 } 4108 4109 if (gatherscatter == SG_NONE && !simd_lane_access) 4110 { 4111 if (dump_enabled_p ()) 4112 { 4113 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 4114 "not vectorized: data ref analysis " 4115 "failed "); 4116 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0); 4117 } 4118 4119 if (is_a <bb_vec_info> (vinfo)) 4120 break; 4121 4122 return false; 4123 } 4124 } 4125 4126 if (TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST) 4127 { 4128 if (dump_enabled_p ()) 4129 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 4130 "not vectorized: base addr of dr is a " 4131 "constant\n"); 4132 4133 if (is_a <bb_vec_info> (vinfo)) 4134 break; 4135 4136 if (gatherscatter != SG_NONE || simd_lane_access) 4137 free_data_ref (dr); 4138 return false; 4139 } 4140 4141 if (TREE_THIS_VOLATILE (DR_REF (dr))) 4142 { 4143 if (dump_enabled_p ()) 4144 { 4145 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 4146 "not vectorized: volatile type "); 4147 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0); 4148 } 4149 4150 if (is_a <bb_vec_info> (vinfo)) 4151 break; 4152 4153 return false; 4154 } 4155 4156 if (stmt_can_throw_internal (stmt)) 4157 { 4158 if (dump_enabled_p ()) 4159 { 4160 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 4161 "not vectorized: statement can throw an " 4162 "exception "); 4163 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0); 4164 } 4165 4166 if (is_a <bb_vec_info> (vinfo)) 4167 break; 4168 4169 if (gatherscatter != SG_NONE || simd_lane_access) 4170 free_data_ref (dr); 4171 return false; 4172 } 4173 4174 if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF 4175 && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1))) 4176 { 4177 if (dump_enabled_p ()) 4178 { 4179 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 4180 "not vectorized: statement is bitfield " 4181 "access "); 4182 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0); 4183 } 4184 4185 if (is_a <bb_vec_info> (vinfo)) 4186 break; 4187 4188 if (gatherscatter != SG_NONE || simd_lane_access) 4189 free_data_ref (dr); 4190 return false; 4191 } 4192 4193 base = unshare_expr (DR_BASE_ADDRESS (dr)); 4194 offset = unshare_expr (DR_OFFSET (dr)); 4195 init = unshare_expr (DR_INIT (dr)); 4196 4197 if (is_gimple_call (stmt) 4198 && (!gimple_call_internal_p (stmt) 4199 || (gimple_call_internal_fn (stmt) != IFN_MASK_LOAD 4200 && gimple_call_internal_fn (stmt) != IFN_MASK_STORE))) 4201 { 4202 if (dump_enabled_p ()) 4203 { 4204 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 4205 "not vectorized: dr in a call "); 4206 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0); 4207 } 4208 4209 if (is_a <bb_vec_info> (vinfo)) 4210 break; 4211 4212 if (gatherscatter != SG_NONE || simd_lane_access) 4213 free_data_ref (dr); 4214 return false; 4215 } 4216 4217 /* Update DR field in stmt_vec_info struct. 
*/ 4218 4219 /* If the dataref is in an inner-loop of the loop that is considered for 4220 for vectorization, we also want to analyze the access relative to 4221 the outer-loop (DR contains information only relative to the 4222 inner-most enclosing loop). We do that by building a reference to the 4223 first location accessed by the inner-loop, and analyze it relative to 4224 the outer-loop. */ 4225 if (loop && nested_in_vect_loop_p (loop, stmt)) 4226 { 4227 /* Build a reference to the first location accessed by the 4228 inner loop: *(BASE + INIT + OFFSET). By construction, 4229 this address must be invariant in the inner loop, so we 4230 can consider it as being used in the outer loop. */ 4231 tree init_offset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset), 4232 init, offset); 4233 tree init_addr = fold_build_pointer_plus (base, init_offset); 4234 tree init_ref = build_fold_indirect_ref (init_addr); 4235 4236 if (dump_enabled_p ()) 4237 { 4238 dump_printf_loc (MSG_NOTE, vect_location, 4239 "analyze in outer loop: "); 4240 dump_generic_expr (MSG_NOTE, TDF_SLIM, init_ref); 4241 dump_printf (MSG_NOTE, "\n"); 4242 } 4243 4244 if (!dr_analyze_innermost (&STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info), 4245 init_ref, loop)) 4246 /* dr_analyze_innermost already explained the failure. */ 4247 return false; 4248 4249 if (dump_enabled_p ()) 4250 { 4251 dump_printf_loc (MSG_NOTE, vect_location, 4252 "\touter base_address: "); 4253 dump_generic_expr (MSG_NOTE, TDF_SLIM, 4254 STMT_VINFO_DR_BASE_ADDRESS (stmt_info)); 4255 dump_printf (MSG_NOTE, "\n\touter offset from base address: "); 4256 dump_generic_expr (MSG_NOTE, TDF_SLIM, 4257 STMT_VINFO_DR_OFFSET (stmt_info)); 4258 dump_printf (MSG_NOTE, 4259 "\n\touter constant offset from base address: "); 4260 dump_generic_expr (MSG_NOTE, TDF_SLIM, 4261 STMT_VINFO_DR_INIT (stmt_info)); 4262 dump_printf (MSG_NOTE, "\n\touter step: "); 4263 dump_generic_expr (MSG_NOTE, TDF_SLIM, 4264 STMT_VINFO_DR_STEP (stmt_info)); 4265 dump_printf (MSG_NOTE, "\n\touter base alignment: %d\n", 4266 STMT_VINFO_DR_BASE_ALIGNMENT (stmt_info)); 4267 dump_printf (MSG_NOTE, "\n\touter base misalignment: %d\n", 4268 STMT_VINFO_DR_BASE_MISALIGNMENT (stmt_info)); 4269 dump_printf (MSG_NOTE, "\n\touter offset alignment: %d\n", 4270 STMT_VINFO_DR_OFFSET_ALIGNMENT (stmt_info)); 4271 dump_printf (MSG_NOTE, "\n\touter step alignment: %d\n", 4272 STMT_VINFO_DR_STEP_ALIGNMENT (stmt_info)); 4273 } 4274 } 4275 4276 if (STMT_VINFO_DATA_REF (stmt_info)) 4277 { 4278 if (dump_enabled_p ()) 4279 { 4280 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 4281 "not vectorized: more than one data ref " 4282 "in stmt: "); 4283 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0); 4284 } 4285 4286 if (is_a <bb_vec_info> (vinfo)) 4287 break; 4288 4289 if (gatherscatter != SG_NONE || simd_lane_access) 4290 free_data_ref (dr); 4291 return false; 4292 } 4293 4294 STMT_VINFO_DATA_REF (stmt_info) = dr; 4295 if (simd_lane_access) 4296 { 4297 STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) = true; 4298 free_data_ref (datarefs[i]); 4299 datarefs[i] = dr; 4300 } 4301 4302 if (TREE_CODE (DR_BASE_ADDRESS (dr)) == ADDR_EXPR 4303 && VAR_P (TREE_OPERAND (DR_BASE_ADDRESS (dr), 0)) 4304 && DECL_NONALIASED (TREE_OPERAND (DR_BASE_ADDRESS (dr), 0))) 4305 { 4306 if (dump_enabled_p ()) 4307 { 4308 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 4309 "not vectorized: base object not addressable " 4310 "for stmt: "); 4311 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0); 4312 } 4313 if (is_a <bb_vec_info> 
(vinfo)) 4314 { 4315 /* In BB vectorization the ref can still participate 4316 in dependence analysis, we just can't vectorize it. */ 4317 STMT_VINFO_VECTORIZABLE (stmt_info) = false; 4318 continue; 4319 } 4320 return false; 4321 } 4322 4323 /* Set vectype for STMT. */ 4324 scalar_type = TREE_TYPE (DR_REF (dr)); 4325 STMT_VINFO_VECTYPE (stmt_info) 4326 = get_vectype_for_scalar_type (scalar_type); 4327 if (!STMT_VINFO_VECTYPE (stmt_info)) 4328 { 4329 if (dump_enabled_p ()) 4330 { 4331 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 4332 "not vectorized: no vectype for stmt: "); 4333 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0); 4334 dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: "); 4335 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS, 4336 scalar_type); 4337 dump_printf (MSG_MISSED_OPTIMIZATION, "\n"); 4338 } 4339 4340 if (is_a <bb_vec_info> (vinfo)) 4341 { 4342 /* No vector type is fine, the ref can still participate 4343 in dependence analysis, we just can't vectorize it. */ 4344 STMT_VINFO_VECTORIZABLE (stmt_info) = false; 4345 continue; 4346 } 4347 4348 if (gatherscatter != SG_NONE || simd_lane_access) 4349 { 4350 STMT_VINFO_DATA_REF (stmt_info) = NULL; 4351 if (gatherscatter != SG_NONE) 4352 free_data_ref (dr); 4353 } 4354 return false; 4355 } 4356 else 4357 { 4358 if (dump_enabled_p ()) 4359 { 4360 dump_printf_loc (MSG_NOTE, vect_location, 4361 "got vectype for stmt: "); 4362 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0); 4363 dump_generic_expr (MSG_NOTE, TDF_SLIM, 4364 STMT_VINFO_VECTYPE (stmt_info)); 4365 dump_printf (MSG_NOTE, "\n"); 4366 } 4367 } 4368 4369 /* Adjust the minimal vectorization factor according to the 4370 vector type. */ 4371 vf = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)); 4372 *min_vf = upper_bound (*min_vf, vf); 4373 4374 if (gatherscatter != SG_NONE) 4375 { 4376 gather_scatter_info gs_info; 4377 if (!vect_check_gather_scatter (stmt, as_a <loop_vec_info> (vinfo), 4378 &gs_info) 4379 || !get_vectype_for_scalar_type (TREE_TYPE (gs_info.offset))) 4380 { 4381 STMT_VINFO_DATA_REF (stmt_info) = NULL; 4382 free_data_ref (dr); 4383 if (dump_enabled_p ()) 4384 { 4385 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 4386 (gatherscatter == GATHER) ? 4387 "not vectorized: not suitable for gather " 4388 "load " : 4389 "not vectorized: not suitable for scatter " 4390 "store "); 4391 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0); 4392 } 4393 return false; 4394 } 4395 4396 free_data_ref (datarefs[i]); 4397 datarefs[i] = dr; 4398 STMT_VINFO_GATHER_SCATTER_P (stmt_info) = gatherscatter; 4399 } 4400 4401 else if (is_a <loop_vec_info> (vinfo) 4402 && TREE_CODE (DR_STEP (dr)) != INTEGER_CST) 4403 { 4404 if (nested_in_vect_loop_p (loop, stmt)) 4405 { 4406 if (dump_enabled_p ()) 4407 { 4408 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 4409 "not vectorized: not suitable for strided " 4410 "load "); 4411 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0); 4412 } 4413 return false; 4414 } 4415 STMT_VINFO_STRIDED_P (stmt_info) = true; 4416 } 4417 } 4418 4419 /* If we stopped analysis at the first dataref we could not analyze 4420 when trying to vectorize a basic-block mark the rest of the datarefs 4421 as not vectorizable and truncate the vector of datarefs. That 4422 avoids spending useless time in analyzing their dependence. 
*/ 4423 if (i != datarefs.length ()) 4424 { 4425 gcc_assert (is_a <bb_vec_info> (vinfo)); 4426 for (unsigned j = i; j < datarefs.length (); ++j) 4427 { 4428 data_reference_p dr = datarefs[j]; 4429 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false; 4430 free_data_ref (dr); 4431 } 4432 datarefs.truncate (i); 4433 } 4434 4435 return true; 4436 } 4437 4438 4439 /* Function vect_get_new_vect_var. 4440 4441 Returns a name for a new variable. The current naming scheme appends the 4442 prefix "vect_" or "vect_p" (depending on the value of VAR_KIND) to 4443 the name of vectorizer generated variables, and appends that to NAME if 4444 provided. */ 4445 4446 tree 4447 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name) 4448 { 4449 const char *prefix; 4450 tree new_vect_var; 4451 4452 switch (var_kind) 4453 { 4454 case vect_simple_var: 4455 prefix = "vect"; 4456 break; 4457 case vect_scalar_var: 4458 prefix = "stmp"; 4459 break; 4460 case vect_mask_var: 4461 prefix = "mask"; 4462 break; 4463 case vect_pointer_var: 4464 prefix = "vectp"; 4465 break; 4466 default: 4467 gcc_unreachable (); 4468 } 4469 4470 if (name) 4471 { 4472 char* tmp = concat (prefix, "_", name, NULL); 4473 new_vect_var = create_tmp_reg (type, tmp); 4474 free (tmp); 4475 } 4476 else 4477 new_vect_var = create_tmp_reg (type, prefix); 4478 4479 return new_vect_var; 4480 } 4481 4482 /* Like vect_get_new_vect_var but return an SSA name. */ 4483 4484 tree 4485 vect_get_new_ssa_name (tree type, enum vect_var_kind var_kind, const char *name) 4486 { 4487 const char *prefix; 4488 tree new_vect_var; 4489 4490 switch (var_kind) 4491 { 4492 case vect_simple_var: 4493 prefix = "vect"; 4494 break; 4495 case vect_scalar_var: 4496 prefix = "stmp"; 4497 break; 4498 case vect_pointer_var: 4499 prefix = "vectp"; 4500 break; 4501 default: 4502 gcc_unreachable (); 4503 } 4504 4505 if (name) 4506 { 4507 char* tmp = concat (prefix, "_", name, NULL); 4508 new_vect_var = make_temp_ssa_name (type, NULL, tmp); 4509 free (tmp); 4510 } 4511 else 4512 new_vect_var = make_temp_ssa_name (type, NULL, prefix); 4513 4514 return new_vect_var; 4515 } 4516 4517 /* Duplicate ptr info and set alignment/misaligment on NAME from DR. */ 4518 4519 static void 4520 vect_duplicate_ssa_name_ptr_info (tree name, data_reference *dr) 4521 { 4522 duplicate_ssa_name_ptr_info (name, DR_PTR_INFO (dr)); 4523 int misalign = DR_MISALIGNMENT (dr); 4524 if (misalign == DR_MISALIGNMENT_UNKNOWN) 4525 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name)); 4526 else 4527 set_ptr_info_alignment (SSA_NAME_PTR_INFO (name), 4528 DR_TARGET_ALIGNMENT (dr), misalign); 4529 } 4530 4531 /* Function vect_create_addr_base_for_vector_ref. 4532 4533 Create an expression that computes the address of the first memory location 4534 that will be accessed for a data reference. 4535 4536 Input: 4537 STMT: The statement containing the data reference. 4538 NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list. 4539 OFFSET: Optional. If supplied, it is be added to the initial address. 4540 LOOP: Specify relative to which loop-nest should the address be computed. 4541 For example, when the dataref is in an inner-loop nested in an 4542 outer-loop that is now being vectorized, LOOP can be either the 4543 outer-loop, or the inner-loop. 
The first memory location accessed 4544 by the following dataref ('in' points to short): 4545 4546 for (i=0; i<N; i++) 4547 for (j=0; j<M; j++) 4548 s += in[i+j] 4549 4550 is as follows: 4551 if LOOP=i_loop: &in (relative to i_loop) 4552 if LOOP=j_loop: &in+i*2B (relative to j_loop) 4553 BYTE_OFFSET: Optional, defaulted to NULL. If supplied, it is added to the 4554 initial address. Unlike OFFSET, which is number of elements to 4555 be added, BYTE_OFFSET is measured in bytes. 4556 4557 Output: 4558 1. Return an SSA_NAME whose value is the address of the memory location of 4559 the first vector of the data reference. 4560 2. If new_stmt_list is not NULL_TREE after return then the caller must insert 4561 these statement(s) which define the returned SSA_NAME. 4562 4563 FORNOW: We are only handling array accesses with step 1. */ 4564 4565 tree 4566 vect_create_addr_base_for_vector_ref (gimple *stmt, 4567 gimple_seq *new_stmt_list, 4568 tree offset, 4569 tree byte_offset) 4570 { 4571 stmt_vec_info stmt_info = vinfo_for_stmt (stmt); 4572 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info); 4573 const char *base_name; 4574 tree addr_base; 4575 tree dest; 4576 gimple_seq seq = NULL; 4577 tree vect_ptr_type; 4578 tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr))); 4579 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); 4580 innermost_loop_behavior *drb = vect_dr_behavior (dr); 4581 4582 tree data_ref_base = unshare_expr (drb->base_address); 4583 tree base_offset = unshare_expr (drb->offset); 4584 tree init = unshare_expr (drb->init); 4585 4586 if (loop_vinfo) 4587 base_name = get_name (data_ref_base); 4588 else 4589 { 4590 base_offset = ssize_int (0); 4591 init = ssize_int (0); 4592 base_name = get_name (DR_REF (dr)); 4593 } 4594 4595 /* Create base_offset */ 4596 base_offset = size_binop (PLUS_EXPR, 4597 fold_convert (sizetype, base_offset), 4598 fold_convert (sizetype, init)); 4599 4600 if (offset) 4601 { 4602 offset = fold_build2 (MULT_EXPR, sizetype, 4603 fold_convert (sizetype, offset), step); 4604 base_offset = fold_build2 (PLUS_EXPR, sizetype, 4605 base_offset, offset); 4606 } 4607 if (byte_offset) 4608 { 4609 byte_offset = fold_convert (sizetype, byte_offset); 4610 base_offset = fold_build2 (PLUS_EXPR, sizetype, 4611 base_offset, byte_offset); 4612 } 4613 4614 /* base + base_offset */ 4615 if (loop_vinfo) 4616 addr_base = fold_build_pointer_plus (data_ref_base, base_offset); 4617 else 4618 { 4619 addr_base = build1 (ADDR_EXPR, 4620 build_pointer_type (TREE_TYPE (DR_REF (dr))), 4621 unshare_expr (DR_REF (dr))); 4622 } 4623 4624 vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info)); 4625 dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name); 4626 addr_base = force_gimple_operand (addr_base, &seq, true, dest); 4627 gimple_seq_add_seq (new_stmt_list, seq); 4628 4629 if (DR_PTR_INFO (dr) 4630 && TREE_CODE (addr_base) == SSA_NAME 4631 && !SSA_NAME_PTR_INFO (addr_base)) 4632 { 4633 vect_duplicate_ssa_name_ptr_info (addr_base, dr); 4634 if (offset || byte_offset) 4635 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr_base)); 4636 } 4637 4638 if (dump_enabled_p ()) 4639 { 4640 dump_printf_loc (MSG_NOTE, vect_location, "created "); 4641 dump_generic_expr (MSG_NOTE, TDF_SLIM, addr_base); 4642 dump_printf (MSG_NOTE, "\n"); 4643 } 4644 4645 return addr_base; 4646 } 4647 4648 4649 /* Function vect_create_data_ref_ptr. 
4650 4651 Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first 4652 location accessed in the loop by STMT, along with the def-use update 4653 chain to appropriately advance the pointer through the loop iterations. 4654 Also set aliasing information for the pointer. This pointer is used by 4655 the callers to this function to create a memory reference expression for 4656 vector load/store access. 4657 4658 Input: 4659 1. STMT: a stmt that references memory. Expected to be of the form 4660 GIMPLE_ASSIGN <name, data-ref> or 4661 GIMPLE_ASSIGN <data-ref, name>. 4662 2. AGGR_TYPE: the type of the reference, which should be either a vector 4663 or an array. 4664 3. AT_LOOP: the loop where the vector memref is to be created. 4665 4. OFFSET (optional): an offset to be added to the initial address accessed 4666 by the data-ref in STMT. 4667 5. BSI: location where the new stmts are to be placed if there is no loop 4668 6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain 4669 pointing to the initial address. 4670 7. BYTE_OFFSET (optional, defaults to NULL): a byte offset to be added 4671 to the initial address accessed by the data-ref in STMT. This is 4672 similar to OFFSET, but OFFSET is counted in elements, while BYTE_OFFSET 4673 in bytes. 4674 8. IV_STEP (optional, defaults to NULL): the amount that should be added 4675 to the IV during each iteration of the loop. NULL says to move 4676 by one copy of AGGR_TYPE up or down, depending on the step of the 4677 data reference. 4678 4679 Output: 4680 1. Declare a new ptr to vector_type, and have it point to the base of the 4681 data reference (initial addressed accessed by the data reference). 4682 For example, for vector of type V8HI, the following code is generated: 4683 4684 v8hi *ap; 4685 ap = (v8hi *)initial_address; 4686 4687 if OFFSET is not supplied: 4688 initial_address = &a[init]; 4689 if OFFSET is supplied: 4690 initial_address = &a[init + OFFSET]; 4691 if BYTE_OFFSET is supplied: 4692 initial_address = &a[init] + BYTE_OFFSET; 4693 4694 Return the initial_address in INITIAL_ADDRESS. 4695 4696 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also 4697 update the pointer in each iteration of the loop. 4698 4699 Return the increment stmt that updates the pointer in PTR_INCR. 4700 4701 3. Set INV_P to true if the access pattern of the data reference in the 4702 vectorized loop is invariant. Set it to false otherwise. 4703 4704 4. Return the pointer. 
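   A simplified, hypothetical use from a load/store vectorizer could look like

	dataref_ptr = vect_create_data_ref_ptr (stmt, vectype, loop, NULL_TREE,
						&init_addr, gsi, &ptr_incr,
						false, &inv_p);
	data_ref = fold_build2 (MEM_REF, vectype, dataref_ptr,
				build_int_cst (ref_type, 0));

   (a sketch only; the real callers in tree-vect-stmts.c also pass
   BYTE_OFFSET and IV_STEP where needed and deal with grouped accesses).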
*/ 4705 4706 tree 4707 vect_create_data_ref_ptr (gimple *stmt, tree aggr_type, struct loop *at_loop, 4708 tree offset, tree *initial_address, 4709 gimple_stmt_iterator *gsi, gimple **ptr_incr, 4710 bool only_init, bool *inv_p, tree byte_offset, 4711 tree iv_step) 4712 { 4713 const char *base_name; 4714 stmt_vec_info stmt_info = vinfo_for_stmt (stmt); 4715 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); 4716 struct loop *loop = NULL; 4717 bool nested_in_vect_loop = false; 4718 struct loop *containing_loop = NULL; 4719 tree aggr_ptr_type; 4720 tree aggr_ptr; 4721 tree new_temp; 4722 gimple_seq new_stmt_list = NULL; 4723 edge pe = NULL; 4724 basic_block new_bb; 4725 tree aggr_ptr_init; 4726 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info); 4727 tree aptr; 4728 gimple_stmt_iterator incr_gsi; 4729 bool insert_after; 4730 tree indx_before_incr, indx_after_incr; 4731 gimple *incr; 4732 tree step; 4733 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info); 4734 4735 gcc_assert (iv_step != NULL_TREE 4736 || TREE_CODE (aggr_type) == ARRAY_TYPE 4737 || TREE_CODE (aggr_type) == VECTOR_TYPE); 4738 4739 if (loop_vinfo) 4740 { 4741 loop = LOOP_VINFO_LOOP (loop_vinfo); 4742 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt); 4743 containing_loop = (gimple_bb (stmt))->loop_father; 4744 pe = loop_preheader_edge (loop); 4745 } 4746 else 4747 { 4748 gcc_assert (bb_vinfo); 4749 only_init = true; 4750 *ptr_incr = NULL; 4751 } 4752 4753 /* Check the step (evolution) of the load in LOOP, and record 4754 whether it's invariant. */ 4755 step = vect_dr_behavior (dr)->step; 4756 if (integer_zerop (step)) 4757 *inv_p = true; 4758 else 4759 *inv_p = false; 4760 4761 /* Create an expression for the first address accessed by this load 4762 in LOOP. */ 4763 base_name = get_name (DR_BASE_ADDRESS (dr)); 4764 4765 if (dump_enabled_p ()) 4766 { 4767 tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr)); 4768 dump_printf_loc (MSG_NOTE, vect_location, 4769 "create %s-pointer variable to type: ", 4770 get_tree_code_name (TREE_CODE (aggr_type))); 4771 dump_generic_expr (MSG_NOTE, TDF_SLIM, aggr_type); 4772 if (TREE_CODE (dr_base_type) == ARRAY_TYPE) 4773 dump_printf (MSG_NOTE, " vectorizing an array ref: "); 4774 else if (TREE_CODE (dr_base_type) == VECTOR_TYPE) 4775 dump_printf (MSG_NOTE, " vectorizing a vector ref: "); 4776 else if (TREE_CODE (dr_base_type) == RECORD_TYPE) 4777 dump_printf (MSG_NOTE, " vectorizing a record based array ref: "); 4778 else 4779 dump_printf (MSG_NOTE, " vectorizing a pointer ref: "); 4780 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_BASE_OBJECT (dr)); 4781 dump_printf (MSG_NOTE, "\n"); 4782 } 4783 4784 /* (1) Create the new aggregate-pointer variable. 4785 Vector and array types inherit the alias set of their component 4786 type by default so we need to use a ref-all pointer if the data 4787 reference does not conflict with the created aggregated data 4788 reference because it is not addressable. */ 4789 bool need_ref_all = false; 4790 if (!alias_sets_conflict_p (get_alias_set (aggr_type), 4791 get_alias_set (DR_REF (dr)))) 4792 need_ref_all = true; 4793 /* Likewise for any of the data references in the stmt group. 
*/ 4794 else if (STMT_VINFO_GROUP_SIZE (stmt_info) > 1) 4795 { 4796 gimple *orig_stmt = STMT_VINFO_GROUP_FIRST_ELEMENT (stmt_info); 4797 do 4798 { 4799 stmt_vec_info sinfo = vinfo_for_stmt (orig_stmt); 4800 struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo); 4801 if (!alias_sets_conflict_p (get_alias_set (aggr_type), 4802 get_alias_set (DR_REF (sdr)))) 4803 { 4804 need_ref_all = true; 4805 break; 4806 } 4807 orig_stmt = STMT_VINFO_GROUP_NEXT_ELEMENT (sinfo); 4808 } 4809 while (orig_stmt); 4810 } 4811 aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode, 4812 need_ref_all); 4813 aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name); 4814 4815 4816 /* Note: If the dataref is in an inner-loop nested in LOOP, and we are 4817 vectorizing LOOP (i.e., outer-loop vectorization), we need to create two 4818 def-use update cycles for the pointer: one relative to the outer-loop 4819 (LOOP), which is what steps (3) and (4) below do. The other is relative 4820 to the inner-loop (which is the inner-most loop containing the dataref), 4821 and this is done be step (5) below. 4822 4823 When vectorizing inner-most loops, the vectorized loop (LOOP) is also the 4824 inner-most loop, and so steps (3),(4) work the same, and step (5) is 4825 redundant. Steps (3),(4) create the following: 4826 4827 vp0 = &base_addr; 4828 LOOP: vp1 = phi(vp0,vp2) 4829 ... 4830 ... 4831 vp2 = vp1 + step 4832 goto LOOP 4833 4834 If there is an inner-loop nested in loop, then step (5) will also be 4835 applied, and an additional update in the inner-loop will be created: 4836 4837 vp0 = &base_addr; 4838 LOOP: vp1 = phi(vp0,vp2) 4839 ... 4840 inner: vp3 = phi(vp1,vp4) 4841 vp4 = vp3 + inner_step 4842 if () goto inner 4843 ... 4844 vp2 = vp1 + step 4845 if () goto LOOP */ 4846 4847 /* (2) Calculate the initial address of the aggregate-pointer, and set 4848 the aggregate-pointer to point to it before the loop. */ 4849 4850 /* Create: (&(base[init_val+offset]+byte_offset) in the loop preheader. */ 4851 4852 new_temp = vect_create_addr_base_for_vector_ref (stmt, &new_stmt_list, 4853 offset, byte_offset); 4854 if (new_stmt_list) 4855 { 4856 if (pe) 4857 { 4858 new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list); 4859 gcc_assert (!new_bb); 4860 } 4861 else 4862 gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT); 4863 } 4864 4865 *initial_address = new_temp; 4866 aggr_ptr_init = new_temp; 4867 4868 /* (3) Handle the updating of the aggregate-pointer inside the loop. 4869 This is needed when ONLY_INIT is false, and also when AT_LOOP is the 4870 inner-loop nested in LOOP (during outer-loop vectorization). */ 4871 4872 /* No update in loop is required. */ 4873 if (only_init && (!loop_vinfo || at_loop == loop)) 4874 aptr = aggr_ptr_init; 4875 else 4876 { 4877 if (iv_step == NULL_TREE) 4878 { 4879 /* The step of the aggregate pointer is the type size. */ 4880 iv_step = TYPE_SIZE_UNIT (aggr_type); 4881 /* One exception to the above is when the scalar step of the load in 4882 LOOP is zero. In this case the step here is also zero. 
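	     (That is the invariant-access case, e.g. a load like a[n] with n
	     invariant in the loop being vectorized: the pointer is not
	     advanced and only its initial value matters.)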
*/ 4883 if (*inv_p) 4884 iv_step = size_zero_node; 4885 else if (tree_int_cst_sgn (step) == -1) 4886 iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step); 4887 } 4888 4889 standard_iv_increment_position (loop, &incr_gsi, &insert_after); 4890 4891 create_iv (aggr_ptr_init, 4892 fold_convert (aggr_ptr_type, iv_step), 4893 aggr_ptr, loop, &incr_gsi, insert_after, 4894 &indx_before_incr, &indx_after_incr); 4895 incr = gsi_stmt (incr_gsi); 4896 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo)); 4897 4898 /* Copy the points-to information if it exists. */ 4899 if (DR_PTR_INFO (dr)) 4900 { 4901 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr); 4902 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr); 4903 } 4904 if (ptr_incr) 4905 *ptr_incr = incr; 4906 4907 aptr = indx_before_incr; 4908 } 4909 4910 if (!nested_in_vect_loop || only_init) 4911 return aptr; 4912 4913 4914 /* (4) Handle the updating of the aggregate-pointer inside the inner-loop 4915 nested in LOOP, if exists. */ 4916 4917 gcc_assert (nested_in_vect_loop); 4918 if (!only_init) 4919 { 4920 standard_iv_increment_position (containing_loop, &incr_gsi, 4921 &insert_after); 4922 create_iv (aptr, fold_convert (aggr_ptr_type, DR_STEP (dr)), aggr_ptr, 4923 containing_loop, &incr_gsi, insert_after, &indx_before_incr, 4924 &indx_after_incr); 4925 incr = gsi_stmt (incr_gsi); 4926 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo)); 4927 4928 /* Copy the points-to information if it exists. */ 4929 if (DR_PTR_INFO (dr)) 4930 { 4931 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr); 4932 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr); 4933 } 4934 if (ptr_incr) 4935 *ptr_incr = incr; 4936 4937 return indx_before_incr; 4938 } 4939 else 4940 gcc_unreachable (); 4941 } 4942 4943 4944 /* Function bump_vector_ptr 4945 4946 Increment a pointer (to a vector type) by vector-size. If requested, 4947 i.e. if PTR-INCR is given, then also connect the new increment stmt 4948 to the existing def-use update-chain of the pointer, by modifying 4949 the PTR_INCR as illustrated below: 4950 4951 The pointer def-use update-chain before this function: 4952 DATAREF_PTR = phi (p_0, p_2) 4953 .... 4954 PTR_INCR: p_2 = DATAREF_PTR + step 4955 4956 The pointer def-use update-chain after this function: 4957 DATAREF_PTR = phi (p_0, p_2) 4958 .... 4959 NEW_DATAREF_PTR = DATAREF_PTR + BUMP 4960 .... 4961 PTR_INCR: p_2 = NEW_DATAREF_PTR + step 4962 4963 Input: 4964 DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated 4965 in the loop. 4966 PTR_INCR - optional. The stmt that updates the pointer in each iteration of 4967 the loop. The increment amount across iterations is expected 4968 to be vector_size. 4969 BSI - location where the new update stmt is to be placed. 4970 STMT - the original scalar memory-access stmt that is being vectorized. 4971 BUMP - optional. The offset by which to bump the pointer. If not given, 4972 the offset is assumed to be vector_size. 4973 4974 Output: Return NEW_DATAREF_PTR as illustrated above. 
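   E.g. a caller that emits several vector copies for one grouped access
   typically advances the pointer between copies with something like

     dataref_ptr = bump_vector_ptr (dataref_ptr, ptr_incr, gsi, stmt, bump);

   (an informal sketch of the usage in tree-vect-stmts.c).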
4975 4976 */ 4977 4978 tree 4979 bump_vector_ptr (tree dataref_ptr, gimple *ptr_incr, gimple_stmt_iterator *gsi, 4980 gimple *stmt, tree bump) 4981 { 4982 stmt_vec_info stmt_info = vinfo_for_stmt (stmt); 4983 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info); 4984 tree vectype = STMT_VINFO_VECTYPE (stmt_info); 4985 tree update = TYPE_SIZE_UNIT (vectype); 4986 gassign *incr_stmt; 4987 ssa_op_iter iter; 4988 use_operand_p use_p; 4989 tree new_dataref_ptr; 4990 4991 if (bump) 4992 update = bump; 4993 4994 if (TREE_CODE (dataref_ptr) == SSA_NAME) 4995 new_dataref_ptr = copy_ssa_name (dataref_ptr); 4996 else 4997 new_dataref_ptr = make_ssa_name (TREE_TYPE (dataref_ptr)); 4998 incr_stmt = gimple_build_assign (new_dataref_ptr, POINTER_PLUS_EXPR, 4999 dataref_ptr, update); 5000 vect_finish_stmt_generation (stmt, incr_stmt, gsi); 5001 5002 /* Copy the points-to information if it exists. */ 5003 if (DR_PTR_INFO (dr)) 5004 { 5005 duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr)); 5006 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr)); 5007 } 5008 5009 if (!ptr_incr) 5010 return new_dataref_ptr; 5011 5012 /* Update the vector-pointer's cross-iteration increment. */ 5013 FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE) 5014 { 5015 tree use = USE_FROM_PTR (use_p); 5016 5017 if (use == dataref_ptr) 5018 SET_USE (use_p, new_dataref_ptr); 5019 else 5020 gcc_assert (operand_equal_p (use, update, 0)); 5021 } 5022 5023 return new_dataref_ptr; 5024 } 5025 5026 5027 /* Copy memory reference info such as base/clique from the SRC reference 5028 to the DEST MEM_REF. */ 5029 5030 void 5031 vect_copy_ref_info (tree dest, tree src) 5032 { 5033 if (TREE_CODE (dest) != MEM_REF) 5034 return; 5035 5036 tree src_base = src; 5037 while (handled_component_p (src_base)) 5038 src_base = TREE_OPERAND (src_base, 0); 5039 if (TREE_CODE (src_base) != MEM_REF 5040 && TREE_CODE (src_base) != TARGET_MEM_REF) 5041 return; 5042 5043 MR_DEPENDENCE_CLIQUE (dest) = MR_DEPENDENCE_CLIQUE (src_base); 5044 MR_DEPENDENCE_BASE (dest) = MR_DEPENDENCE_BASE (src_base); 5045 } 5046 5047 5048 /* Function vect_create_destination_var. 5049 5050 Create a new temporary of type VECTYPE. */ 5051 5052 tree 5053 vect_create_destination_var (tree scalar_dest, tree vectype) 5054 { 5055 tree vec_dest; 5056 const char *name; 5057 char *new_name; 5058 tree type; 5059 enum vect_var_kind kind; 5060 5061 kind = vectype 5062 ? VECTOR_BOOLEAN_TYPE_P (vectype) 5063 ? vect_mask_var 5064 : vect_simple_var 5065 : vect_scalar_var; 5066 type = vectype ? vectype : TREE_TYPE (scalar_dest); 5067 5068 gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME); 5069 5070 name = get_name (scalar_dest); 5071 if (name) 5072 new_name = xasprintf ("%s_%u", name, SSA_NAME_VERSION (scalar_dest)); 5073 else 5074 new_name = xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest)); 5075 vec_dest = vect_get_new_vect_var (type, kind, new_name); 5076 free (new_name); 5077 5078 return vec_dest; 5079 } 5080 5081 /* Function vect_grouped_store_supported. 5082 5083 Returns TRUE if interleave high and interleave low permutations 5084 are supported, and FALSE otherwise. */ 5085 5086 bool 5087 vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count) 5088 { 5089 machine_mode mode = TYPE_MODE (vectype); 5090 5091 /* vect_permute_store_chain requires the group size to be equal to 3 or 5092 be a power of two. 
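     (So counts of 2, 3, 4, 8, 16, ... are accepted, while e.g. 5, 6 or 7 are
     rejected by the check below.)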
*/ 5093 if (count != 3 && exact_log2 (count) == -1)
5094 {
5095 if (dump_enabled_p ())
5096 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5097 "the size of the group of accesses"
5098 " is not a power of 2 or not equal to 3\n");
5099 return false;
5100 }
5101
5102 /* Check that the permutation is supported. */
5103 if (VECTOR_MODE_P (mode))
5104 {
5105 unsigned int i;
5106 if (count == 3)
5107 {
5108 unsigned int j0 = 0, j1 = 0, j2 = 0;
5109 unsigned int i, j;
5110
5111 unsigned int nelt;
5112 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5113 {
5114 if (dump_enabled_p ())
5115 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5116 "cannot handle groups of 3 stores for"
5117 " variable-length vectors\n");
5118 return false;
5119 }
5120
5121 vec_perm_builder sel (nelt, nelt, 1);
5122 sel.quick_grow (nelt);
5123 vec_perm_indices indices;
5124 for (j = 0; j < 3; j++)
5125 {
5126 int nelt0 = ((3 - j) * nelt) % 3;
5127 int nelt1 = ((3 - j) * nelt + 1) % 3;
5128 int nelt2 = ((3 - j) * nelt + 2) % 3;
5129 for (i = 0; i < nelt; i++)
5130 {
5131 if (3 * i + nelt0 < nelt)
5132 sel[3 * i + nelt0] = j0++;
5133 if (3 * i + nelt1 < nelt)
5134 sel[3 * i + nelt1] = nelt + j1++;
5135 if (3 * i + nelt2 < nelt)
5136 sel[3 * i + nelt2] = 0;
5137 }
5138 indices.new_vector (sel, 2, nelt);
5139 if (!can_vec_perm_const_p (mode, indices))
5140 {
5141 if (dump_enabled_p ())
5142 dump_printf (MSG_MISSED_OPTIMIZATION,
5143 "permutation op not supported by target.\n");
5144 return false;
5145 }
5146
5147 for (i = 0; i < nelt; i++)
5148 {
5149 if (3 * i + nelt0 < nelt)
5150 sel[3 * i + nelt0] = 3 * i + nelt0;
5151 if (3 * i + nelt1 < nelt)
5152 sel[3 * i + nelt1] = 3 * i + nelt1;
5153 if (3 * i + nelt2 < nelt)
5154 sel[3 * i + nelt2] = nelt + j2++;
5155 }
5156 indices.new_vector (sel, 2, nelt);
5157 if (!can_vec_perm_const_p (mode, indices))
5158 {
5159 if (dump_enabled_p ())
5160 dump_printf (MSG_MISSED_OPTIMIZATION,
5161 "permutation op not supported by target.\n");
5162 return false;
5163 }
5164 }
5165 return true;
5166 }
5167 else
5168 {
5169 /* If length is not equal to 3 then only power of 2 is supported. */
5170 gcc_assert (pow2p_hwi (count));
5171 poly_uint64 nelt = GET_MODE_NUNITS (mode);
5172
5173 /* The encoding has 2 interleaved stepped patterns. */
5174 vec_perm_builder sel (nelt, 2, 3);
5175 sel.quick_grow (6);
5176 for (i = 0; i < 3; i++)
5177 {
5178 sel[i * 2] = i;
5179 sel[i * 2 + 1] = i + nelt;
5180 }
5181 vec_perm_indices indices (sel, 2, nelt);
5182 if (can_vec_perm_const_p (mode, indices))
5183 {
5184 for (i = 0; i < 6; i++)
5185 sel[i] += exact_div (nelt, 2);
5186 indices.new_vector (sel, 2, nelt);
5187 if (can_vec_perm_const_p (mode, indices))
5188 return true;
5189 }
5190 }
5191 }
5192
5193 if (dump_enabled_p ())
5194 dump_printf (MSG_MISSED_OPTIMIZATION,
5195 "permutation op not supported by target.\n");
5196 return false;
5197 }
5198
5199
5200 /* Return TRUE if vec_{mask_}store_lanes is available for COUNT vectors of
5201 type VECTYPE. MASKED_P says whether the masked form is needed. */
5202
5203 bool
5204 vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
5205 bool masked_p)
5206 {
5207 if (masked_p)
5208 return vect_lanes_optab_supported_p ("vec_mask_store_lanes",
5209 vec_mask_store_lanes_optab,
5210 vectype, count);
5211 else
5212 return vect_lanes_optab_supported_p ("vec_store_lanes",
5213 vec_store_lanes_optab,
5214 vectype, count);
5215 }
5216
5217
5218 /* Function vect_permute_store_chain.
5219
5220 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
5221 a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
5222 the data correctly for the stores. Return the final references for stores
5223 in RESULT_CHAIN.
5224
5225 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5226 The input is 4 vectors each containing 8 elements. We assign a number to
5227 each element, the input sequence is:
5228
5229 1st vec: 0 1 2 3 4 5 6 7
5230 2nd vec: 8 9 10 11 12 13 14 15
5231 3rd vec: 16 17 18 19 20 21 22 23
5232 4th vec: 24 25 26 27 28 29 30 31
5233
5234 The output sequence should be:
5235
5236 1st vec: 0 8 16 24 1 9 17 25
5237 2nd vec: 2 10 18 26 3 11 19 27
5238 3rd vec: 4 12 20 28 5 13 21 29
5239 4th vec: 6 14 22 30 7 15 23 31
5240
5241 i.e., we interleave the contents of the four vectors in their order.
5242
5243 We use interleave_high/low instructions to create such output. The input of
5244 each interleave_high/low operation is two vectors:
5245 1st vec 2nd vec
5246 0 1 2 3 4 5 6 7
5247 the even elements of the result vector are obtained left-to-right from the
5248 high/low elements of the first vector. The odd elements of the result are
5249 obtained left-to-right from the high/low elements of the second vector.
5250 The output of interleave_high will be: 0 4 1 5
5251 and of interleave_low: 2 6 3 7
5252
5253
5254 The permutation is done in log LENGTH stages. In each stage interleave_high
5255 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
5256 where the first argument is taken from the first half of DR_CHAIN and the
5257 second argument from its second half.
5258 In our example,
5259
5260 I1: interleave_high (1st vec, 3rd vec)
5261 I2: interleave_low (1st vec, 3rd vec)
5262 I3: interleave_high (2nd vec, 4th vec)
5263 I4: interleave_low (2nd vec, 4th vec)
5264
5265 The output for the first stage is:
5266
5267 I1: 0 16 1 17 2 18 3 19
5268 I2: 4 20 5 21 6 22 7 23
5269 I3: 8 24 9 25 10 26 11 27
5270 I4: 12 28 13 29 14 30 15 31
5271
5272 The output of the second stage, i.e. the final result is:
5273
5274 I1: 0 8 16 24 1 9 17 25
5275 I2: 2 10 18 26 3 11 19 27
5276 I3: 4 12 20 28 5 13 21 29
5277 I4: 6 14 22 30 7 15 23 31. */
5278
5279 void
5280 vect_permute_store_chain (vec<tree> dr_chain,
5281 unsigned int length,
5282 gimple *stmt,
5283 gimple_stmt_iterator *gsi,
5284 vec<tree> *result_chain)
5285 {
5286 tree vect1, vect2, high, low;
5287 gimple *perm_stmt;
5288 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5289 tree perm_mask_low, perm_mask_high;
5290 tree data_ref;
5291 tree perm3_mask_low, perm3_mask_high;
5292 unsigned int i, j, n, log_length = exact_log2 (length);
5293
5294 result_chain->quick_grow (length);
5295 memcpy (result_chain->address (), dr_chain.address (),
5296 length * sizeof (tree));
5297
5298 if (length == 3)
5299 {
5300 /* vect_grouped_store_supported ensures that this is constant.
*/ 5301 unsigned int nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant (); 5302 unsigned int j0 = 0, j1 = 0, j2 = 0; 5303 5304 vec_perm_builder sel (nelt, nelt, 1); 5305 sel.quick_grow (nelt); 5306 vec_perm_indices indices; 5307 for (j = 0; j < 3; j++) 5308 { 5309 int nelt0 = ((3 - j) * nelt) % 3; 5310 int nelt1 = ((3 - j) * nelt + 1) % 3; 5311 int nelt2 = ((3 - j) * nelt + 2) % 3; 5312 5313 for (i = 0; i < nelt; i++) 5314 { 5315 if (3 * i + nelt0 < nelt) 5316 sel[3 * i + nelt0] = j0++; 5317 if (3 * i + nelt1 < nelt) 5318 sel[3 * i + nelt1] = nelt + j1++; 5319 if (3 * i + nelt2 < nelt) 5320 sel[3 * i + nelt2] = 0; 5321 } 5322 indices.new_vector (sel, 2, nelt); 5323 perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices); 5324 5325 for (i = 0; i < nelt; i++) 5326 { 5327 if (3 * i + nelt0 < nelt) 5328 sel[3 * i + nelt0] = 3 * i + nelt0; 5329 if (3 * i + nelt1 < nelt) 5330 sel[3 * i + nelt1] = 3 * i + nelt1; 5331 if (3 * i + nelt2 < nelt) 5332 sel[3 * i + nelt2] = nelt + j2++; 5333 } 5334 indices.new_vector (sel, 2, nelt); 5335 perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices); 5336 5337 vect1 = dr_chain[0]; 5338 vect2 = dr_chain[1]; 5339 5340 /* Create interleaving stmt: 5341 low = VEC_PERM_EXPR <vect1, vect2, 5342 {j, nelt, *, j + 1, nelt + j + 1, *, 5343 j + 2, nelt + j + 2, *, ...}> */ 5344 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low"); 5345 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1, 5346 vect2, perm3_mask_low); 5347 vect_finish_stmt_generation (stmt, perm_stmt, gsi); 5348 5349 vect1 = data_ref; 5350 vect2 = dr_chain[2]; 5351 /* Create interleaving stmt: 5352 low = VEC_PERM_EXPR <vect1, vect2, 5353 {0, 1, nelt + j, 3, 4, nelt + j + 1, 5354 6, 7, nelt + j + 2, ...}> */ 5355 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high"); 5356 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1, 5357 vect2, perm3_mask_high); 5358 vect_finish_stmt_generation (stmt, perm_stmt, gsi); 5359 (*result_chain)[j] = data_ref; 5360 } 5361 } 5362 else 5363 { 5364 /* If length is not equal to 3 then only power of 2 is supported. */ 5365 gcc_assert (pow2p_hwi (length)); 5366 5367 /* The encoding has 2 interleaved stepped patterns. 
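For example, for a vector of 8 elements the high mask built below expands
to { 0, 8, 1, 9, 2, 10, 3, 11 } and, after adding nelt/2 to each element,
the low mask to { 4, 12, 5, 13, 6, 14, 7, 15 }.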
*/ 5368 poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype); 5369 vec_perm_builder sel (nelt, 2, 3); 5370 sel.quick_grow (6); 5371 for (i = 0; i < 3; i++) 5372 { 5373 sel[i * 2] = i; 5374 sel[i * 2 + 1] = i + nelt; 5375 } 5376 vec_perm_indices indices (sel, 2, nelt); 5377 perm_mask_high = vect_gen_perm_mask_checked (vectype, indices); 5378 5379 for (i = 0; i < 6; i++) 5380 sel[i] += exact_div (nelt, 2); 5381 indices.new_vector (sel, 2, nelt); 5382 perm_mask_low = vect_gen_perm_mask_checked (vectype, indices); 5383 5384 for (i = 0, n = log_length; i < n; i++) 5385 { 5386 for (j = 0; j < length/2; j++) 5387 { 5388 vect1 = dr_chain[j]; 5389 vect2 = dr_chain[j+length/2]; 5390 5391 /* Create interleaving stmt: 5392 high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1, 5393 ...}> */ 5394 high = make_temp_ssa_name (vectype, NULL, "vect_inter_high"); 5395 perm_stmt = gimple_build_assign (high, VEC_PERM_EXPR, vect1, 5396 vect2, perm_mask_high); 5397 vect_finish_stmt_generation (stmt, perm_stmt, gsi); 5398 (*result_chain)[2*j] = high; 5399 5400 /* Create interleaving stmt: 5401 low = VEC_PERM_EXPR <vect1, vect2, 5402 {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1, 5403 ...}> */ 5404 low = make_temp_ssa_name (vectype, NULL, "vect_inter_low"); 5405 perm_stmt = gimple_build_assign (low, VEC_PERM_EXPR, vect1, 5406 vect2, perm_mask_low); 5407 vect_finish_stmt_generation (stmt, perm_stmt, gsi); 5408 (*result_chain)[2*j+1] = low; 5409 } 5410 memcpy (dr_chain.address (), result_chain->address (), 5411 length * sizeof (tree)); 5412 } 5413 } 5414 } 5415 5416 /* Function vect_setup_realignment 5417 5418 This function is called when vectorizing an unaligned load using 5419 the dr_explicit_realign[_optimized] scheme. 5420 This function generates the following code at the loop prolog: 5421 5422 p = initial_addr; 5423 x msq_init = *(floor(p)); # prolog load 5424 realignment_token = call target_builtin; 5425 loop: 5426 x msq = phi (msq_init, ---) 5427 5428 The stmts marked with x are generated only for the case of 5429 dr_explicit_realign_optimized. 5430 5431 The code above sets up a new (vector) pointer, pointing to the first 5432 location accessed by STMT, and a "floor-aligned" load using that pointer. 5433 It also generates code to compute the "realignment-token" (if the relevant 5434 target hook was defined), and creates a phi-node at the loop-header bb 5435 whose arguments are the result of the prolog-load (created by this 5436 function) and the result of a load that takes place in the loop (to be 5437 created by the caller to this function). 5438 5439 For the case of dr_explicit_realign_optimized: 5440 The caller to this function uses the phi-result (msq) to create the 5441 realignment code inside the loop, and sets up the missing phi argument, 5442 as follows: 5443 loop: 5444 msq = phi (msq_init, lsq) 5445 lsq = *(floor(p')); # load in loop 5446 result = realign_load (msq, lsq, realignment_token); 5447 5448 For the case of dr_explicit_realign: 5449 loop: 5450 msq = *(floor(p)); # load in loop 5451 p' = p + (VS-1); 5452 lsq = *(floor(p')); # load in loop 5453 result = realign_load (msq, lsq, realignment_token); 5454 5455 Input: 5456 STMT - (scalar) load stmt to be vectorized. This load accesses 5457 a memory location that may be unaligned. 5458 BSI - place where new code is to be inserted. 5459 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes 5460 is used. 5461 5462 Output: 5463 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load 5464 target hook, if defined. 
5465 Return value - the result of the loop-header phi node. */ 5466 5467 tree 5468 vect_setup_realignment (gimple *stmt, gimple_stmt_iterator *gsi, 5469 tree *realignment_token, 5470 enum dr_alignment_support alignment_support_scheme, 5471 tree init_addr, 5472 struct loop **at_loop) 5473 { 5474 stmt_vec_info stmt_info = vinfo_for_stmt (stmt); 5475 tree vectype = STMT_VINFO_VECTYPE (stmt_info); 5476 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); 5477 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info); 5478 struct loop *loop = NULL; 5479 edge pe = NULL; 5480 tree scalar_dest = gimple_assign_lhs (stmt); 5481 tree vec_dest; 5482 gimple *inc; 5483 tree ptr; 5484 tree data_ref; 5485 basic_block new_bb; 5486 tree msq_init = NULL_TREE; 5487 tree new_temp; 5488 gphi *phi_stmt; 5489 tree msq = NULL_TREE; 5490 gimple_seq stmts = NULL; 5491 bool inv_p; 5492 bool compute_in_loop = false; 5493 bool nested_in_vect_loop = false; 5494 struct loop *containing_loop = (gimple_bb (stmt))->loop_father; 5495 struct loop *loop_for_initial_load = NULL; 5496 5497 if (loop_vinfo) 5498 { 5499 loop = LOOP_VINFO_LOOP (loop_vinfo); 5500 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt); 5501 } 5502 5503 gcc_assert (alignment_support_scheme == dr_explicit_realign 5504 || alignment_support_scheme == dr_explicit_realign_optimized); 5505 5506 /* We need to generate three things: 5507 1. the misalignment computation 5508 2. the extra vector load (for the optimized realignment scheme). 5509 3. the phi node for the two vectors from which the realignment is 5510 done (for the optimized realignment scheme). */ 5511 5512 /* 1. Determine where to generate the misalignment computation. 5513 5514 If INIT_ADDR is NULL_TREE, this indicates that the misalignment 5515 calculation will be generated by this function, outside the loop (in the 5516 preheader). Otherwise, INIT_ADDR had already been computed for us by the 5517 caller, inside the loop. 5518 5519 Background: If the misalignment remains fixed throughout the iterations of 5520 the loop, then both realignment schemes are applicable, and also the 5521 misalignment computation can be done outside LOOP. This is because we are 5522 vectorizing LOOP, and so the memory accesses in LOOP advance in steps that 5523 are a multiple of VS (the Vector Size), and therefore the misalignment in 5524 different vectorized LOOP iterations is always the same. 5525 The problem arises only if the memory access is in an inner-loop nested 5526 inside LOOP, which is now being vectorized using outer-loop vectorization. 5527 This is the only case when the misalignment of the memory access may not 5528 remain fixed throughout the iterations of the inner-loop (as explained in 5529 detail in vect_supportable_dr_alignment). In this case, not only is the 5530 optimized realignment scheme not applicable, but also the misalignment 5531 computation (and generation of the realignment token that is passed to 5532 REALIGN_LOAD) have to be done inside the loop. 5533 5534 In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode 5535 or not, which in turn determines if the misalignment is computed inside 5536 the inner-loop, or outside LOOP. */ 5537 5538 if (init_addr != NULL_TREE || !loop_vinfo) 5539 { 5540 compute_in_loop = true; 5541 gcc_assert (alignment_support_scheme == dr_explicit_realign); 5542 } 5543 5544 5545 /* 2. Determine where to generate the extra vector load. 
5546 5547 For the optimized realignment scheme, instead of generating two vector 5548 loads in each iteration, we generate a single extra vector load in the 5549 preheader of the loop, and in each iteration reuse the result of the 5550 vector load from the previous iteration. In case the memory access is in 5551 an inner-loop nested inside LOOP, which is now being vectorized using 5552 outer-loop vectorization, we need to determine whether this initial vector 5553 load should be generated at the preheader of the inner-loop, or can be 5554 generated at the preheader of LOOP. If the memory access has no evolution 5555 in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has 5556 to be generated inside LOOP (in the preheader of the inner-loop). */ 5557 5558 if (nested_in_vect_loop) 5559 { 5560 tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info); 5561 bool invariant_in_outerloop = 5562 (tree_int_cst_compare (outerloop_step, size_zero_node) == 0); 5563 loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner); 5564 } 5565 else 5566 loop_for_initial_load = loop; 5567 if (at_loop) 5568 *at_loop = loop_for_initial_load; 5569 5570 if (loop_for_initial_load) 5571 pe = loop_preheader_edge (loop_for_initial_load); 5572 5573 /* 3. For the case of the optimized realignment, create the first vector 5574 load at the loop preheader. */ 5575 5576 if (alignment_support_scheme == dr_explicit_realign_optimized) 5577 { 5578 /* Create msq_init = *(floor(p1)) in the loop preheader */ 5579 gassign *new_stmt; 5580 5581 gcc_assert (!compute_in_loop); 5582 vec_dest = vect_create_destination_var (scalar_dest, vectype); 5583 ptr = vect_create_data_ref_ptr (stmt, vectype, loop_for_initial_load, 5584 NULL_TREE, &init_addr, NULL, &inc, 5585 true, &inv_p); 5586 if (TREE_CODE (ptr) == SSA_NAME) 5587 new_temp = copy_ssa_name (ptr); 5588 else 5589 new_temp = make_ssa_name (TREE_TYPE (ptr)); 5590 unsigned int align = DR_TARGET_ALIGNMENT (dr); 5591 new_stmt = gimple_build_assign 5592 (new_temp, BIT_AND_EXPR, ptr, 5593 build_int_cst (TREE_TYPE (ptr), -(HOST_WIDE_INT) align)); 5594 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt); 5595 gcc_assert (!new_bb); 5596 data_ref 5597 = build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp, 5598 build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0)); 5599 vect_copy_ref_info (data_ref, DR_REF (dr)); 5600 new_stmt = gimple_build_assign (vec_dest, data_ref); 5601 new_temp = make_ssa_name (vec_dest, new_stmt); 5602 gimple_assign_set_lhs (new_stmt, new_temp); 5603 if (pe) 5604 { 5605 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt); 5606 gcc_assert (!new_bb); 5607 } 5608 else 5609 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT); 5610 5611 msq_init = gimple_assign_lhs (new_stmt); 5612 } 5613 5614 /* 4. Create realignment token using a target builtin, if available. 5615 It is done either inside the containing loop, or before LOOP (as 5616 determined above). */ 5617 5618 if (targetm.vectorize.builtin_mask_for_load) 5619 { 5620 gcall *new_stmt; 5621 tree builtin_decl; 5622 5623 /* Compute INIT_ADDR - the initial addressed accessed by this memref. */ 5624 if (!init_addr) 5625 { 5626 /* Generate the INIT_ADDR computation outside LOOP. 
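The statements computing it are inserted on the preheader edge of LOOP
when there is one, and directly before GSI otherwise.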
*/ 5627 init_addr = vect_create_addr_base_for_vector_ref (stmt, &stmts, 5628 NULL_TREE); 5629 if (loop) 5630 { 5631 pe = loop_preheader_edge (loop); 5632 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts); 5633 gcc_assert (!new_bb); 5634 } 5635 else 5636 gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT); 5637 } 5638 5639 builtin_decl = targetm.vectorize.builtin_mask_for_load (); 5640 new_stmt = gimple_build_call (builtin_decl, 1, init_addr); 5641 vec_dest = 5642 vect_create_destination_var (scalar_dest, 5643 gimple_call_return_type (new_stmt)); 5644 new_temp = make_ssa_name (vec_dest, new_stmt); 5645 gimple_call_set_lhs (new_stmt, new_temp); 5646 5647 if (compute_in_loop) 5648 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT); 5649 else 5650 { 5651 /* Generate the misalignment computation outside LOOP. */ 5652 pe = loop_preheader_edge (loop); 5653 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt); 5654 gcc_assert (!new_bb); 5655 } 5656 5657 *realignment_token = gimple_call_lhs (new_stmt); 5658 5659 /* The result of the CALL_EXPR to this builtin is determined from 5660 the value of the parameter and no global variables are touched 5661 which makes the builtin a "const" function. Requiring the 5662 builtin to have the "const" attribute makes it unnecessary 5663 to call mark_call_clobbered. */ 5664 gcc_assert (TREE_READONLY (builtin_decl)); 5665 } 5666 5667 if (alignment_support_scheme == dr_explicit_realign) 5668 return msq; 5669 5670 gcc_assert (!compute_in_loop); 5671 gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized); 5672 5673 5674 /* 5. Create msq = phi <msq_init, lsq> in loop */ 5675 5676 pe = loop_preheader_edge (containing_loop); 5677 vec_dest = vect_create_destination_var (scalar_dest, vectype); 5678 msq = make_ssa_name (vec_dest); 5679 phi_stmt = create_phi_node (msq, containing_loop->header); 5680 add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION); 5681 5682 return msq; 5683 } 5684 5685 5686 /* Function vect_grouped_load_supported. 5687 5688 COUNT is the size of the load group (the number of statements plus the 5689 number of gaps). SINGLE_ELEMENT_P is true if there is actually 5690 only one statement, with a gap of COUNT - 1. 5691 5692 Returns true if a suitable permute exists. */ 5693 5694 bool 5695 vect_grouped_load_supported (tree vectype, bool single_element_p, 5696 unsigned HOST_WIDE_INT count) 5697 { 5698 machine_mode mode = TYPE_MODE (vectype); 5699 5700 /* If this is single-element interleaving with an element distance 5701 that leaves unused vector loads around punt - we at least create 5702 very sub-optimal code in that case (and blow up memory, 5703 see PR65518). */ 5704 if (single_element_p && maybe_gt (count, TYPE_VECTOR_SUBPARTS (vectype))) 5705 { 5706 if (dump_enabled_p ()) 5707 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 5708 "single-element interleaving not supported " 5709 "for not adjacent vector loads\n"); 5710 return false; 5711 } 5712 5713 /* vect_permute_load_chain requires the group size to be equal to 3 or 5714 be a power of two. */ 5715 if (count != 3 && exact_log2 (count) == -1) 5716 { 5717 if (dump_enabled_p ()) 5718 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 5719 "the size of the group of accesses" 5720 " is not a power of 2 or not equal to 3\n"); 5721 return false; 5722 } 5723 5724 /* Check that the permutation is supported. 
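For a group of 3 these are the two load-shuffle masks built below; for a
power-of-2 group they are the extract-even and extract-odd masks (for a
vector of 8 elements { 0, 2, 4, 6, 8, 10, 12, 14 } and
{ 1, 3, 5, 7, 9, 11, 13, 15 }).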
*/ 5725 if (VECTOR_MODE_P (mode)) 5726 { 5727 unsigned int i, j; 5728 if (count == 3) 5729 { 5730 unsigned int nelt; 5731 if (!GET_MODE_NUNITS (mode).is_constant (&nelt)) 5732 { 5733 if (dump_enabled_p ()) 5734 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 5735 "cannot handle groups of 3 loads for" 5736 " variable-length vectors\n"); 5737 return false; 5738 } 5739 5740 vec_perm_builder sel (nelt, nelt, 1); 5741 sel.quick_grow (nelt); 5742 vec_perm_indices indices; 5743 unsigned int k; 5744 for (k = 0; k < 3; k++) 5745 { 5746 for (i = 0; i < nelt; i++) 5747 if (3 * i + k < 2 * nelt) 5748 sel[i] = 3 * i + k; 5749 else 5750 sel[i] = 0; 5751 indices.new_vector (sel, 2, nelt); 5752 if (!can_vec_perm_const_p (mode, indices)) 5753 { 5754 if (dump_enabled_p ()) 5755 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 5756 "shuffle of 3 loads is not supported by" 5757 " target\n"); 5758 return false; 5759 } 5760 for (i = 0, j = 0; i < nelt; i++) 5761 if (3 * i + k < 2 * nelt) 5762 sel[i] = i; 5763 else 5764 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++); 5765 indices.new_vector (sel, 2, nelt); 5766 if (!can_vec_perm_const_p (mode, indices)) 5767 { 5768 if (dump_enabled_p ()) 5769 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 5770 "shuffle of 3 loads is not supported by" 5771 " target\n"); 5772 return false; 5773 } 5774 } 5775 return true; 5776 } 5777 else 5778 { 5779 /* If length is not equal to 3 then only power of 2 is supported. */ 5780 gcc_assert (pow2p_hwi (count)); 5781 poly_uint64 nelt = GET_MODE_NUNITS (mode); 5782 5783 /* The encoding has a single stepped pattern. */ 5784 vec_perm_builder sel (nelt, 1, 3); 5785 sel.quick_grow (3); 5786 for (i = 0; i < 3; i++) 5787 sel[i] = i * 2; 5788 vec_perm_indices indices (sel, 2, nelt); 5789 if (can_vec_perm_const_p (mode, indices)) 5790 { 5791 for (i = 0; i < 3; i++) 5792 sel[i] = i * 2 + 1; 5793 indices.new_vector (sel, 2, nelt); 5794 if (can_vec_perm_const_p (mode, indices)) 5795 return true; 5796 } 5797 } 5798 } 5799 5800 if (dump_enabled_p ()) 5801 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 5802 "extract even/odd not supported by target\n"); 5803 return false; 5804 } 5805 5806 /* Return TRUE if vec_{masked_}load_lanes is available for COUNT vectors of 5807 type VECTYPE. MASKED_P says whether the masked form is needed. */ 5808 5809 bool 5810 vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count, 5811 bool masked_p) 5812 { 5813 if (masked_p) 5814 return vect_lanes_optab_supported_p ("vec_mask_load_lanes", 5815 vec_mask_load_lanes_optab, 5816 vectype, count); 5817 else 5818 return vect_lanes_optab_supported_p ("vec_load_lanes", 5819 vec_load_lanes_optab, 5820 vectype, count); 5821 } 5822 5823 /* Function vect_permute_load_chain. 5824 5825 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be 5826 a power of 2 or equal to 3, generate extract_even/odd stmts to reorder 5827 the input data correctly. Return the final references for loads in 5828 RESULT_CHAIN. 5829 5830 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8. 5831 The input is 4 vectors each containing 8 elements. 
We assign a number to each 5832 element, the input sequence is: 5833 5834 1st vec: 0 1 2 3 4 5 6 7 5835 2nd vec: 8 9 10 11 12 13 14 15 5836 3rd vec: 16 17 18 19 20 21 22 23 5837 4th vec: 24 25 26 27 28 29 30 31 5838 5839 The output sequence should be: 5840 5841 1st vec: 0 4 8 12 16 20 24 28 5842 2nd vec: 1 5 9 13 17 21 25 29 5843 3rd vec: 2 6 10 14 18 22 26 30 5844 4th vec: 3 7 11 15 19 23 27 31 5845 5846 i.e., the first output vector should contain the first elements of each 5847 interleaving group, etc. 5848 5849 We use extract_even/odd instructions to create such output. The input of 5850 each extract_even/odd operation is two vectors 5851 1st vec 2nd vec 5852 0 1 2 3 4 5 6 7 5853 5854 and the output is the vector of extracted even/odd elements. The output of 5855 extract_even will be: 0 2 4 6 5856 and of extract_odd: 1 3 5 7 5857 5858 5859 The permutation is done in log LENGTH stages. In each stage extract_even 5860 and extract_odd stmts are created for each pair of vectors in DR_CHAIN in 5861 their order. In our example, 5862 5863 E1: extract_even (1st vec, 2nd vec) 5864 E2: extract_odd (1st vec, 2nd vec) 5865 E3: extract_even (3rd vec, 4th vec) 5866 E4: extract_odd (3rd vec, 4th vec) 5867 5868 The output for the first stage will be: 5869 5870 E1: 0 2 4 6 8 10 12 14 5871 E2: 1 3 5 7 9 11 13 15 5872 E3: 16 18 20 22 24 26 28 30 5873 E4: 17 19 21 23 25 27 29 31 5874 5875 In order to proceed and create the correct sequence for the next stage (or 5876 for the correct output, if the second stage is the last one, as in our 5877 example), we first put the output of extract_even operation and then the 5878 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN). 5879 The input for the second stage is: 5880 5881 1st vec (E1): 0 2 4 6 8 10 12 14 5882 2nd vec (E3): 16 18 20 22 24 26 28 30 5883 3rd vec (E2): 1 3 5 7 9 11 13 15 5884 4th vec (E4): 17 19 21 23 25 27 29 31 5885 5886 The output of the second stage: 5887 5888 E1: 0 4 8 12 16 20 24 28 5889 E2: 2 6 10 14 18 22 26 30 5890 E3: 1 5 9 13 17 21 25 29 5891 E4: 3 7 11 15 19 23 27 31 5892 5893 And RESULT_CHAIN after reordering: 5894 5895 1st vec (E1): 0 4 8 12 16 20 24 28 5896 2nd vec (E3): 1 5 9 13 17 21 25 29 5897 3rd vec (E2): 2 6 10 14 18 22 26 30 5898 4th vec (E4): 3 7 11 15 19 23 27 31. */ 5899 5900 static void 5901 vect_permute_load_chain (vec<tree> dr_chain, 5902 unsigned int length, 5903 gimple *stmt, 5904 gimple_stmt_iterator *gsi, 5905 vec<tree> *result_chain) 5906 { 5907 tree data_ref, first_vect, second_vect; 5908 tree perm_mask_even, perm_mask_odd; 5909 tree perm3_mask_low, perm3_mask_high; 5910 gimple *perm_stmt; 5911 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt)); 5912 unsigned int i, j, log_length = exact_log2 (length); 5913 5914 result_chain->quick_grow (length); 5915 memcpy (result_chain->address (), dr_chain.address (), 5916 length * sizeof (tree)); 5917 5918 if (length == 3) 5919 { 5920 /* vect_grouped_load_supported ensures that this is constant. 
*/ 5921 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant (); 5922 unsigned int k; 5923 5924 vec_perm_builder sel (nelt, nelt, 1); 5925 sel.quick_grow (nelt); 5926 vec_perm_indices indices; 5927 for (k = 0; k < 3; k++) 5928 { 5929 for (i = 0; i < nelt; i++) 5930 if (3 * i + k < 2 * nelt) 5931 sel[i] = 3 * i + k; 5932 else 5933 sel[i] = 0; 5934 indices.new_vector (sel, 2, nelt); 5935 perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices); 5936 5937 for (i = 0, j = 0; i < nelt; i++) 5938 if (3 * i + k < 2 * nelt) 5939 sel[i] = i; 5940 else 5941 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++); 5942 indices.new_vector (sel, 2, nelt); 5943 perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices); 5944 5945 first_vect = dr_chain[0]; 5946 second_vect = dr_chain[1]; 5947 5948 /* Create interleaving stmt (low part of): 5949 low = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k, 5950 ...}> */ 5951 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low"); 5952 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect, 5953 second_vect, perm3_mask_low); 5954 vect_finish_stmt_generation (stmt, perm_stmt, gsi); 5955 5956 /* Create interleaving stmt (high part of): 5957 high = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k, 5958 ...}> */ 5959 first_vect = data_ref; 5960 second_vect = dr_chain[2]; 5961 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high"); 5962 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect, 5963 second_vect, perm3_mask_high); 5964 vect_finish_stmt_generation (stmt, perm_stmt, gsi); 5965 (*result_chain)[k] = data_ref; 5966 } 5967 } 5968 else 5969 { 5970 /* If length is not equal to 3 then only power of 2 is supported. */ 5971 gcc_assert (pow2p_hwi (length)); 5972 5973 /* The encoding has a single stepped pattern. */ 5974 poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype); 5975 vec_perm_builder sel (nelt, 1, 3); 5976 sel.quick_grow (3); 5977 for (i = 0; i < 3; ++i) 5978 sel[i] = i * 2; 5979 vec_perm_indices indices (sel, 2, nelt); 5980 perm_mask_even = vect_gen_perm_mask_checked (vectype, indices); 5981 5982 for (i = 0; i < 3; ++i) 5983 sel[i] = i * 2 + 1; 5984 indices.new_vector (sel, 2, nelt); 5985 perm_mask_odd = vect_gen_perm_mask_checked (vectype, indices); 5986 5987 for (i = 0; i < log_length; i++) 5988 { 5989 for (j = 0; j < length; j += 2) 5990 { 5991 first_vect = dr_chain[j]; 5992 second_vect = dr_chain[j+1]; 5993 5994 /* data_ref = permute_even (first_data_ref, second_data_ref); */ 5995 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even"); 5996 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, 5997 first_vect, second_vect, 5998 perm_mask_even); 5999 vect_finish_stmt_generation (stmt, perm_stmt, gsi); 6000 (*result_chain)[j/2] = data_ref; 6001 6002 /* data_ref = permute_odd (first_data_ref, second_data_ref); */ 6003 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd"); 6004 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, 6005 first_vect, second_vect, 6006 perm_mask_odd); 6007 vect_finish_stmt_generation (stmt, perm_stmt, gsi); 6008 (*result_chain)[j/2+length/2] = data_ref; 6009 } 6010 memcpy (dr_chain.address (), result_chain->address (), 6011 length * sizeof (tree)); 6012 } 6013 } 6014 } 6015 6016 /* Function vect_shift_permute_load_chain. 6017 6018 Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate 6019 sequence of stmts to reorder the input data accordingly. 6020 Return the final references for loads in RESULT_CHAIN. 
6021 Return true if succeeded, false otherwise.
6022
6023 E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
6024 The input is 3 vectors each containing 8 elements. We assign a
6025 number to each element, the input sequence is:
6026
6027 1st vec: 0 1 2 3 4 5 6 7
6028 2nd vec: 8 9 10 11 12 13 14 15
6029 3rd vec: 16 17 18 19 20 21 22 23
6030
6031 The output sequence should be:
6032
6033 1st vec: 0 3 6 9 12 15 18 21
6034 2nd vec: 1 4 7 10 13 16 19 22
6035 3rd vec: 2 5 8 11 14 17 20 23
6036
6037 We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.
6038
6039 First we shuffle all 3 vectors to get the correct element order:
6040
6041 1st vec: ( 0 3 6) ( 1 4 7) ( 2 5)
6042 2nd vec: ( 8 11 14) ( 9 12 15) (10 13)
6043 3rd vec: (16 19 22) (17 20 23) (18 21)
6044
6045 Next we unite and shift the vectors 3 times:
6046
6047 1st step:
6048 shift right by 6 the concatenation of:
6049 "1st vec" and "2nd vec"
6050 ( 0 3 6) ( 1 4 7) |( 2 5) _ ( 8 11 14) ( 9 12 15)| (10 13)
6051 "2nd vec" and "3rd vec"
6052 ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
6053 "3rd vec" and "1st vec"
6054 (16 19 22) (17 20 23) |(18 21) _ ( 0 3 6) ( 1 4 7)| ( 2 5)
6055 | New vectors |
6056
6057 So that now new vectors are:
6058
6059 1st vec: ( 2 5) ( 8 11 14) ( 9 12 15)
6060 2nd vec: (10 13) (16 19 22) (17 20 23)
6061 3rd vec: (18 21) ( 0 3 6) ( 1 4 7)
6062
6063 2nd step:
6064 shift right by 5 the concatenation of:
6065 "1st vec" and "3rd vec"
6066 ( 2 5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0 3 6)| ( 1 4 7)
6067 "2nd vec" and "1st vec"
6068 (10 13) (16 19 22) |(17 20 23) _ ( 2 5) ( 8 11 14)| ( 9 12 15)
6069 "3rd vec" and "2nd vec"
6070 (18 21) ( 0 3 6) |( 1 4 7) _ (10 13) (16 19 22)| (17 20 23)
6071 | New vectors |
6072
6073 So that now new vectors are:
6074
6075 1st vec: ( 9 12 15) (18 21) ( 0 3 6)
6076 2nd vec: (17 20 23) ( 2 5) ( 8 11 14)
6077 3rd vec: ( 1 4 7) (10 13) (16 19 22) READY
6078
6079 3rd step:
6080 shift right by 5 the concatenation of:
6081 "1st vec" and "1st vec"
6082 ( 9 12 15) (18 21) |( 0 3 6) _ ( 9 12 15) (18 21)| ( 0 3 6)
6083 shift right by 3 the concatenation of:
6084 "2nd vec" and "2nd vec"
6085 (17 20 23) |( 2 5) ( 8 11 14) _ (17 20 23)| ( 2 5) ( 8 11 14)
6086 | New vectors |
6087
6088 So that now all vectors are READY:
6089 1st vec: ( 0 3 6) ( 9 12 15) (18 21)
6090 2nd vec: ( 2 5) ( 8 11 14) (17 20 23)
6091 3rd vec: ( 1 4 7) (10 13) (16 19 22)
6092
6093 This algorithm is faster than the one in vect_permute_load_chain if:
6094 1. "shift of a concatenation" is faster than general permutation.
6095 This is usually so.
6096 2. The TARGET machine can't execute vector instructions in parallel.
6097 This is because each step of the algorithm depends on the previous one.
6098 The algorithm in vect_permute_load_chain is much more parallel.
6099
6100 The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.
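For the three-vector case the code below implements the initial shuffle
with perm3_mask and, for a vector of 8 elements, the shift steps with
shift1_mask (shift by 6), shift2_mask (shift by 5) and
shift3_mask/shift4_mask (shifts by 3 and 5). The number of elements and
VF must also be compile-time constants, with VF > 4 for the power-of-two
case and VF > 2 for the three-vector case; otherwise false is returned.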
6101 */ 6102 6103 static bool 6104 vect_shift_permute_load_chain (vec<tree> dr_chain, 6105 unsigned int length, 6106 gimple *stmt, 6107 gimple_stmt_iterator *gsi, 6108 vec<tree> *result_chain) 6109 { 6110 tree vect[3], vect_shift[3], data_ref, first_vect, second_vect; 6111 tree perm2_mask1, perm2_mask2, perm3_mask; 6112 tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask; 6113 gimple *perm_stmt; 6114 6115 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt)); 6116 unsigned int i; 6117 stmt_vec_info stmt_info = vinfo_for_stmt (stmt); 6118 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); 6119 6120 unsigned HOST_WIDE_INT nelt, vf; 6121 if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nelt) 6122 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&vf)) 6123 /* Not supported for variable-length vectors. */ 6124 return false; 6125 6126 vec_perm_builder sel (nelt, nelt, 1); 6127 sel.quick_grow (nelt); 6128 6129 result_chain->quick_grow (length); 6130 memcpy (result_chain->address (), dr_chain.address (), 6131 length * sizeof (tree)); 6132 6133 if (pow2p_hwi (length) && vf > 4) 6134 { 6135 unsigned int j, log_length = exact_log2 (length); 6136 for (i = 0; i < nelt / 2; ++i) 6137 sel[i] = i * 2; 6138 for (i = 0; i < nelt / 2; ++i) 6139 sel[nelt / 2 + i] = i * 2 + 1; 6140 vec_perm_indices indices (sel, 2, nelt); 6141 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices)) 6142 { 6143 if (dump_enabled_p ()) 6144 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 6145 "shuffle of 2 fields structure is not \ 6146 supported by target\n"); 6147 return false; 6148 } 6149 perm2_mask1 = vect_gen_perm_mask_checked (vectype, indices); 6150 6151 for (i = 0; i < nelt / 2; ++i) 6152 sel[i] = i * 2 + 1; 6153 for (i = 0; i < nelt / 2; ++i) 6154 sel[nelt / 2 + i] = i * 2; 6155 indices.new_vector (sel, 2, nelt); 6156 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices)) 6157 { 6158 if (dump_enabled_p ()) 6159 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 6160 "shuffle of 2 fields structure is not \ 6161 supported by target\n"); 6162 return false; 6163 } 6164 perm2_mask2 = vect_gen_perm_mask_checked (vectype, indices); 6165 6166 /* Generating permutation constant to shift all elements. 6167 For vector length 8 it is {4 5 6 7 8 9 10 11}. */ 6168 for (i = 0; i < nelt; i++) 6169 sel[i] = nelt / 2 + i; 6170 indices.new_vector (sel, 2, nelt); 6171 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices)) 6172 { 6173 if (dump_enabled_p ()) 6174 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 6175 "shift permutation is not supported by target\n"); 6176 return false; 6177 } 6178 shift1_mask = vect_gen_perm_mask_checked (vectype, indices); 6179 6180 /* Generating permutation constant to select vector from 2. 6181 For vector length 8 it is {0 1 2 3 12 13 14 15}. 
*/
6182 for (i = 0; i < nelt / 2; i++)
6183 sel[i] = i;
6184 for (i = nelt / 2; i < nelt; i++)
6185 sel[i] = nelt + i;
6186 indices.new_vector (sel, 2, nelt);
6187 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6188 {
6189 if (dump_enabled_p ())
6190 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6191 "select is not supported by target\n");
6192 return false;
6193 }
6194 select_mask = vect_gen_perm_mask_checked (vectype, indices);
6195
6196 for (i = 0; i < log_length; i++)
6197 {
6198 for (j = 0; j < length; j += 2)
6199 {
6200 first_vect = dr_chain[j];
6201 second_vect = dr_chain[j + 1];
6202
6203 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
6204 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6205 first_vect, first_vect,
6206 perm2_mask1);
6207 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6208 vect[0] = data_ref;
6209
6210 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
6211 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6212 second_vect, second_vect,
6213 perm2_mask2);
6214 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6215 vect[1] = data_ref;
6216
6217 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift");
6218 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6219 vect[0], vect[1], shift1_mask);
6220 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6221 (*result_chain)[j/2 + length/2] = data_ref;
6222
6223 data_ref = make_temp_ssa_name (vectype, NULL, "vect_select");
6224 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6225 vect[0], vect[1], select_mask);
6226 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6227 (*result_chain)[j/2] = data_ref;
6228 }
6229 memcpy (dr_chain.address (), result_chain->address (),
6230 length * sizeof (tree));
6231 }
6232 return true;
6233 }
6234 if (length == 3 && vf > 2)
6235 {
6236 unsigned int k = 0, l = 0;
6237
6238 /* Generating permutation constant to get all elements in the right order.
6239 For vector length 8 it is {0 3 6 1 4 7 2 5}. */
6240 for (i = 0; i < nelt; i++)
6241 {
6242 if (3 * k + (l % 3) >= nelt)
6243 {
6244 k = 0;
6245 l += (3 - (nelt % 3));
6246 }
6247 sel[i] = 3 * k + (l % 3);
6248 k++;
6249 }
6250 vec_perm_indices indices (sel, 2, nelt);
6251 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6252 {
6253 if (dump_enabled_p ())
6254 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6255 "shuffle of 3 fields structure is not \
6256 supported by target\n");
6257 return false;
6258 }
6259 perm3_mask = vect_gen_perm_mask_checked (vectype, indices);
6260
6261 /* Generating permutation constant to shift all elements.
6262 For vector length 8 it is {6 7 8 9 10 11 12 13}. */
6263 for (i = 0; i < nelt; i++)
6264 sel[i] = 2 * (nelt / 3) + (nelt % 3) + i;
6265 indices.new_vector (sel, 2, nelt);
6266 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6267 {
6268 if (dump_enabled_p ())
6269 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6270 "shift permutation is not supported by target\n");
6271 return false;
6272 }
6273 shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
6274
6275 /* Generating permutation constant to shift all elements.
6276 For vector length 8 it is {5 6 7 8 9 10 11 12}.
*/ 6277 for (i = 0; i < nelt; i++) 6278 sel[i] = 2 * (nelt / 3) + 1 + i; 6279 indices.new_vector (sel, 2, nelt); 6280 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices)) 6281 { 6282 if (dump_enabled_p ()) 6283 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 6284 "shift permutation is not supported by target\n"); 6285 return false; 6286 } 6287 shift2_mask = vect_gen_perm_mask_checked (vectype, indices); 6288 6289 /* Generating permutation constant to shift all elements. 6290 For vector length 8 it is {3 4 5 6 7 8 9 10}. */ 6291 for (i = 0; i < nelt; i++) 6292 sel[i] = (nelt / 3) + (nelt % 3) / 2 + i; 6293 indices.new_vector (sel, 2, nelt); 6294 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices)) 6295 { 6296 if (dump_enabled_p ()) 6297 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 6298 "shift permutation is not supported by target\n"); 6299 return false; 6300 } 6301 shift3_mask = vect_gen_perm_mask_checked (vectype, indices); 6302 6303 /* Generating permutation constant to shift all elements. 6304 For vector length 8 it is {5 6 7 8 9 10 11 12}. */ 6305 for (i = 0; i < nelt; i++) 6306 sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i; 6307 indices.new_vector (sel, 2, nelt); 6308 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices)) 6309 { 6310 if (dump_enabled_p ()) 6311 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 6312 "shift permutation is not supported by target\n"); 6313 return false; 6314 } 6315 shift4_mask = vect_gen_perm_mask_checked (vectype, indices); 6316 6317 for (k = 0; k < 3; k++) 6318 { 6319 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3"); 6320 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, 6321 dr_chain[k], dr_chain[k], 6322 perm3_mask); 6323 vect_finish_stmt_generation (stmt, perm_stmt, gsi); 6324 vect[k] = data_ref; 6325 } 6326 6327 for (k = 0; k < 3; k++) 6328 { 6329 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1"); 6330 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, 6331 vect[k % 3], vect[(k + 1) % 3], 6332 shift1_mask); 6333 vect_finish_stmt_generation (stmt, perm_stmt, gsi); 6334 vect_shift[k] = data_ref; 6335 } 6336 6337 for (k = 0; k < 3; k++) 6338 { 6339 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2"); 6340 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, 6341 vect_shift[(4 - k) % 3], 6342 vect_shift[(3 - k) % 3], 6343 shift2_mask); 6344 vect_finish_stmt_generation (stmt, perm_stmt, gsi); 6345 vect[k] = data_ref; 6346 } 6347 6348 (*result_chain)[3 - (nelt % 3)] = vect[2]; 6349 6350 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3"); 6351 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[0], 6352 vect[0], shift3_mask); 6353 vect_finish_stmt_generation (stmt, perm_stmt, gsi); 6354 (*result_chain)[nelt % 3] = data_ref; 6355 6356 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4"); 6357 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[1], 6358 vect[1], shift4_mask); 6359 vect_finish_stmt_generation (stmt, perm_stmt, gsi); 6360 (*result_chain)[0] = data_ref; 6361 return true; 6362 } 6363 return false; 6364 } 6365 6366 /* Function vect_transform_grouped_load. 6367 6368 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements 6369 to perform their permutation and ascribe the result vectorized statements to 6370 the scalar statements. 
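The reordering is done by vect_permute_load_chain or, when the target is
not expected to execute the permutations in parallel and the group size is
not a power of two, by vect_shift_permute_load_chain if it can handle the
chain (see the heuristic in the function body).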
6371 */
6372
6373 void
6374 vect_transform_grouped_load (gimple *stmt, vec<tree> dr_chain, int size,
6375 gimple_stmt_iterator *gsi)
6376 {
6377 machine_mode mode;
6378 vec<tree> result_chain = vNULL;
6379
6380 /* DR_CHAIN contains input data-refs that are a part of the interleaving.
6381 RESULT_CHAIN is the output of vect_permute_load_chain; it contains permuted
6382 vectors that are ready for vector computation. */
6383 result_chain.create (size);
6384
6385 /* If the reassociation width for the vector type is 2 or greater, the target
6386 machine can execute 2 or more vector instructions in parallel. Otherwise
6387 try to reorder the load group using vect_shift_permute_load_chain. */
6388 mode = TYPE_MODE (STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt)));
6389 if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1
6390 || pow2p_hwi (size)
6391 || !vect_shift_permute_load_chain (dr_chain, size, stmt,
6392 gsi, &result_chain))
6393 vect_permute_load_chain (dr_chain, size, stmt, gsi, &result_chain);
6394 vect_record_grouped_load_vectors (stmt, result_chain);
6395 result_chain.release ();
6396 }
6397
6398 /* RESULT_CHAIN contains the output of a group of grouped loads that were
6399 generated as part of the vectorization of STMT. Assign the statement
6400 for each vector to the associated scalar statement. */
6401
6402 void
6403 vect_record_grouped_load_vectors (gimple *stmt, vec<tree> result_chain)
6404 {
6405 gimple *first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt));
6406 gimple *next_stmt, *new_stmt;
6407 unsigned int i, gap_count;
6408 tree tmp_data_ref;
6409
6410 /* Put a permuted data-ref in the VECTORIZED_STMT field.
6411 Since we scan the chain starting from its first node, their order
6412 corresponds to the order of data-refs in RESULT_CHAIN. */
6413 next_stmt = first_stmt;
6414 gap_count = 1;
6415 FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
6416 {
6417 if (!next_stmt)
6418 break;
6419
6420 /* Skip the gaps. Loads created for the gaps will be removed by the dead
6421 code elimination pass later. No need to check for the first stmt in
6422 the group, since it always exists.
6423 GROUP_GAP is the number of steps in elements from the previous
6424 access (if there is no gap GROUP_GAP is 1). We skip loads that
6425 correspond to the gaps. */
6426 if (next_stmt != first_stmt
6427 && gap_count < GROUP_GAP (vinfo_for_stmt (next_stmt)))
6428 {
6429 gap_count++;
6430 continue;
6431 }
6432
6433 while (next_stmt)
6434 {
6435 new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
6436 /* We assume that if VEC_STMT is not NULL, this is a case of multiple
6437 copies, and we put the new vector statement in the first available
6438 RELATED_STMT.
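That is, the new statement is appended at the end of the RELATED_STMT
chain that is walked below.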
*/ 6439 if (!STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt))) 6440 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)) = new_stmt; 6441 else 6442 { 6443 if (!GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt))) 6444 { 6445 gimple *prev_stmt = 6446 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)); 6447 gimple *rel_stmt = 6448 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt)); 6449 while (rel_stmt) 6450 { 6451 prev_stmt = rel_stmt; 6452 rel_stmt = 6453 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (rel_stmt)); 6454 } 6455 6456 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt)) = 6457 new_stmt; 6458 } 6459 } 6460 6461 next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt)); 6462 gap_count = 1; 6463 /* If NEXT_STMT accesses the same DR as the previous statement, 6464 put the same TMP_DATA_REF as its vectorized statement; otherwise 6465 get the next data-ref from RESULT_CHAIN. */ 6466 if (!next_stmt || !GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt))) 6467 break; 6468 } 6469 } 6470 } 6471 6472 /* Function vect_force_dr_alignment_p. 6473 6474 Returns whether the alignment of a DECL can be forced to be aligned 6475 on ALIGNMENT bit boundary. */ 6476 6477 bool 6478 vect_can_force_dr_alignment_p (const_tree decl, unsigned int alignment) 6479 { 6480 if (!VAR_P (decl)) 6481 return false; 6482 6483 if (decl_in_symtab_p (decl) 6484 && !symtab_node::get (decl)->can_increase_alignment_p ()) 6485 return false; 6486 6487 if (TREE_STATIC (decl)) 6488 return (alignment <= MAX_OFILE_ALIGNMENT); 6489 else 6490 return (alignment <= MAX_STACK_ALIGNMENT); 6491 } 6492 6493 6494 /* Return whether the data reference DR is supported with respect to its 6495 alignment. 6496 If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even 6497 it is aligned, i.e., check if it is possible to vectorize it with different 6498 alignment. */ 6499 6500 enum dr_alignment_support 6501 vect_supportable_dr_alignment (struct data_reference *dr, 6502 bool check_aligned_accesses) 6503 { 6504 gimple *stmt = DR_STMT (dr); 6505 stmt_vec_info stmt_info = vinfo_for_stmt (stmt); 6506 tree vectype = STMT_VINFO_VECTYPE (stmt_info); 6507 machine_mode mode = TYPE_MODE (vectype); 6508 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); 6509 struct loop *vect_loop = NULL; 6510 bool nested_in_vect_loop = false; 6511 6512 if (aligned_access_p (dr) && !check_aligned_accesses) 6513 return dr_aligned; 6514 6515 /* For now assume all conditional loads/stores support unaligned 6516 access without any special code. */ 6517 if (is_gimple_call (stmt) 6518 && gimple_call_internal_p (stmt) 6519 && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD 6520 || gimple_call_internal_fn (stmt) == IFN_MASK_STORE)) 6521 return dr_unaligned_supported; 6522 6523 if (loop_vinfo) 6524 { 6525 vect_loop = LOOP_VINFO_LOOP (loop_vinfo); 6526 nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt); 6527 } 6528 6529 /* Possibly unaligned access. */ 6530 6531 /* We can choose between using the implicit realignment scheme (generating 6532 a misaligned_move stmt) and the explicit realignment scheme (generating 6533 aligned loads with a REALIGN_LOAD). There are two variants to the 6534 explicit realignment scheme: optimized, and unoptimized. 6535 We can optimize the realignment only if the step between consecutive 6536 vector loads is equal to the vector size. 
Since the vector memory
6537 accesses advance in steps of VS (Vector Size) in the vectorized loop, it
6538 is guaranteed that the misalignment amount remains the same throughout the
6539 execution of the vectorized loop. Therefore, we can create the
6540 "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
6541 at the loop preheader.
6542
6543 However, in the case of outer-loop vectorization, when vectorizing a
6544 memory access in the inner-loop nested within the LOOP that is now being
6545 vectorized, while it is guaranteed that the misalignment of the
6546 vectorized memory access will remain the same in different outer-loop
6547 iterations, it is *not* guaranteed that it will remain the same throughout
6548 the execution of the inner-loop. This is because the inner-loop advances
6549 with the original scalar step (and not in steps of VS). If the inner-loop
6550 step happens to be a multiple of VS, then the misalignment remains fixed
6551 and we can use the optimized realignment scheme. For example:
6552
6553 for (i=0; i<N; i++)
6554 for (j=0; j<M; j++)
6555 s += a[i+j];
6556
6557 When vectorizing the i-loop in the above example, the step between
6558 consecutive vector loads is 1, and so the misalignment does not remain
6559 fixed across the execution of the inner-loop, and the realignment cannot
6560 be optimized (as illustrated in the following pseudo vectorized loop):
6561
6562 for (i=0; i<N; i+=4)
6563 for (j=0; j<M; j++){
6564 vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
6565 // when j is {0,1,2,3,4,5,6,7,...} respectively.
6566 // (assuming that we start from an aligned address).
6567 }
6568
6569 We therefore have to use the unoptimized realignment scheme:
6570
6571 for (i=0; i<N; i+=4)
6572 for (j=k; j<M; j+=4)
6573 vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
6574 // that the misalignment of the initial address is
6575 // 0).
6576
6577 The loop can then be vectorized as follows:
6578
6579 for (k=0; k<4; k++){
6580 rt = get_realignment_token (&vp[k]);
6581 for (i=0; i<N; i+=4){
6582 v1 = vp[i+k];
6583 for (j=k; j<M; j+=4){
6584 v2 = vp[i+j+VS-1];
6585 va = REALIGN_LOAD <v1,v2,rt>;
6586 vs += va;
6587 v1 = v2;
6588 }
6589 }
6590 } */
6591
6592 if (DR_IS_READ (dr))
6593 {
6594 bool is_packed = false;
6595 tree type = (TREE_TYPE (DR_REF (dr)));
6596
6597 if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
6598 && (!targetm.vectorize.builtin_mask_for_load
6599 || targetm.vectorize.builtin_mask_for_load ()))
6600 {
6601 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6602
6603 /* If we are doing SLP then the accesses need not have the
6604 same alignment; instead it depends on the SLP group size. */
6605 if (loop_vinfo
6606 && STMT_SLP_TYPE (stmt_info)
6607 && !multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
6608 * GROUP_SIZE (vinfo_for_stmt
6609 (GROUP_FIRST_ELEMENT (stmt_info))),
6610 TYPE_VECTOR_SUBPARTS (vectype)))
6611 ;
6612 else if (!loop_vinfo
6613 || (nested_in_vect_loop
6614 && maybe_ne (TREE_INT_CST_LOW (DR_STEP (dr)),
6615 GET_MODE_SIZE (TYPE_MODE (vectype)))))
6616 return dr_explicit_realign;
6617 else
6618 return dr_explicit_realign_optimized;
6619 }
6620 if (!known_alignment_for_access_p (dr))
6621 is_packed = not_size_aligned (DR_REF (dr));
6622
6623 if (targetm.vectorize.support_vector_misalignment
6624 (mode, type, DR_MISALIGNMENT (dr), is_packed))
6625 /* Can't software pipeline the loads, but can at least do them.
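I.e. the explicit realignment scheme is not used here, but the target
handles the misaligned accesses directly.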
*/ 6626 return dr_unaligned_supported; 6627 } 6628 else 6629 { 6630 bool is_packed = false; 6631 tree type = (TREE_TYPE (DR_REF (dr))); 6632 6633 if (!known_alignment_for_access_p (dr)) 6634 is_packed = not_size_aligned (DR_REF (dr)); 6635 6636 if (targetm.vectorize.support_vector_misalignment 6637 (mode, type, DR_MISALIGNMENT (dr), is_packed)) 6638 return dr_unaligned_supported; 6639 } 6640 6641 /* Unsupported. */ 6642 return dr_unaligned_unsupported; 6643 } 6644