/* Data References Analysis and Manipulation Utilities for Vectorization.
   Copyright (C) 2003-2017 Free Software Foundation, Inc.
   Contributed by Dorit Naishlos <dorit@il.ibm.com>
   and Ira Rosen <irar@il.ibm.com>

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3, or (at your option) any later
version.

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */

#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "backend.h"
#include "target.h"
#include "rtl.h"
#include "tree.h"
#include "gimple.h"
#include "predict.h"
#include "memmodel.h"
#include "tm_p.h"
#include "ssa.h"
#include "optabs-tree.h"
#include "cgraph.h"
#include "dumpfile.h"
#include "alias.h"
#include "fold-const.h"
#include "stor-layout.h"
#include "tree-eh.h"
#include "gimplify.h"
#include "gimple-iterator.h"
#include "gimplify-me.h"
#include "tree-ssa-loop-ivopts.h"
#include "tree-ssa-loop-manip.h"
#include "tree-ssa-loop.h"
#include "cfgloop.h"
#include "tree-scalar-evolution.h"
#include "tree-vectorizer.h"
#include "expr.h"
#include "builtins.h"
#include "params.h"
#include "internal-fn.h"

/* Return true if load- or store-lanes optab OPTAB is implemented for
   COUNT vectors of type VECTYPE.  NAME is the name of OPTAB.  */

static bool
vect_lanes_optab_supported_p (const char *name, convert_optab optab,
                              tree vectype, unsigned HOST_WIDE_INT count)
{
  machine_mode mode, array_mode;
  bool limit_p;

  mode = TYPE_MODE (vectype);
  limit_p = !targetm.array_mode_supported_p (mode, count);
  array_mode = mode_for_size (count * GET_MODE_BITSIZE (mode),
                              MODE_INT, limit_p);

  if (array_mode == BLKmode)
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "no array mode for %s[" HOST_WIDE_INT_PRINT_DEC "]\n",
                         GET_MODE_NAME (mode), count);
      return false;
    }

  if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "cannot use %s<%s><%s>\n", name,
                         GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
      return false;
    }

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
                     "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode),
                     GET_MODE_NAME (mode));

  return true;
}


/* Return the smallest scalar part of STMT.
   This is used to determine the vectype of the stmt.  We generally set the
   vectype according to the type of the result (lhs).  For stmts whose
   result-type is different than the type of the arguments (e.g., demotion,
   promotion), vectype will be reset appropriately (later).  Note that we have
   to visit the smallest datatype in this function, because that determines the
   VF.  If the smallest datatype in the loop is present only as the rhs of a
   promotion operation - we'd miss it.
   Such a case, where a variable of this datatype does not appear in the lhs
   anywhere in the loop, can only occur if it's an invariant: e.g.:
   'int_x = (int) short_inv', which we'd expect to have been optimized away by
   invariant motion.  However, we cannot rely on invariant motion to always
   take invariants out of the loop, and so in the case of promotion we also
   have to check the rhs.
   LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
   types.  */

tree
vect_get_smallest_scalar_type (gimple *stmt, HOST_WIDE_INT *lhs_size_unit,
                               HOST_WIDE_INT *rhs_size_unit)
{
  tree scalar_type = gimple_expr_type (stmt);
  HOST_WIDE_INT lhs, rhs;

  lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));

  if (is_gimple_assign (stmt)
      && (gimple_assign_cast_p (stmt)
          || gimple_assign_rhs_code (stmt) == WIDEN_MULT_EXPR
          || gimple_assign_rhs_code (stmt) == WIDEN_LSHIFT_EXPR
          || gimple_assign_rhs_code (stmt) == FLOAT_EXPR))
    {
      tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (stmt));

      rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
      if (rhs < lhs)
        scalar_type = rhs_type;
    }
  else if (gcall *call = dyn_cast <gcall *> (stmt))
    {
      unsigned int i = 0;
      if (gimple_call_internal_p (call))
        {
          internal_fn ifn = gimple_call_internal_fn (call);
          if (internal_load_fn_p (ifn) || internal_store_fn_p (ifn))
            /* gimple_expr_type already picked the type of the loaded
               or stored data.  */
            i = ~0U;
          else if (internal_fn_mask_index (ifn) == 0)
            i = 1;
        }
      if (i < gimple_call_num_args (call))
        {
          tree rhs_type = TREE_TYPE (gimple_call_arg (call, i));
          if (tree_fits_uhwi_p (TYPE_SIZE_UNIT (rhs_type)))
            {
              rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
              if (rhs < lhs)
                scalar_type = rhs_type;
            }
        }
    }

  *lhs_size_unit = lhs;
  *rhs_size_unit = rhs;
  return scalar_type;
}


/* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
   tested at run-time.  Return TRUE if DDR was successfully inserted.
   Return false if versioning is not supported.  */

static bool
vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
{
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);

  if ((unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS) == 0)
    return false;

  if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_NOTE, vect_location,
                       "mark for run-time aliasing test between ");
      dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_A (ddr)));
      dump_printf (MSG_NOTE, " and ");
      dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_B (ddr)));
      dump_printf (MSG_NOTE, "\n");
    }

  if (optimize_loop_nest_for_size_p (loop))
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "versioning not supported when optimizing"
                         " for size.\n");
      return false;
    }

  /* FORNOW: We don't support versioning with outer-loop vectorization.  */
  if (loop->inner)
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "versioning not yet supported for outer-loops.\n");
      return false;
    }

  /* FORNOW: We don't support creating runtime alias tests for non-constant
     step.  */
  if (TREE_CODE (DR_STEP (DDR_A (ddr))) != INTEGER_CST
      || TREE_CODE (DR_STEP (DDR_B (ddr))) != INTEGER_CST)
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "versioning not yet supported for non-constant "
                         "step\n");
      return false;
    }

  LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
  return true;
}


/* Return true if we know that the order of vectorized STMT_A and
   vectorized STMT_B will be the same as the order of STMT_A and STMT_B.
   At least one of the statements is a write.  */

static bool
vect_preserves_scalar_order_p (gimple *stmt_a, gimple *stmt_b)
{
  stmt_vec_info stmtinfo_a = vinfo_for_stmt (stmt_a);
  stmt_vec_info stmtinfo_b = vinfo_for_stmt (stmt_b);

  /* Single statements are always kept in their original order.  */
  if (!STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
      && !STMT_VINFO_GROUPED_ACCESS (stmtinfo_b))
    return true;

  /* STMT_A and STMT_B belong to overlapping groups.  All loads in a
     group are emitted at the position of the last scalar load and all
     stores in a group are emitted at the position of the last scalar store.
     Compute that position and check whether the resulting order matches
     the current one.  */
  gimple *last_a = GROUP_FIRST_ELEMENT (stmtinfo_a);
  if (last_a)
    for (gimple *s = GROUP_NEXT_ELEMENT (vinfo_for_stmt (last_a)); s;
         s = GROUP_NEXT_ELEMENT (vinfo_for_stmt (s)))
      last_a = get_later_stmt (last_a, s);
  else
    last_a = stmt_a;
  gimple *last_b = GROUP_FIRST_ELEMENT (stmtinfo_b);
  if (last_b)
    for (gimple *s = GROUP_NEXT_ELEMENT (vinfo_for_stmt (last_b)); s;
         s = GROUP_NEXT_ELEMENT (vinfo_for_stmt (s)))
      last_b = get_later_stmt (last_b, s);
  else
    last_b = stmt_b;
  return ((get_later_stmt (last_a, last_b) == last_a)
          == (get_later_stmt (stmt_a, stmt_b) == stmt_a));
}


/* Function vect_analyze_data_ref_dependence.

   Return TRUE if there (might) exist a dependence between a memory-reference
   DRA and a memory-reference DRB.  When versioning for alias may check a
   dependence at run-time, return FALSE.  Adjust *MAX_VF according to
   the data dependence.  */

static bool
vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
                                  loop_vec_info loop_vinfo, int *max_vf)
{
  unsigned int i;
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  struct data_reference *dra = DDR_A (ddr);
  struct data_reference *drb = DDR_B (ddr);
  stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
  stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
  lambda_vector dist_v;
  unsigned int loop_depth;

  /* In loop analysis all data references should be vectorizable.  */
  if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
      || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
    gcc_unreachable ();

  /* Independent data accesses.  */
  if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
    return false;

  if (dra == drb
      || (DR_IS_READ (dra) && DR_IS_READ (drb)))
    return false;

  /* We do not have to consider dependences between accesses that belong
     to the same group.  */
  if (GROUP_FIRST_ELEMENT (stmtinfo_a)
      && GROUP_FIRST_ELEMENT (stmtinfo_a) == GROUP_FIRST_ELEMENT (stmtinfo_b))
    return false;

  /* Even if we have an anti-dependence then, as the vectorized loop covers at
     least two scalar iterations, there is always also a true dependence.
     As the vectorizer does not re-order loads and stores we can ignore
     the anti-dependence if TBAA can disambiguate both DRs similar to the
     case with known negative distance anti-dependences (positive
     distance anti-dependences would violate TBAA constraints).  */
  if (((DR_IS_READ (dra) && DR_IS_WRITE (drb))
       || (DR_IS_WRITE (dra) && DR_IS_READ (drb)))
      && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)),
                                 get_alias_set (DR_REF (drb))))
    return false;

  /* Unknown data dependence.  */
  if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
    {
      /* If user asserted safelen consecutive iterations can be
         executed concurrently, assume independence.  */
      if (loop->safelen >= 2)
        {
          if (loop->safelen < *max_vf)
            *max_vf = loop->safelen;
          LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
          return false;
        }

      if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
          || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
        {
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                               "versioning for alias not supported for: "
                               "can't determine dependence between ");
              dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                 DR_REF (dra));
              dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
              dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                 DR_REF (drb));
              dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
            }
          return true;
        }

      if (dump_enabled_p ())
        {
          dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                           "versioning for alias required: "
                           "can't determine dependence between ");
          dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                             DR_REF (dra));
          dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
          dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                             DR_REF (drb));
          dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
        }

      /* Add to list of ddrs that need to be tested at run-time.  */
      return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
    }

  /* Known data dependence.  */
  if (DDR_NUM_DIST_VECTS (ddr) == 0)
    {
      /* If user asserted safelen consecutive iterations can be
         executed concurrently, assume independence.  */
      if (loop->safelen >= 2)
        {
          if (loop->safelen < *max_vf)
            *max_vf = loop->safelen;
          LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
          return false;
        }

      if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
          || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
        {
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                               "versioning for alias not supported for: "
                               "bad dist vector for ");
              dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                 DR_REF (dra));
              dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
              dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                 DR_REF (drb));
              dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
            }
          return true;
        }

      if (dump_enabled_p ())
        {
          dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                           "versioning for alias required: "
                           "bad dist vector for ");
          dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
          dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
          dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
          dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
        }
      /* Add to list of ddrs that need to be tested at run-time.  */
      return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
    }

  loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
  FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
    {
      int dist = dist_v[loop_depth];

      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location,
                         "dependence distance = %d.\n", dist);

      if (dist == 0)
        {
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_NOTE, vect_location,
                               "dependence distance == 0 between ");
              dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
              dump_printf (MSG_NOTE, " and ");
              dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
              dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
            }

          /* When we perform grouped accesses and perform implicit CSE
             by detecting equal accesses and doing disambiguation with
             runtime alias tests like for
                .. = a[i];
                .. = a[i+1];
                a[i] = ..;
                a[i+1] = ..;
                *p = ..;
                .. = a[i];
                .. = a[i+1];
             where we will end up loading { a[i], a[i+1] } once, make
             sure that inserting group loads before the first load and
             stores after the last store will do the right thing.
             Similar for groups like
                a[i] = ...;
                ... = a[i];
                a[i+1] = ...;
             where loads from the group interleave with the store.  */
          if (!vect_preserves_scalar_order_p (DR_STMT (dra), DR_STMT (drb)))
            {
              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "READ_WRITE dependence in interleaving."
                                 "\n");
              return true;
            }

          unsigned int step_prec = TYPE_PRECISION (TREE_TYPE (DR_STEP (dra)));
          if (loop->safelen < 2
              && !expr_not_equal_to (DR_STEP (dra), wi::zero (step_prec)))
            {
              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "step could be zero.\n");
              return true;
            }

          continue;
        }

      if (dist > 0 && DDR_REVERSED_P (ddr))
        {
          /* If DDR_REVERSED_P the order of the data-refs in DDR was
             reversed (to make distance vector positive), and the actual
             distance is negative.  */
          if (dump_enabled_p ())
            dump_printf_loc (MSG_NOTE, vect_location,
                             "dependence distance negative.\n");
          /* When doing outer loop vectorization, we need to check if there is
             a backward dependence at the inner loop level if the dependence
             at the outer loop is reversed.  See PR81740.  */
          if (nested_in_vect_loop_p (loop, DR_STMT (dra))
              || nested_in_vect_loop_p (loop, DR_STMT (drb)))
            {
              unsigned inner_depth = index_in_loop_nest (loop->inner->num,
                                                         DDR_LOOP_NEST (ddr));
              if (dist_v[inner_depth] < 0)
                return true;
            }
          /* Record a negative dependence distance to later limit the
             amount of stmt copying / unrolling we can perform.
             Only need to handle read-after-write dependence.  */
          if (DR_IS_READ (drb)
              && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0
                  || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist))
            STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist;
          continue;
        }

      if (abs (dist) >= 2
          && abs (dist) < *max_vf)
        {
          /* The dependence distance requires reduction of the maximal
             vectorization factor.  */
          *max_vf = abs (dist);
          if (dump_enabled_p ())
            dump_printf_loc (MSG_NOTE, vect_location,
                             "adjusting maximal vectorization factor to %i\n",
                             *max_vf);
        }

      if (abs (dist) >= *max_vf)
        {
          /* Dependence distance does not create dependence, as far as
             vectorization is concerned, in this case.  */
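          /* For example, with a dependence distance of 8 and a maximal
             vectorization factor of 4, one vector iteration only spans 4
             consecutive scalar iterations, so the two dependent accesses
             can never be combined into the same vector iteration.  */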
          if (dump_enabled_p ())
            dump_printf_loc (MSG_NOTE, vect_location,
                             "dependence distance >= VF.\n");
          continue;
        }

      if (dump_enabled_p ())
        {
          dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                           "not vectorized, possible dependence "
                           "between data-refs ");
          dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
          dump_printf (MSG_NOTE, " and ");
          dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
          dump_printf (MSG_NOTE, "\n");
        }

      return true;
    }

  return false;
}

/* Function vect_analyze_data_ref_dependences.

   Examine all the data references in the loop, and make sure there do not
   exist any data dependences between them.  Set *MAX_VF according to
   the maximum vectorization factor the data dependences allow.  */

bool
vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo, int *max_vf)
{
  unsigned int i;
  struct data_dependence_relation *ddr;

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
                     "=== vect_analyze_data_ref_dependences ===\n");

  LOOP_VINFO_DDRS (loop_vinfo)
    .create (LOOP_VINFO_DATAREFS (loop_vinfo).length ()
             * LOOP_VINFO_DATAREFS (loop_vinfo).length ());
  LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true;
  /* We need read-read dependences to compute STMT_VINFO_SAME_ALIGN_REFS.  */
  if (!compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
                                &LOOP_VINFO_DDRS (loop_vinfo),
                                LOOP_VINFO_LOOP_NEST (loop_vinfo), true))
    return false;

  /* For epilogues we either have no aliases or alias versioning
     was applied to original loop.  Therefore we may just get max_vf
     using VF of original loop.  */
  if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
    *max_vf = LOOP_VINFO_ORIG_VECT_FACTOR (loop_vinfo);
  else
    FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
      if (vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf))
        return false;

  return true;
}


/* Function vect_slp_analyze_data_ref_dependence.

   Return TRUE if there (might) exist a dependence between a memory-reference
   DRA and a memory-reference DRB.  When versioning for alias may check a
   dependence at run-time, return FALSE.  Adjust *MAX_VF according to
   the data dependence.  */

static bool
vect_slp_analyze_data_ref_dependence (struct data_dependence_relation *ddr)
{
  struct data_reference *dra = DDR_A (ddr);
  struct data_reference *drb = DDR_B (ddr);

  /* We need to check dependences of statements marked as unvectorizable
     as well, they still can prohibit vectorization.  */

  /* Independent data accesses.  */
  if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
    return false;

  if (dra == drb)
    return false;

  /* Read-read is OK.  */
  if (DR_IS_READ (dra) && DR_IS_READ (drb))
    return false;

  /* If dra and drb are part of the same interleaving chain consider
     them independent.  */
  if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (DR_STMT (dra)))
      && (GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dra)))
          == GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (drb)))))
    return false;

  /* Unknown data dependence.  */
  if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
    {
      if (dump_enabled_p ())
        {
          dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                           "can't determine dependence between ");
          dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
          dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
          dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
          dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
        }
    }
  else if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_NOTE, vect_location,
                       "determined dependence between ");
      dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
      dump_printf (MSG_NOTE, " and ");
      dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
      dump_printf (MSG_NOTE, "\n");
    }

  return true;
}


/* Analyze dependences involved in the transform of SLP NODE.  STORES
   contain the vector of scalar stores of this instance if we are
   disambiguating the loads.  */

static bool
vect_slp_analyze_node_dependences (slp_instance instance, slp_tree node,
                                   vec<gimple *> stores, gimple *last_store)
{
  /* This walks over all stmts involved in the SLP load/store done
     in NODE verifying we can sink them up to the last stmt in the
     group.  */
  gimple *last_access = vect_find_last_scalar_stmt_in_slp (node);
  for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
    {
      gimple *access = SLP_TREE_SCALAR_STMTS (node)[k];
      if (access == last_access)
        continue;
      data_reference *dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (access));
      for (gimple_stmt_iterator gsi = gsi_for_stmt (access);
           gsi_stmt (gsi) != last_access; gsi_next (&gsi))
        {
          gimple *stmt = gsi_stmt (gsi);
          if (! gimple_vuse (stmt)
              || (DR_IS_READ (dr_a) && ! gimple_vdef (stmt)))
            continue;

          /* If we couldn't record a (single) data reference for this
             stmt we have to give up.  */
          /* ??? Here and below if dependence analysis fails we can resort
             to the alias oracle which can handle more kinds of stmts.  */
          data_reference *dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt));
          if (!dr_b)
            return false;

          bool dependent = false;
          /* If we run into a store of this same instance (we've just
             marked those) then delay dependence checking until we run
             into the last store because this is where it will have
             been sunk to (and we verify if we can do that as well).  */
          if (gimple_visited_p (stmt))
            {
              if (stmt != last_store)
                continue;
              unsigned i;
              gimple *store;
              FOR_EACH_VEC_ELT (stores, i, store)
                {
                  data_reference *store_dr
                    = STMT_VINFO_DATA_REF (vinfo_for_stmt (store));
                  ddr_p ddr = initialize_data_dependence_relation
                                (dr_a, store_dr, vNULL);
                  dependent = vect_slp_analyze_data_ref_dependence (ddr);
                  free_dependence_relation (ddr);
                  if (dependent)
                    break;
                }
            }
          else
            {
              ddr_p ddr = initialize_data_dependence_relation (dr_a,
                                                               dr_b, vNULL);
              dependent = vect_slp_analyze_data_ref_dependence (ddr);
              free_dependence_relation (ddr);
            }
          if (dependent)
            return false;
        }
    }
  return true;
}


/* Function vect_slp_analyze_instance_dependence.

   Examine all the data references in the basic-block, and make sure there
   do not exist any data dependences between them.  */
bool
vect_slp_analyze_instance_dependence (slp_instance instance)
{
  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
                     "=== vect_slp_analyze_instance_dependence ===\n");

  /* The stores of this instance are at the root of the SLP tree.  */
  slp_tree store = SLP_INSTANCE_TREE (instance);
  if (! STMT_VINFO_DATA_REF (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (store)[0])))
    store = NULL;

  /* Verify we can sink stores to the vectorized stmt insert location.  */
  gimple *last_store = NULL;
  if (store)
    {
      if (! vect_slp_analyze_node_dependences (instance, store, vNULL, NULL))
        return false;

      /* Mark stores in this instance and remember the last one.  */
      last_store = vect_find_last_scalar_stmt_in_slp (store);
      for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
        gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k], true);
    }

  bool res = true;

  /* Verify we can sink loads to the vectorized stmt insert location,
     special-casing stores of this instance.  */
  slp_tree load;
  unsigned int i;
  FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load)
    if (! vect_slp_analyze_node_dependences (instance, load,
                                             store
                                             ? SLP_TREE_SCALAR_STMTS (store)
                                             : vNULL, last_store))
      {
        res = false;
        break;
      }

  /* Unset the visited flag.  */
  if (store)
    for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
      gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k], false);

  return res;
}

/* Function vect_compute_data_ref_alignment

   Compute the misalignment of the data reference DR.

   Output:
   1. If during the misalignment computation it is found that the data
      reference cannot be vectorized then false is returned.
   2. DR_MISALIGNMENT (DR) is defined.

   FOR NOW: No analysis is actually performed.  Misalignment is calculated
   only for trivial cases.  TODO.  */

bool
vect_compute_data_ref_alignment (struct data_reference *dr)
{
  gimple *stmt = DR_STMT (dr);
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
  struct loop *loop = NULL;
  tree ref = DR_REF (dr);
  tree vectype;
  tree base, base_addr;
  tree misalign = NULL_TREE;
  tree aligned_to;
  tree step;
  unsigned HOST_WIDE_INT alignment;

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
                     "vect_compute_data_ref_alignment:\n");

  if (loop_vinfo)
    loop = LOOP_VINFO_LOOP (loop_vinfo);

  /* Initialize misalignment to unknown.  */
  SET_DR_MISALIGNMENT (dr, -1);

  if (tree_fits_shwi_p (DR_STEP (dr)))
    misalign = DR_INIT (dr);
  aligned_to = DR_ALIGNED_TO (dr);
  base_addr = DR_BASE_ADDRESS (dr);
  vectype = STMT_VINFO_VECTYPE (stmt_info);

  /* In case the dataref is in an inner-loop of the loop that is being
     vectorized (LOOP), we use the base and misalignment information
     relative to the outer-loop (LOOP).  This is ok only if the misalignment
     stays the same throughout the execution of the inner-loop, which is why
     we have to check that the stride of the dataref in the inner-loop evenly
     divides by the vector size.  */
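  /* For example, with a 16-byte vector type an inner-loop step of 32 bytes
     keeps the misalignment of the access the same in every inner-loop
     iteration, whereas a step of 20 bytes would not.  */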
  if (loop && nested_in_vect_loop_p (loop, stmt))
    {
      tree step = DR_STEP (dr);

      if (tree_fits_shwi_p (step)
          && tree_to_shwi (step) % GET_MODE_SIZE (TYPE_MODE (vectype)) == 0)
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_NOTE, vect_location,
                             "inner step divides the vector-size.\n");
          misalign = STMT_VINFO_DR_INIT (stmt_info);
          aligned_to = STMT_VINFO_DR_ALIGNED_TO (stmt_info);
          base_addr = STMT_VINFO_DR_BASE_ADDRESS (stmt_info);
        }
      else
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "inner step doesn't divide the vector-size.\n");
          misalign = NULL_TREE;
        }
    }

  /* Similarly we can only use base and misalignment information relative to
     an innermost loop if the misalignment stays the same throughout the
     execution of the loop.  As above, this is the case if the stride of
     the dataref evenly divides by the vector size.  */
  else
    {
      tree step = DR_STEP (dr);
      unsigned vf = loop ? LOOP_VINFO_VECT_FACTOR (loop_vinfo) : 1;

      if (tree_fits_shwi_p (step)
          && ((tree_to_shwi (step) * vf)
              % GET_MODE_SIZE (TYPE_MODE (vectype)) != 0))
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "step doesn't divide the vector-size.\n");
          misalign = NULL_TREE;
        }
    }

  /* To look at alignment of the base we have to preserve an inner MEM_REF
     as that carries alignment information of the actual access.  */
  base = ref;
  while (handled_component_p (base))
    base = TREE_OPERAND (base, 0);
  unsigned int base_alignment = 0;
  unsigned HOST_WIDE_INT base_bitpos;
  get_object_alignment_1 (base, &base_alignment, &base_bitpos);
  /* As data-ref analysis strips the MEM_REF down to its base operand
     to form DR_BASE_ADDRESS and adds the offset to DR_INIT we have to
     adjust things to make base_alignment valid as the alignment of
     DR_BASE_ADDRESS.  */
  if (TREE_CODE (base) == MEM_REF)
    {
      /* Note all this only works if DR_BASE_ADDRESS is the same as
         MEM_REF operand zero, otherwise DR/SCEV analysis might have factored
         in other offsets.  We need to rework DR to compute the alignment
         of DR_BASE_ADDRESS as long as all information is still available.  */
      if (operand_equal_p (TREE_OPERAND (base, 0), base_addr, 0))
        {
          base_bitpos -= mem_ref_offset (base).to_short_addr () * BITS_PER_UNIT;
          base_bitpos &= (base_alignment - 1);
        }
      else
        base_bitpos = BITS_PER_UNIT;
    }
  if (base_bitpos != 0)
    base_alignment = base_bitpos & -base_bitpos;
  /* Also look at the alignment of the base address DR analysis
     computed.  */
  unsigned int base_addr_alignment = get_pointer_alignment (base_addr);
  if (base_addr_alignment > base_alignment)
    base_alignment = base_addr_alignment;

  if (base_alignment >= TYPE_ALIGN (TREE_TYPE (vectype)))
    DR_VECT_AUX (dr)->base_element_aligned = true;

  alignment = TYPE_ALIGN_UNIT (vectype);

  if ((compare_tree_int (aligned_to, alignment) < 0)
      || !misalign)
    {
      if (dump_enabled_p ())
        {
          dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                           "Unknown alignment for access: ");
          dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
          dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
        }
      return true;
    }

  if (base_alignment < TYPE_ALIGN (vectype))
    {
      base = base_addr;
      if (TREE_CODE (base) == ADDR_EXPR)
        base = TREE_OPERAND (base, 0);
      if (!vect_can_force_dr_alignment_p (base, TYPE_ALIGN (vectype)))
        {
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_NOTE, vect_location,
                               "can't force alignment of ref: ");
              dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
              dump_printf (MSG_NOTE, "\n");
            }
          return true;
        }

      if (DECL_USER_ALIGN (base))
        {
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_NOTE, vect_location,
                               "not forcing alignment of user-aligned "
                               "variable: ");
              dump_generic_expr (MSG_NOTE, TDF_SLIM, base);
              dump_printf (MSG_NOTE, "\n");
            }
          return true;
        }

      /* Force the alignment of the decl.
         NOTE: This is the only change to the code we make during
         the analysis phase, before deciding to vectorize the loop.  */
      if (dump_enabled_p ())
        {
          dump_printf_loc (MSG_NOTE, vect_location, "force alignment of ");
          dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
          dump_printf (MSG_NOTE, "\n");
        }

      DR_VECT_AUX (dr)->base_decl = base;
      DR_VECT_AUX (dr)->base_misaligned = true;
      DR_VECT_AUX (dr)->base_element_aligned = true;
    }

  if (loop && nested_in_vect_loop_p (loop, stmt))
    step = STMT_VINFO_DR_STEP (stmt_info);
  else
    step = DR_STEP (dr);
  /* If this is a backward running DR then first access in the larger
     vectype actually is N-1 elements before the address in the DR.
     Adjust misalign accordingly.  */
  if (tree_int_cst_sgn (step) < 0)
    {
      tree offset = ssize_int (TYPE_VECTOR_SUBPARTS (vectype) - 1);
      /* DR_STEP(dr) is the same as -TYPE_SIZE of the scalar type,
         otherwise we wouldn't be here.  */
      offset = fold_build2 (MULT_EXPR, ssizetype, offset, step);
      /* PLUS because STEP was negative.  */
      misalign = size_binop (PLUS_EXPR, misalign, offset);
    }

  SET_DR_MISALIGNMENT (dr,
                       wi::mod_floor (misalign, alignment, SIGNED).to_uhwi ());

  if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                       "misalign = %d bytes of ref ", DR_MISALIGNMENT (dr));
      dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
      dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
    }

  return true;
}


/* Function vect_update_misalignment_for_peel

   DR - the data reference whose misalignment is to be adjusted.
   DR_PEEL - the data reference whose misalignment is being made
             zero in the vector loop by the peel.
   NPEEL - the number of iterations in the peel loop if the misalignment
           of DR_PEEL is known at compile time.  */
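/* For example, peeling NPEEL scalar iterations advances DR by NPEEL times
   its element size, so a compile-time-known misalignment is bumped by that
   amount modulo the vector alignment, while a DR known to have the same
   alignment as DR_PEEL simply becomes aligned (misalignment 0).  */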
static void
vect_update_misalignment_for_peel (struct data_reference *dr,
                                   struct data_reference *dr_peel, int npeel)
{
  unsigned int i;
  vec<dr_p> same_align_drs;
  struct data_reference *current_dr;
  int dr_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr))));
  int dr_peel_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr_peel))));
  stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
  stmt_vec_info peel_stmt_info = vinfo_for_stmt (DR_STMT (dr_peel));

  /* For interleaved data accesses the step in the loop must be multiplied by
     the size of the interleaving group.  */
  if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
    dr_size *= GROUP_SIZE (vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info)));
  if (STMT_VINFO_GROUPED_ACCESS (peel_stmt_info))
    dr_peel_size *= GROUP_SIZE (peel_stmt_info);

  /* It can be assumed that the data refs with the same alignment as dr_peel
     are aligned in the vector loop.  */
  same_align_drs
    = STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (DR_STMT (dr_peel)));
  FOR_EACH_VEC_ELT (same_align_drs, i, current_dr)
    {
      if (current_dr != dr)
        continue;
      gcc_assert (DR_MISALIGNMENT (dr) / dr_size ==
                  DR_MISALIGNMENT (dr_peel) / dr_peel_size);
      SET_DR_MISALIGNMENT (dr, 0);
      return;
    }

  if (known_alignment_for_access_p (dr)
      && known_alignment_for_access_p (dr_peel))
    {
      bool negative = tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0;
      int misal = DR_MISALIGNMENT (dr);
      tree vectype = STMT_VINFO_VECTYPE (stmt_info);
      misal += negative ? -npeel * dr_size : npeel * dr_size;
      misal &= (TYPE_ALIGN (vectype) / BITS_PER_UNIT) - 1;
      SET_DR_MISALIGNMENT (dr, misal);
      return;
    }

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment to -1.\n");
  SET_DR_MISALIGNMENT (dr, -1);
}


/* Function verify_data_ref_alignment

   Return TRUE if DR can be handled with respect to alignment.  */

static bool
verify_data_ref_alignment (data_reference_p dr)
{
  enum dr_alignment_support supportable_dr_alignment
    = vect_supportable_dr_alignment (dr, false);
  if (!supportable_dr_alignment)
    {
      if (dump_enabled_p ())
        {
          if (DR_IS_READ (dr))
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "not vectorized: unsupported unaligned load.");
          else
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "not vectorized: unsupported unaligned "
                             "store.");

          dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                             DR_REF (dr));
          dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
        }
      return false;
    }

  if (supportable_dr_alignment != dr_aligned && dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
                     "Vectorizing an unaligned access.\n");

  return true;
}

/* Function vect_verify_datarefs_alignment

   Return TRUE if all data references in the loop can be
   handled with respect to alignment.  */
bool
vect_verify_datarefs_alignment (loop_vec_info vinfo)
{
  vec<data_reference_p> datarefs = vinfo->datarefs;
  struct data_reference *dr;
  unsigned int i;

  FOR_EACH_VEC_ELT (datarefs, i, dr)
    {
      gimple *stmt = DR_STMT (dr);
      stmt_vec_info stmt_info = vinfo_for_stmt (stmt);

      if (!STMT_VINFO_RELEVANT_P (stmt_info))
        continue;

      /* For interleaving, only the alignment of the first access matters.  */
      if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
          && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
        continue;

      /* Strided accesses perform only component accesses, alignment is
         irrelevant for them.  */
      if (STMT_VINFO_STRIDED_P (stmt_info)
          && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
        continue;

      if (! verify_data_ref_alignment (dr))
        return false;
    }

  return true;
}

/* Given a memory reference EXP return whether its alignment is less
   than its size.  */

static bool
not_size_aligned (tree exp)
{
  if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp))))
    return true;

  return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp)))
          > get_object_alignment (exp));
}

/* Function vector_alignment_reachable_p

   Return true if vector alignment for DR is reachable by peeling
   a few loop iterations.  Return false otherwise.  */

static bool
vector_alignment_reachable_p (struct data_reference *dr)
{
  gimple *stmt = DR_STMT (dr);
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  tree vectype = STMT_VINFO_VECTYPE (stmt_info);

  if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
    {
      /* For interleaved access we peel only if number of iterations in
         the prolog loop ({VF - misalignment}), is a multiple of the
         number of the interleaved accesses.  */
      int elem_size, mis_in_elements;
      int nelements = TYPE_VECTOR_SUBPARTS (vectype);

      /* FORNOW: handle only known alignment.  */
      if (!known_alignment_for_access_p (dr))
        return false;

      elem_size = GET_MODE_SIZE (TYPE_MODE (vectype)) / nelements;
      mis_in_elements = DR_MISALIGNMENT (dr) / elem_size;

      if ((nelements - mis_in_elements) % GROUP_SIZE (stmt_info))
        return false;
    }

  /* If misalignment is known at the compile time then allow peeling
     only if natural alignment is reachable through peeling.  */
  if (known_alignment_for_access_p (dr) && !aligned_access_p (dr))
    {
      HOST_WIDE_INT elmsize =
                int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
      if (dump_enabled_p ())
        {
          dump_printf_loc (MSG_NOTE, vect_location,
                           "data size =" HOST_WIDE_INT_PRINT_DEC, elmsize);
          dump_printf (MSG_NOTE,
                       ". misalignment = %d.\n", DR_MISALIGNMENT (dr));
        }
      if (DR_MISALIGNMENT (dr) % elmsize)
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "data size does not divide the misalignment.\n");
          return false;
        }
    }

  if (!known_alignment_for_access_p (dr))
    {
      tree type = TREE_TYPE (DR_REF (dr));
      bool is_packed = not_size_aligned (DR_REF (dr));
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "Unknown misalignment, %snaturally aligned\n",
                         is_packed ? "not " : "");
"not " : ""); 1190 return targetm.vectorize.vector_alignment_reachable (type, is_packed); 1191 } 1192 1193 return true; 1194 } 1195 1196 1197 /* Calculate the cost of the memory access represented by DR. */ 1198 1199 static void 1200 vect_get_data_access_cost (struct data_reference *dr, 1201 unsigned int *inside_cost, 1202 unsigned int *outside_cost, 1203 stmt_vector_for_cost *body_cost_vec) 1204 { 1205 gimple *stmt = DR_STMT (dr); 1206 stmt_vec_info stmt_info = vinfo_for_stmt (stmt); 1207 int nunits = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)); 1208 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); 1209 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo); 1210 int ncopies = vf / nunits; 1211 1212 if (DR_IS_READ (dr)) 1213 vect_get_load_cost (dr, ncopies, true, inside_cost, outside_cost, 1214 NULL, body_cost_vec, false); 1215 else 1216 vect_get_store_cost (dr, ncopies, inside_cost, body_cost_vec); 1217 1218 if (dump_enabled_p ()) 1219 dump_printf_loc (MSG_NOTE, vect_location, 1220 "vect_get_data_access_cost: inside_cost = %d, " 1221 "outside_cost = %d.\n", *inside_cost, *outside_cost); 1222 } 1223 1224 1225 typedef struct _vect_peel_info 1226 { 1227 struct data_reference *dr; 1228 int npeel; 1229 unsigned int count; 1230 } *vect_peel_info; 1231 1232 typedef struct _vect_peel_extended_info 1233 { 1234 struct _vect_peel_info peel_info; 1235 unsigned int inside_cost; 1236 unsigned int outside_cost; 1237 stmt_vector_for_cost body_cost_vec; 1238 } *vect_peel_extended_info; 1239 1240 1241 /* Peeling hashtable helpers. */ 1242 1243 struct peel_info_hasher : free_ptr_hash <_vect_peel_info> 1244 { 1245 static inline hashval_t hash (const _vect_peel_info *); 1246 static inline bool equal (const _vect_peel_info *, const _vect_peel_info *); 1247 }; 1248 1249 inline hashval_t 1250 peel_info_hasher::hash (const _vect_peel_info *peel_info) 1251 { 1252 return (hashval_t) peel_info->npeel; 1253 } 1254 1255 inline bool 1256 peel_info_hasher::equal (const _vect_peel_info *a, const _vect_peel_info *b) 1257 { 1258 return (a->npeel == b->npeel); 1259 } 1260 1261 1262 /* Insert DR into peeling hash table with NPEEL as key. */ 1263 1264 static void 1265 vect_peeling_hash_insert (hash_table<peel_info_hasher> *peeling_htab, 1266 loop_vec_info loop_vinfo, struct data_reference *dr, 1267 int npeel) 1268 { 1269 struct _vect_peel_info elem, *slot; 1270 _vect_peel_info **new_slot; 1271 bool supportable_dr_alignment = vect_supportable_dr_alignment (dr, true); 1272 1273 elem.npeel = npeel; 1274 slot = peeling_htab->find (&elem); 1275 if (slot) 1276 slot->count++; 1277 else 1278 { 1279 slot = XNEW (struct _vect_peel_info); 1280 slot->npeel = npeel; 1281 slot->dr = dr; 1282 slot->count = 1; 1283 new_slot = peeling_htab->find_slot (slot, INSERT); 1284 *new_slot = slot; 1285 } 1286 1287 if (!supportable_dr_alignment 1288 && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo))) 1289 slot->count += VECT_MAX_COST; 1290 } 1291 1292 1293 /* Traverse peeling hash table to find peeling option that aligns maximum 1294 number of data accesses. 
int
vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
                                     _vect_peel_extended_info *max)
{
  vect_peel_info elem = *slot;

  if (elem->count > max->peel_info.count
      || (elem->count == max->peel_info.count
          && max->peel_info.npeel > elem->npeel))
    {
      max->peel_info.npeel = elem->npeel;
      max->peel_info.count = elem->count;
      max->peel_info.dr = elem->dr;
    }

  return 1;
}


/* Traverse peeling hash table and calculate cost for each peeling option.
   Find the one with the lowest cost.  */

int
vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
                                   _vect_peel_extended_info *min)
{
  vect_peel_info elem = *slot;
  int save_misalignment, dummy;
  unsigned int inside_cost = 0, outside_cost = 0, i;
  gimple *stmt = DR_STMT (elem->dr);
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
  vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
  struct data_reference *dr;
  stmt_vector_for_cost prologue_cost_vec, body_cost_vec, epilogue_cost_vec;

  prologue_cost_vec.create (2);
  body_cost_vec.create (2);
  epilogue_cost_vec.create (2);

  FOR_EACH_VEC_ELT (datarefs, i, dr)
    {
      stmt = DR_STMT (dr);
      stmt_info = vinfo_for_stmt (stmt);
      /* For interleaving, only the alignment of the first access
         matters.  */
      if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
          && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
        continue;

      /* Strided accesses perform only component accesses, alignment is
         irrelevant for them.  */
      if (STMT_VINFO_STRIDED_P (stmt_info)
          && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
        continue;

      save_misalignment = DR_MISALIGNMENT (dr);
      vect_update_misalignment_for_peel (dr, elem->dr, elem->npeel);
      vect_get_data_access_cost (dr, &inside_cost, &outside_cost,
                                 &body_cost_vec);
      SET_DR_MISALIGNMENT (dr, save_misalignment);
    }

  outside_cost += vect_get_known_peeling_cost
    (loop_vinfo, elem->npeel, &dummy,
     &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
     &prologue_cost_vec, &epilogue_cost_vec);

  /* Prologue and epilogue costs are added to the target model later.
     These costs depend only on the scalar iteration cost, the
     number of peeling iterations finally chosen, and the number of
     misaligned statements.  So discard the information found here.  */
  prologue_cost_vec.release ();
  epilogue_cost_vec.release ();

  if (inside_cost < min->inside_cost
      || (inside_cost == min->inside_cost && outside_cost < min->outside_cost))
    {
      min->inside_cost = inside_cost;
      min->outside_cost = outside_cost;
      min->body_cost_vec.release ();
      min->body_cost_vec = body_cost_vec;
      min->peel_info.dr = elem->dr;
      min->peel_info.npeel = elem->npeel;
    }
  else
    body_cost_vec.release ();

  return 1;
}


/* Choose best peeling option by traversing peeling hash table and either
   choosing an option with the lowest cost (if cost model is enabled) or the
   option that aligns as many accesses as possible.  */
static struct data_reference *
vect_peeling_hash_choose_best_peeling (hash_table<peel_info_hasher> *peeling_htab,
                                       loop_vec_info loop_vinfo,
                                       unsigned int *npeel,
                                       stmt_vector_for_cost *body_cost_vec)
{
  struct _vect_peel_extended_info res;

  res.peel_info.dr = NULL;
  res.body_cost_vec = stmt_vector_for_cost ();

  if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
    {
      res.inside_cost = INT_MAX;
      res.outside_cost = INT_MAX;
      peeling_htab->traverse <_vect_peel_extended_info *,
                              vect_peeling_hash_get_lowest_cost> (&res);
    }
  else
    {
      res.peel_info.count = 0;
      peeling_htab->traverse <_vect_peel_extended_info *,
                              vect_peeling_hash_get_most_frequent> (&res);
    }

  *npeel = res.peel_info.npeel;
  *body_cost_vec = res.body_cost_vec;
  return res.peel_info.dr;
}


/* Function vect_enhance_data_refs_alignment

   This pass will use loop versioning and loop peeling in order to enhance
   the alignment of data references in the loop.

   FOR NOW: we assume that whatever versioning/peeling takes place, only the
   original loop is to be vectorized.  Any other loops that are created by
   the transformations performed in this pass - are not supposed to be
   vectorized.  This restriction will be relaxed.

   This pass will require a cost model to guide it whether to apply peeling
   or versioning or a combination of the two.  For example, the scheme that
   Intel uses when given a loop with several memory accesses, is as follows:
   choose one memory access ('p') whose alignment you want to force by doing
   peeling.  Then, either (1) generate a loop in which 'p' is aligned and all
   other accesses are not necessarily aligned, or (2) use loop versioning to
   generate one loop in which all accesses are aligned, and another loop in
   which only 'p' is necessarily aligned.

   ("Automatic Intra-Register Vectorization for the Intel Architecture",
   Aart J.C. Bik, Milind Girkar, Paul M. Grey and Ximmin Tian, International
   Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)

   Devising a cost model is the most critical aspect of this work.  It will
   guide us on which access to peel for, whether to use loop versioning, how
   many versions to create, etc.  The cost model will probably consist of
   generic considerations as well as target specific considerations (on
   powerpc for example, misaligned stores are more painful than misaligned
   loads).

   Here are the general steps involved in alignment enhancements:

     -- original loop, before alignment analysis:
        for (i=0; i<N; i++){
          x = q[i];                     # DR_MISALIGNMENT(q) = unknown
          p[i] = y;                     # DR_MISALIGNMENT(p) = unknown
        }

     -- After vect_compute_data_refs_alignment:
        for (i=0; i<N; i++){
          x = q[i];                     # DR_MISALIGNMENT(q) = 3
          p[i] = y;                     # DR_MISALIGNMENT(p) = unknown
        }

     -- Possibility 1: we do loop versioning:
     if (p is aligned) {
        for (i=0; i<N; i++){            # loop 1A
          x = q[i];                     # DR_MISALIGNMENT(q) = 3
          p[i] = y;                     # DR_MISALIGNMENT(p) = 0
        }
     }
     else {
        for (i=0; i<N; i++){            # loop 1B
          x = q[i];                     # DR_MISALIGNMENT(q) = 3
          p[i] = y;                     # DR_MISALIGNMENT(p) = unaligned
        }
     }

     -- Possibility 2: we do loop peeling:
     for (i = 0; i < 3; i++){           # (scalar loop, not to be vectorized).
        x = q[i];
        p[i] = y;
     }
     for (i = 3; i < N; i++){           # loop 2A
        x = q[i];                       # DR_MISALIGNMENT(q) = 0
        p[i] = y;                       # DR_MISALIGNMENT(p) = unknown
     }

     -- Possibility 3: combination of loop peeling and versioning:
     for (i = 0; i < 3; i++){           # (scalar loop, not to be vectorized).
        x = q[i];
        p[i] = y;
     }
     if (p is aligned) {
        for (i = 3; i<N; i++){          # loop 3A
          x = q[i];                     # DR_MISALIGNMENT(q) = 0
          p[i] = y;                     # DR_MISALIGNMENT(p) = 0
        }
     }
     else {
        for (i = 3; i<N; i++){          # loop 3B
          x = q[i];                     # DR_MISALIGNMENT(q) = 0
          p[i] = y;                     # DR_MISALIGNMENT(p) = unaligned
        }
     }

   These loops are later passed to loop_transform to be vectorized.  The
   vectorizer will use the alignment information to guide the transformation
   (whether to generate regular loads/stores, or with special handling for
   misalignment).  */

bool
vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
{
  vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  enum dr_alignment_support supportable_dr_alignment;
  struct data_reference *dr0 = NULL, *first_store = NULL;
  struct data_reference *dr;
  unsigned int i, j;
  bool do_peeling = false;
  bool do_versioning = false;
  bool stat;
  gimple *stmt;
  stmt_vec_info stmt_info;
  unsigned int npeel = 0;
  bool all_misalignments_unknown = true;
  unsigned int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
  unsigned possible_npeel_number = 1;
  tree vectype;
  unsigned int nelements, mis, same_align_drs_max = 0;
  stmt_vector_for_cost body_cost_vec = stmt_vector_for_cost ();
  hash_table<peel_info_hasher> peeling_htab (1);

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
                     "=== vect_enhance_data_refs_alignment ===\n");

  /* Reset data so we can safely be called multiple times.  */
  LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
  LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = 0;

  /* While cost model enhancements are expected in the future, the high level
     view of the code at this time is as follows:

     A) If there is a misaligned access then see if peeling to align
        this access can make all data references satisfy
        vect_supportable_dr_alignment.  If so, update data structures
        as needed and return true.

     B) If peeling wasn't possible and there is a data reference with an
        unknown misalignment that does not satisfy vect_supportable_dr_alignment
        then see if loop versioning checks can be used to make all data
        references satisfy vect_supportable_dr_alignment.  If so, update
        data structures as needed and return true.

     C) If neither peeling nor versioning were successful then return false if
        any data reference does not satisfy vect_supportable_dr_alignment.

     D) Return true (all data references satisfy vect_supportable_dr_alignment).

     Note, Possibility 3 above (which is peeling and versioning together) is not
     being done at this time.  */

  /* (1) Peeling to force alignment.  */

  /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
     Considerations:
     + How many accesses will become aligned due to the peeling
     - How many accesses will become unaligned due to the peeling,
       and the cost of misaligned accesses.
     - The cost of peeling (the extra runtime checks, the increase
       in code size).  */

  FOR_EACH_VEC_ELT (datarefs, i, dr)
    {
      stmt = DR_STMT (dr);
      stmt_info = vinfo_for_stmt (stmt);

      if (!STMT_VINFO_RELEVANT_P (stmt_info))
        continue;

      /* For interleaving, only the alignment of the first access
         matters.  */
      if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
          && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
        continue;

      /* For invariant accesses there is nothing to enhance.  */
      if (integer_zerop (DR_STEP (dr)))
        continue;

      /* Strided accesses perform only component accesses, alignment is
         irrelevant for them.  */
      if (STMT_VINFO_STRIDED_P (stmt_info)
          && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
        continue;

      supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
      do_peeling = vector_alignment_reachable_p (dr);
      if (do_peeling)
        {
          if (known_alignment_for_access_p (dr))
            {
              unsigned int npeel_tmp;
              bool negative = tree_int_cst_compare (DR_STEP (dr),
                                                    size_zero_node) < 0;

              /* Save info about DR in the hash table.  */
              vectype = STMT_VINFO_VECTYPE (stmt_info);
              nelements = TYPE_VECTOR_SUBPARTS (vectype);
              mis = DR_MISALIGNMENT (dr) / GET_MODE_SIZE (TYPE_MODE (
                                                TREE_TYPE (DR_REF (dr))));
              npeel_tmp = (negative
                           ? (mis - nelements) : (nelements - mis))
                          & (nelements - 1);

              /* For multiple types, it is possible that the bigger type access
                 will have more than one peeling option.  E.g., a loop with two
                 types: one of size (vector size / 4), and the other one of
                 size (vector size / 8).  The vectorization factor will be 8.
                 If both accesses are misaligned by 3, the first one needs one
                 scalar iteration to be aligned, and the second one needs 5.
                 But the first one will be aligned also by peeling 5 scalar
                 iterations, and in that case both accesses will be aligned.
                 Hence, except for the immediate peeling amount, we also want
                 to try to add full vector size, while we don't exceed
                 vectorization factor.
                 We do this automatically for cost model, since we calculate
                 cost for every peeling option.  */
              if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
                {
                  if (STMT_SLP_TYPE (stmt_info))
                    possible_npeel_number
                      = (vf * GROUP_SIZE (stmt_info)) / nelements;
                  else
                    possible_npeel_number = vf / nelements;
                }

              /* Handle the aligned case.  We may decide to align some other
                 access, making DR unaligned.  */
              if (DR_MISALIGNMENT (dr) == 0)
                {
                  npeel_tmp = 0;
                  if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
                    possible_npeel_number++;
                }

              for (j = 0; j < possible_npeel_number; j++)
                {
                  vect_peeling_hash_insert (&peeling_htab, loop_vinfo,
                                            dr, npeel_tmp);
                  npeel_tmp += nelements;
                }

              all_misalignments_unknown = false;
              /* Data-ref that was chosen for the case that all the
                 misalignments are unknown is not relevant anymore, since we
                 have a data-ref with known alignment.  */
              dr0 = NULL;
            }
          else
            {
              /* If we don't know any misalignment values, we prefer
                 peeling for the data-ref that has the maximum number of
                 data-refs with the same alignment, unless the target prefers
                 to align stores over loads.  */
              if (all_misalignments_unknown)
                {
                  unsigned same_align_drs
                    = STMT_VINFO_SAME_ALIGN_REFS (stmt_info).length ();
                  if (!dr0
                      || same_align_drs_max < same_align_drs)
                    {
                      same_align_drs_max = same_align_drs;
                      dr0 = dr;
                    }
                  /* For data-refs with the same number of related
                     accesses prefer the one where the misalign
                     computation will be invariant in the outermost loop.  */
                  else if (same_align_drs_max == same_align_drs)
                    {
                      struct loop *ivloop0, *ivloop;
                      ivloop0 = outermost_invariant_loop_for_expr
                                  (loop, DR_BASE_ADDRESS (dr0));
                      ivloop = outermost_invariant_loop_for_expr
                                 (loop, DR_BASE_ADDRESS (dr));
                      if ((ivloop && !ivloop0)
                          || (ivloop && ivloop0
                              && flow_loop_nested_p (ivloop, ivloop0)))
                        dr0 = dr;
                    }

                  if (!first_store && DR_IS_WRITE (dr))
                    first_store = dr;
                }

              /* If there are both known and unknown misaligned accesses in the
                 loop, we choose peeling amount according to the known
                 accesses.  */
              if (!supportable_dr_alignment)
                {
                  dr0 = dr;
                  if (!first_store && DR_IS_WRITE (dr))
                    first_store = dr;
                }
            }
        }
      else
        {
          if (!aligned_access_p (dr))
            {
              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "vector alignment may not be reachable\n");
              break;
            }
        }
    }

  /* Check if we can possibly peel the loop.  */
  if (!vect_can_advance_ivs_p (loop_vinfo)
      || !slpeel_can_duplicate_loop_p (loop, single_exit (loop))
      || loop->inner)
    do_peeling = false;

  if (do_peeling
      && all_misalignments_unknown
      && vect_supportable_dr_alignment (dr0, false))
    {
      /* Check if the target requires to prefer stores over loads, i.e., if
         misaligned stores are more expensive than misaligned loads (taking
         drs with same alignment into account).  */
      if (first_store && DR_IS_READ (dr0))
        {
          unsigned int load_inside_cost = 0, load_outside_cost = 0;
          unsigned int store_inside_cost = 0, store_outside_cost = 0;
          unsigned int load_inside_penalty = 0, load_outside_penalty = 0;
          unsigned int store_inside_penalty = 0, store_outside_penalty = 0;
          stmt_vector_for_cost dummy;
          dummy.create (2);

          vect_get_data_access_cost (dr0, &load_inside_cost, &load_outside_cost,
                                     &dummy);
          vect_get_data_access_cost (first_store, &store_inside_cost,
                                     &store_outside_cost, &dummy);

          dummy.release ();

          /* Calculate the penalty for leaving FIRST_STORE unaligned (by
             aligning the load DR0).  */
          load_inside_penalty = store_inside_cost;
          load_outside_penalty = store_outside_cost;
          for (i = 0;
               STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (
                                           DR_STMT (first_store))).iterate (i, &dr);
               i++)
            if (DR_IS_READ (dr))
              {
                load_inside_penalty += load_inside_cost;
                load_outside_penalty += load_outside_cost;
              }
            else
              {
                load_inside_penalty += store_inside_cost;
                load_outside_penalty += store_outside_cost;
              }

          /* Calculate the penalty for leaving DR0 unaligned (by
             aligning the FIRST_STORE).  */
*/ 1773 store_inside_penalty = load_inside_cost; 1774 store_outside_penalty = load_outside_cost; 1775 for (i = 0; 1776 STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt ( 1777 DR_STMT (dr0))).iterate (i, &dr); 1778 i++) 1779 if (DR_IS_READ (dr)) 1780 { 1781 store_inside_penalty += load_inside_cost; 1782 store_outside_penalty += load_outside_cost; 1783 } 1784 else 1785 { 1786 store_inside_penalty += store_inside_cost; 1787 store_outside_penalty += store_outside_cost; 1788 } 1789 1790 if (load_inside_penalty > store_inside_penalty 1791 || (load_inside_penalty == store_inside_penalty 1792 && load_outside_penalty > store_outside_penalty)) 1793 dr0 = first_store; 1794 } 1795 1796 /* In case there are only loads with different unknown misalignments, use 1797 peeling only if it may help to align other accesses in the loop or 1798 if it may help improving load bandwith when we'd end up using 1799 unaligned loads. */ 1800 tree dr0_vt = STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr0))); 1801 if (!first_store 1802 && !STMT_VINFO_SAME_ALIGN_REFS ( 1803 vinfo_for_stmt (DR_STMT (dr0))).length () 1804 && (vect_supportable_dr_alignment (dr0, false) 1805 != dr_unaligned_supported 1806 || (builtin_vectorization_cost (vector_load, dr0_vt, 0) 1807 == builtin_vectorization_cost (unaligned_load, dr0_vt, -1)))) 1808 do_peeling = false; 1809 } 1810 1811 if (do_peeling && !dr0) 1812 { 1813 /* Peeling is possible, but there is no data access that is not supported 1814 unless aligned. So we try to choose the best possible peeling. */ 1815 1816 /* We should get here only if there are drs with known misalignment. */ 1817 gcc_assert (!all_misalignments_unknown); 1818 1819 /* Choose the best peeling from the hash table. */ 1820 dr0 = vect_peeling_hash_choose_best_peeling (&peeling_htab, 1821 loop_vinfo, &npeel, 1822 &body_cost_vec); 1823 if (!dr0 || !npeel) 1824 do_peeling = false; 1825 } 1826 1827 if (do_peeling) 1828 { 1829 stmt = DR_STMT (dr0); 1830 stmt_info = vinfo_for_stmt (stmt); 1831 vectype = STMT_VINFO_VECTYPE (stmt_info); 1832 nelements = TYPE_VECTOR_SUBPARTS (vectype); 1833 1834 if (known_alignment_for_access_p (dr0)) 1835 { 1836 bool negative = tree_int_cst_compare (DR_STEP (dr0), 1837 size_zero_node) < 0; 1838 if (!npeel) 1839 { 1840 /* Since it's known at compile time, compute the number of 1841 iterations in the peeled loop (the peeling factor) for use in 1842 updating DR_MISALIGNMENT values. The peeling factor is the 1843 vectorization factor minus the misalignment as an element 1844 count. */ 1845 mis = DR_MISALIGNMENT (dr0); 1846 mis /= GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr0)))); 1847 npeel = ((negative ? mis - nelements : nelements - mis) 1848 & (nelements - 1)); 1849 } 1850 1851 /* For interleaved data access every iteration accesses all the 1852 members of the group, therefore we divide the number of iterations 1853 by the group size. */ 1854 stmt_info = vinfo_for_stmt (DR_STMT (dr0)); 1855 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)) 1856 npeel /= GROUP_SIZE (stmt_info); 1857 1858 if (dump_enabled_p ()) 1859 dump_printf_loc (MSG_NOTE, vect_location, 1860 "Try peeling by %d\n", npeel); 1861 } 1862 1863 /* Ensure that all data refs can be vectorized after the peel. */ 1864 FOR_EACH_VEC_ELT (datarefs, i, dr) 1865 { 1866 int save_misalignment; 1867 1868 if (dr == dr0) 1869 continue; 1870 1871 stmt = DR_STMT (dr); 1872 stmt_info = vinfo_for_stmt (stmt); 1873 /* For interleaving, only the alignment of the first access 1874 matters. 
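     (Illustrative: for a group loading a[2*i] and a[2*i+1] the vectorized
     code loads whole vectors starting at &a[2*i] and extracts the lanes,
     so only the misalignment of that first element is relevant here.)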
*/ 1875 if (STMT_VINFO_GROUPED_ACCESS (stmt_info) 1876 && GROUP_FIRST_ELEMENT (stmt_info) != stmt) 1877 continue; 1878 1879 /* Strided accesses perform only component accesses, alignment is 1880 irrelevant for them. */ 1881 if (STMT_VINFO_STRIDED_P (stmt_info) 1882 && !STMT_VINFO_GROUPED_ACCESS (stmt_info)) 1883 continue; 1884 1885 save_misalignment = DR_MISALIGNMENT (dr); 1886 vect_update_misalignment_for_peel (dr, dr0, npeel); 1887 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false); 1888 SET_DR_MISALIGNMENT (dr, save_misalignment); 1889 1890 if (!supportable_dr_alignment) 1891 { 1892 do_peeling = false; 1893 break; 1894 } 1895 } 1896 1897 if (do_peeling && known_alignment_for_access_p (dr0) && npeel == 0) 1898 { 1899 stat = vect_verify_datarefs_alignment (loop_vinfo); 1900 if (!stat) 1901 do_peeling = false; 1902 else 1903 { 1904 body_cost_vec.release (); 1905 return stat; 1906 } 1907 } 1908 1909 /* Cost model #1 - honor --param vect-max-peeling-for-alignment. */ 1910 if (do_peeling) 1911 { 1912 unsigned max_allowed_peel 1913 = PARAM_VALUE (PARAM_VECT_MAX_PEELING_FOR_ALIGNMENT); 1914 if (max_allowed_peel != (unsigned)-1) 1915 { 1916 unsigned max_peel = npeel; 1917 if (max_peel == 0) 1918 { 1919 gimple *dr_stmt = DR_STMT (dr0); 1920 stmt_vec_info vinfo = vinfo_for_stmt (dr_stmt); 1921 tree vtype = STMT_VINFO_VECTYPE (vinfo); 1922 max_peel = TYPE_VECTOR_SUBPARTS (vtype) - 1; 1923 } 1924 if (max_peel > max_allowed_peel) 1925 { 1926 do_peeling = false; 1927 if (dump_enabled_p ()) 1928 dump_printf_loc (MSG_NOTE, vect_location, 1929 "Disable peeling, max peels reached: %d\n", max_peel); 1930 } 1931 } 1932 } 1933 1934 /* Cost model #2 - if peeling may result in a remaining loop not 1935 iterating enough to be vectorized then do not peel. */ 1936 if (do_peeling 1937 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)) 1938 { 1939 unsigned max_peel 1940 = npeel == 0 ? LOOP_VINFO_VECT_FACTOR (loop_vinfo) - 1 : npeel; 1941 if (LOOP_VINFO_INT_NITERS (loop_vinfo) 1942 < LOOP_VINFO_VECT_FACTOR (loop_vinfo) + max_peel) 1943 do_peeling = false; 1944 } 1945 1946 if (do_peeling) 1947 { 1948 /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i. 1949 If the misalignment of DR_i is identical to that of dr0 then set 1950 DR_MISALIGNMENT (DR_i) to zero. If the misalignment of DR_i and 1951 dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i) 1952 by the peeling factor times the element size of DR_i (MOD the 1953 vectorization factor times the size). Otherwise, the 1954 misalignment of DR_i must be set to unknown. */ 1955 FOR_EACH_VEC_ELT (datarefs, i, dr) 1956 if (dr != dr0) 1957 { 1958 /* Strided accesses perform only component accesses, alignment 1959 is irrelevant for them. 
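     (Illustrative: a load of a[i * stride] with a variable STRIDE is
     emitted as separate element loads, or as a gather, rather than as
     full-vector loads, so there is no vector-alignment requirement to
     check.)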
*/ 1960 stmt_info = vinfo_for_stmt (DR_STMT (dr)); 1961 if (STMT_VINFO_STRIDED_P (stmt_info) 1962 && !STMT_VINFO_GROUPED_ACCESS (stmt_info)) 1963 continue; 1964 1965 vect_update_misalignment_for_peel (dr, dr0, npeel); 1966 } 1967 1968 LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0; 1969 if (npeel) 1970 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel; 1971 else 1972 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) 1973 = DR_MISALIGNMENT (dr0); 1974 SET_DR_MISALIGNMENT (dr0, 0); 1975 if (dump_enabled_p ()) 1976 { 1977 dump_printf_loc (MSG_NOTE, vect_location, 1978 "Alignment of access forced using peeling.\n"); 1979 dump_printf_loc (MSG_NOTE, vect_location, 1980 "Peeling for alignment will be applied.\n"); 1981 } 1982 /* The inside-loop cost will be accounted for in vectorizable_load 1983 and vectorizable_store correctly with adjusted alignments. 1984 Drop the body_cst_vec on the floor here. */ 1985 body_cost_vec.release (); 1986 1987 stat = vect_verify_datarefs_alignment (loop_vinfo); 1988 gcc_assert (stat); 1989 return stat; 1990 } 1991 } 1992 1993 body_cost_vec.release (); 1994 1995 /* (2) Versioning to force alignment. */ 1996 1997 /* Try versioning if: 1998 1) optimize loop for speed 1999 2) there is at least one unsupported misaligned data ref with an unknown 2000 misalignment, and 2001 3) all misaligned data refs with a known misalignment are supported, and 2002 4) the number of runtime alignment checks is within reason. */ 2003 2004 do_versioning = 2005 optimize_loop_nest_for_speed_p (loop) 2006 && (!loop->inner); /* FORNOW */ 2007 2008 if (do_versioning) 2009 { 2010 FOR_EACH_VEC_ELT (datarefs, i, dr) 2011 { 2012 stmt = DR_STMT (dr); 2013 stmt_info = vinfo_for_stmt (stmt); 2014 2015 /* For interleaving, only the alignment of the first access 2016 matters. */ 2017 if (aligned_access_p (dr) 2018 || (STMT_VINFO_GROUPED_ACCESS (stmt_info) 2019 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)) 2020 continue; 2021 2022 if (STMT_VINFO_STRIDED_P (stmt_info)) 2023 { 2024 /* Strided loads perform only component accesses, alignment is 2025 irrelevant for them. */ 2026 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info)) 2027 continue; 2028 do_versioning = false; 2029 break; 2030 } 2031 2032 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false); 2033 2034 if (!supportable_dr_alignment) 2035 { 2036 gimple *stmt; 2037 int mask; 2038 tree vectype; 2039 2040 if (known_alignment_for_access_p (dr) 2041 || LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length () 2042 >= (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS)) 2043 { 2044 do_versioning = false; 2045 break; 2046 } 2047 2048 stmt = DR_STMT (dr); 2049 vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt)); 2050 gcc_assert (vectype); 2051 2052 /* The rightmost bits of an aligned address must be zeros. 2053 Construct the mask needed for this test. For example, 2054 GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the 2055 mask must be 15 = 0xf. */ 2056 mask = GET_MODE_SIZE (TYPE_MODE (vectype)) - 1; 2057 2058 /* FORNOW: use the same mask to test all potentially unaligned 2059 references in the loop. The vectorizer currently supports 2060 a single vector size, see the reference to 2061 GET_MODE_NUNITS (TYPE_MODE (vectype)) where the 2062 vectorization factor is computed. 
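     As an illustrative sketch of the guard that versioning emits later:
     with 16-byte vectors the mask is 15 and the runtime test is roughly

       ((((uintptr_t) p1) | ((uintptr_t) p2) | ...) & 15) == 0

     where p1, p2, ... are the addresses accessed by the statements pushed
     onto LOOP_VINFO_MAY_MISALIGN_STMTS; the vectorized copy of the loop
     runs only when the test succeeds.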
*/ 2063 gcc_assert (!LOOP_VINFO_PTR_MASK (loop_vinfo) 2064 || LOOP_VINFO_PTR_MASK (loop_vinfo) == mask); 2065 LOOP_VINFO_PTR_MASK (loop_vinfo) = mask; 2066 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push ( 2067 DR_STMT (dr)); 2068 } 2069 } 2070 2071 /* Versioning requires at least one misaligned data reference. */ 2072 if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)) 2073 do_versioning = false; 2074 else if (!do_versioning) 2075 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0); 2076 } 2077 2078 if (do_versioning) 2079 { 2080 vec<gimple *> may_misalign_stmts 2081 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo); 2082 gimple *stmt; 2083 2084 /* It can now be assumed that the data references in the statements 2085 in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version 2086 of the loop being vectorized. */ 2087 FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt) 2088 { 2089 stmt_vec_info stmt_info = vinfo_for_stmt (stmt); 2090 dr = STMT_VINFO_DATA_REF (stmt_info); 2091 SET_DR_MISALIGNMENT (dr, 0); 2092 if (dump_enabled_p ()) 2093 dump_printf_loc (MSG_NOTE, vect_location, 2094 "Alignment of access forced using versioning.\n"); 2095 } 2096 2097 if (dump_enabled_p ()) 2098 dump_printf_loc (MSG_NOTE, vect_location, 2099 "Versioning for alignment will be applied.\n"); 2100 2101 /* Peeling and versioning can't be done together at this time. */ 2102 gcc_assert (! (do_peeling && do_versioning)); 2103 2104 stat = vect_verify_datarefs_alignment (loop_vinfo); 2105 gcc_assert (stat); 2106 return stat; 2107 } 2108 2109 /* This point is reached if neither peeling nor versioning is being done. */ 2110 gcc_assert (! (do_peeling || do_versioning)); 2111 2112 stat = vect_verify_datarefs_alignment (loop_vinfo); 2113 return stat; 2114 } 2115 2116 2117 /* Function vect_find_same_alignment_drs. 2118 2119 Update group and alignment relations according to the chosen 2120 vectorization factor. */ 2121 2122 static void 2123 vect_find_same_alignment_drs (struct data_dependence_relation *ddr, 2124 loop_vec_info loop_vinfo) 2125 { 2126 unsigned int i; 2127 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); 2128 int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo); 2129 struct data_reference *dra = DDR_A (ddr); 2130 struct data_reference *drb = DDR_B (ddr); 2131 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra)); 2132 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb)); 2133 int dra_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dra)))); 2134 int drb_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (drb)))); 2135 lambda_vector dist_v; 2136 unsigned int loop_depth; 2137 2138 if (DDR_ARE_DEPENDENT (ddr) == chrec_known) 2139 return; 2140 2141 if (dra == drb) 2142 return; 2143 2144 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know) 2145 return; 2146 2147 /* Loop-based vectorization and known data dependence. */ 2148 if (DDR_NUM_DIST_VECTS (ddr) == 0) 2149 return; 2150 2151 /* Data-dependence analysis reports a distance vector of zero 2152 for data-references that overlap only in the first iteration 2153 but have different sign step (see PR45764). 2154 So as a sanity check require equal DR_STEP. 
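     For instance (illustrative), a[i] and a[-i] step in opposite
     directions and overlap only at i == 0, yet may be reported with
     distance zero; treating them as having the same alignment in every
     iteration would be wrong, hence the DR_STEP check below.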
*/ 2155 if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0)) 2156 return; 2157 2158 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr)); 2159 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v) 2160 { 2161 int dist = dist_v[loop_depth]; 2162 2163 if (dump_enabled_p ()) 2164 dump_printf_loc (MSG_NOTE, vect_location, 2165 "dependence distance = %d.\n", dist); 2166 2167 /* Same loop iteration. */ 2168 if (dist == 0 2169 || (dist % vectorization_factor == 0 && dra_size == drb_size)) 2170 { 2171 /* Two references with distance zero have the same alignment. */ 2172 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_a).safe_push (drb); 2173 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_b).safe_push (dra); 2174 if (dump_enabled_p ()) 2175 { 2176 dump_printf_loc (MSG_NOTE, vect_location, 2177 "accesses have the same alignment.\n"); 2178 dump_printf (MSG_NOTE, 2179 "dependence distance modulo vf == 0 between "); 2180 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra)); 2181 dump_printf (MSG_NOTE, " and "); 2182 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb)); 2183 dump_printf (MSG_NOTE, "\n"); 2184 } 2185 } 2186 } 2187 } 2188 2189 2190 /* Function vect_analyze_data_refs_alignment 2191 2192 Analyze the alignment of the data-references in the loop. 2193 Return FALSE if a data reference is found that cannot be vectorized. */ 2194 2195 bool 2196 vect_analyze_data_refs_alignment (loop_vec_info vinfo) 2197 { 2198 if (dump_enabled_p ()) 2199 dump_printf_loc (MSG_NOTE, vect_location, 2200 "=== vect_analyze_data_refs_alignment ===\n"); 2201 2202 /* Mark groups of data references with same alignment using 2203 data dependence information. */ 2204 vec<ddr_p> ddrs = vinfo->ddrs; 2205 struct data_dependence_relation *ddr; 2206 unsigned int i; 2207 2208 FOR_EACH_VEC_ELT (ddrs, i, ddr) 2209 vect_find_same_alignment_drs (ddr, vinfo); 2210 2211 vec<data_reference_p> datarefs = vinfo->datarefs; 2212 struct data_reference *dr; 2213 2214 FOR_EACH_VEC_ELT (datarefs, i, dr) 2215 { 2216 stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr)); 2217 if (STMT_VINFO_VECTORIZABLE (stmt_info) 2218 && !vect_compute_data_ref_alignment (dr)) 2219 { 2220 /* Strided accesses perform only component accesses, misalignment 2221 information is irrelevant for them. */ 2222 if (STMT_VINFO_STRIDED_P (stmt_info) 2223 && !STMT_VINFO_GROUPED_ACCESS (stmt_info)) 2224 continue; 2225 2226 if (dump_enabled_p ()) 2227 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 2228 "not vectorized: can't calculate alignment " 2229 "for data ref.\n"); 2230 2231 return false; 2232 } 2233 } 2234 2235 return true; 2236 } 2237 2238 2239 /* Analyze alignment of DRs of stmts in NODE. */ 2240 2241 static bool 2242 vect_slp_analyze_and_verify_node_alignment (slp_tree node) 2243 { 2244 /* We vectorize from the first scalar stmt in the node unless 2245 the node is permuted in which case we start from the first 2246 element in the group. */ 2247 gimple *first_stmt = SLP_TREE_SCALAR_STMTS (node)[0]; 2248 data_reference_p first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt)); 2249 if (SLP_TREE_LOAD_PERMUTATION (node).exists ()) 2250 first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (first_stmt)); 2251 2252 data_reference_p dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt)); 2253 if (! vect_compute_data_ref_alignment (dr) 2254 /* For creating the data-ref pointer we need alignment of the 2255 first element anyway. */ 2256 || (dr != first_dr 2257 && ! vect_compute_data_ref_alignment (first_dr)) 2258 || ! 
verify_data_ref_alignment (dr)) 2259 { 2260 if (dump_enabled_p ()) 2261 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 2262 "not vectorized: bad data alignment in basic " 2263 "block.\n"); 2264 return false; 2265 } 2266 2267 return true; 2268 } 2269 2270 /* Function vect_slp_analyze_instance_alignment 2271 2272 Analyze the alignment of the data-references in the SLP instance. 2273 Return FALSE if a data reference is found that cannot be vectorized. */ 2274 2275 bool 2276 vect_slp_analyze_and_verify_instance_alignment (slp_instance instance) 2277 { 2278 if (dump_enabled_p ()) 2279 dump_printf_loc (MSG_NOTE, vect_location, 2280 "=== vect_slp_analyze_and_verify_instance_alignment ===\n"); 2281 2282 slp_tree node; 2283 unsigned i; 2284 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, node) 2285 if (! vect_slp_analyze_and_verify_node_alignment (node)) 2286 return false; 2287 2288 node = SLP_INSTANCE_TREE (instance); 2289 if (STMT_VINFO_DATA_REF (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (node)[0])) 2290 && ! vect_slp_analyze_and_verify_node_alignment 2291 (SLP_INSTANCE_TREE (instance))) 2292 return false; 2293 2294 return true; 2295 } 2296 2297 2298 /* Analyze groups of accesses: check that DR belongs to a group of 2299 accesses of legal size, step, etc. Detect gaps, single element 2300 interleaving, and other special cases. Set grouped access info. 2301 Collect groups of strided stores for further use in SLP analysis. 2302 Worker for vect_analyze_group_access. */ 2303 2304 static bool 2305 vect_analyze_group_access_1 (struct data_reference *dr) 2306 { 2307 tree step = DR_STEP (dr); 2308 tree scalar_type = TREE_TYPE (DR_REF (dr)); 2309 HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type)); 2310 gimple *stmt = DR_STMT (dr); 2311 stmt_vec_info stmt_info = vinfo_for_stmt (stmt); 2312 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); 2313 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info); 2314 HOST_WIDE_INT dr_step = -1; 2315 HOST_WIDE_INT groupsize, last_accessed_element = 1; 2316 bool slp_impossible = false; 2317 2318 /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the 2319 size of the interleaving group (including gaps). */ 2320 if (tree_fits_shwi_p (step)) 2321 { 2322 dr_step = tree_to_shwi (step); 2323 /* Check that STEP is a multiple of type size. Otherwise there is 2324 a non-element-sized gap at the end of the group which we 2325 cannot represent in GROUP_GAP or GROUP_SIZE. 2326 ??? As we can handle non-constant step fine here we should 2327 simply remove uses of GROUP_GAP between the last and first 2328 element and instead rely on DR_STEP. GROUP_SIZE then would 2329 simply not include that gap. */ 2330 if ((dr_step % type_size) != 0) 2331 { 2332 if (dump_enabled_p ()) 2333 { 2334 dump_printf_loc (MSG_NOTE, vect_location, 2335 "Step "); 2336 dump_generic_expr (MSG_NOTE, TDF_SLIM, step); 2337 dump_printf (MSG_NOTE, 2338 " is not a multiple of the element size for "); 2339 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr)); 2340 dump_printf (MSG_NOTE, "\n"); 2341 } 2342 return false; 2343 } 2344 groupsize = absu_hwi (dr_step) / type_size; 2345 } 2346 else 2347 groupsize = 0; 2348 2349 /* Not consecutive access is possible only if it is a part of interleaving. */ 2350 if (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt))) 2351 { 2352 /* Check if it this DR is a part of interleaving, and is a single 2353 element of the group that is accessed in the loop. */ 2354 2355 /* Gaps are supported only for loads. 
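     For example (illustrative), the loop

        for (i = 0; i < n; i++)
          sum += a[4 * i];

     reads one element out of every four: the group size is 4 and the other
     three elements of each group form a gap, which is acceptable for a
     load but would be rejected for a store.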
STEP must be a multiple of the type 2356 size. The size of the group must be a power of 2. */ 2357 if (DR_IS_READ (dr) 2358 && (dr_step % type_size) == 0 2359 && groupsize > 0 2360 && pow2p_hwi (groupsize)) 2361 { 2362 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = stmt; 2363 GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize; 2364 GROUP_GAP (stmt_info) = groupsize - 1; 2365 if (dump_enabled_p ()) 2366 { 2367 dump_printf_loc (MSG_NOTE, vect_location, 2368 "Detected single element interleaving "); 2369 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr)); 2370 dump_printf (MSG_NOTE, " step "); 2371 dump_generic_expr (MSG_NOTE, TDF_SLIM, step); 2372 dump_printf (MSG_NOTE, "\n"); 2373 } 2374 2375 return true; 2376 } 2377 2378 if (dump_enabled_p ()) 2379 { 2380 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 2381 "not consecutive access "); 2382 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0); 2383 } 2384 2385 if (bb_vinfo) 2386 { 2387 /* Mark the statement as unvectorizable. */ 2388 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false; 2389 return true; 2390 } 2391 2392 dump_printf_loc (MSG_NOTE, vect_location, "using strided accesses\n"); 2393 STMT_VINFO_STRIDED_P (stmt_info) = true; 2394 return true; 2395 } 2396 2397 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) == stmt) 2398 { 2399 /* First stmt in the interleaving chain. Check the chain. */ 2400 gimple *next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt)); 2401 struct data_reference *data_ref = dr; 2402 unsigned int count = 1; 2403 tree prev_init = DR_INIT (data_ref); 2404 gimple *prev = stmt; 2405 HOST_WIDE_INT diff, gaps = 0; 2406 2407 while (next) 2408 { 2409 /* Skip same data-refs. In case that two or more stmts share 2410 data-ref (supported only for loads), we vectorize only the first 2411 stmt, and the rest get their vectorized loads from the first 2412 one. */ 2413 if (!tree_int_cst_compare (DR_INIT (data_ref), 2414 DR_INIT (STMT_VINFO_DATA_REF ( 2415 vinfo_for_stmt (next))))) 2416 { 2417 if (DR_IS_WRITE (data_ref)) 2418 { 2419 if (dump_enabled_p ()) 2420 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 2421 "Two store stmts share the same dr.\n"); 2422 return false; 2423 } 2424 2425 if (dump_enabled_p ()) 2426 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 2427 "Two or more load stmts share the same dr.\n"); 2428 2429 /* For load use the same data-ref load. */ 2430 GROUP_SAME_DR_STMT (vinfo_for_stmt (next)) = prev; 2431 2432 prev = next; 2433 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next)); 2434 continue; 2435 } 2436 2437 prev = next; 2438 data_ref = STMT_VINFO_DATA_REF (vinfo_for_stmt (next)); 2439 2440 /* All group members have the same STEP by construction. */ 2441 gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0)); 2442 2443 /* Check that the distance between two accesses is equal to the type 2444 size. Otherwise, we have gaps. */ 2445 diff = (TREE_INT_CST_LOW (DR_INIT (data_ref)) 2446 - TREE_INT_CST_LOW (prev_init)) / type_size; 2447 if (diff != 1) 2448 { 2449 /* FORNOW: SLP of accesses with gaps is not supported. */ 2450 slp_impossible = true; 2451 if (DR_IS_WRITE (data_ref)) 2452 { 2453 if (dump_enabled_p ()) 2454 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 2455 "interleaved store with gaps\n"); 2456 return false; 2457 } 2458 2459 gaps += diff - 1; 2460 } 2461 2462 last_accessed_element += diff; 2463 2464 /* Store the gap from the previous member of the group. If there is no 2465 gap in the access, GROUP_GAP is always 1. 
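     For example (illustrative), in a group accessing a[3*i] and a[3*i + 2]
     the second member is two elements past the first, so it gets
     GROUP_GAP 2.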
*/ 2466 GROUP_GAP (vinfo_for_stmt (next)) = diff; 2467 2468 prev_init = DR_INIT (data_ref); 2469 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next)); 2470 /* Count the number of data-refs in the chain. */ 2471 count++; 2472 } 2473 2474 if (groupsize == 0) 2475 groupsize = count + gaps; 2476 2477 /* This could be UINT_MAX but as we are generating code in a very 2478 inefficient way we have to cap earlier. See PR78699 for example. */ 2479 if (groupsize > 4096) 2480 { 2481 if (dump_enabled_p ()) 2482 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 2483 "group is too large\n"); 2484 return false; 2485 } 2486 2487 /* Check that the size of the interleaving is equal to count for stores, 2488 i.e., that there are no gaps. */ 2489 if (groupsize != count 2490 && !DR_IS_READ (dr)) 2491 { 2492 if (dump_enabled_p ()) 2493 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 2494 "interleaved store with gaps\n"); 2495 return false; 2496 } 2497 2498 /* If there is a gap after the last load in the group it is the 2499 difference between the groupsize and the last accessed 2500 element. 2501 When there is no gap, this difference should be 0. */ 2502 GROUP_GAP (vinfo_for_stmt (stmt)) = groupsize - last_accessed_element; 2503 2504 GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize; 2505 if (dump_enabled_p ()) 2506 { 2507 dump_printf_loc (MSG_NOTE, vect_location, 2508 "Detected interleaving "); 2509 if (DR_IS_READ (dr)) 2510 dump_printf (MSG_NOTE, "load "); 2511 else 2512 dump_printf (MSG_NOTE, "store "); 2513 dump_printf (MSG_NOTE, "of size %u starting with ", 2514 (unsigned)groupsize); 2515 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0); 2516 if (GROUP_GAP (vinfo_for_stmt (stmt)) != 0) 2517 dump_printf_loc (MSG_NOTE, vect_location, 2518 "There is a gap of %u elements after the group\n", 2519 GROUP_GAP (vinfo_for_stmt (stmt))); 2520 } 2521 2522 /* SLP: create an SLP data structure for every interleaving group of 2523 stores for further analysis in vect_analyse_slp. */ 2524 if (DR_IS_WRITE (dr) && !slp_impossible) 2525 { 2526 if (loop_vinfo) 2527 LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt); 2528 if (bb_vinfo) 2529 BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt); 2530 } 2531 } 2532 2533 return true; 2534 } 2535 2536 /* Analyze groups of accesses: check that DR belongs to a group of 2537 accesses of legal size, step, etc. Detect gaps, single element 2538 interleaving, and other special cases. Set grouped access info. 2539 Collect groups of strided stores for further use in SLP analysis. */ 2540 2541 static bool 2542 vect_analyze_group_access (struct data_reference *dr) 2543 { 2544 if (!vect_analyze_group_access_1 (dr)) 2545 { 2546 /* Dissolve the group if present. */ 2547 gimple *next; 2548 gimple *stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dr))); 2549 while (stmt) 2550 { 2551 stmt_vec_info vinfo = vinfo_for_stmt (stmt); 2552 next = GROUP_NEXT_ELEMENT (vinfo); 2553 GROUP_FIRST_ELEMENT (vinfo) = NULL; 2554 GROUP_NEXT_ELEMENT (vinfo) = NULL; 2555 stmt = next; 2556 } 2557 return false; 2558 } 2559 return true; 2560 } 2561 2562 /* Analyze the access pattern of the data-reference DR. 2563 In case of non-consecutive accesses call vect_analyze_group_access() to 2564 analyze groups of accesses. 
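   Illustrative examples of the cases handled below: a[i] has DR_STEP equal
   to the element size (consecutive); a[2*i] together with a[2*i+1] has
   DR_STEP twice the element size (a grouped access); a[i * n] with a
   loop-invariant but non-constant n has a non-constant DR_STEP (strided).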
*/ 2565 2566 static bool 2567 vect_analyze_data_ref_access (struct data_reference *dr) 2568 { 2569 tree step = DR_STEP (dr); 2570 tree scalar_type = TREE_TYPE (DR_REF (dr)); 2571 gimple *stmt = DR_STMT (dr); 2572 stmt_vec_info stmt_info = vinfo_for_stmt (stmt); 2573 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); 2574 struct loop *loop = NULL; 2575 2576 if (loop_vinfo) 2577 loop = LOOP_VINFO_LOOP (loop_vinfo); 2578 2579 if (loop_vinfo && !step) 2580 { 2581 if (dump_enabled_p ()) 2582 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 2583 "bad data-ref access in loop\n"); 2584 return false; 2585 } 2586 2587 /* Allow loads with zero step in inner-loop vectorization. */ 2588 if (loop_vinfo && integer_zerop (step)) 2589 { 2590 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL; 2591 if (!nested_in_vect_loop_p (loop, stmt)) 2592 return DR_IS_READ (dr); 2593 /* Allow references with zero step for outer loops marked 2594 with pragma omp simd only - it guarantees absence of 2595 loop-carried dependencies between inner loop iterations. */ 2596 if (!loop->force_vectorize || loop->safelen < 2) 2597 { 2598 if (dump_enabled_p ()) 2599 dump_printf_loc (MSG_NOTE, vect_location, 2600 "zero step in inner loop of nest\n"); 2601 return false; 2602 } 2603 } 2604 2605 if (loop && nested_in_vect_loop_p (loop, stmt)) 2606 { 2607 /* Interleaved accesses are not yet supported within outer-loop 2608 vectorization for references in the inner-loop. */ 2609 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL; 2610 2611 /* For the rest of the analysis we use the outer-loop step. */ 2612 step = STMT_VINFO_DR_STEP (stmt_info); 2613 if (integer_zerop (step)) 2614 { 2615 if (dump_enabled_p ()) 2616 dump_printf_loc (MSG_NOTE, vect_location, 2617 "zero step in outer loop.\n"); 2618 return DR_IS_READ (dr); 2619 } 2620 } 2621 2622 /* Consecutive? */ 2623 if (TREE_CODE (step) == INTEGER_CST) 2624 { 2625 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step); 2626 if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type)) 2627 || (dr_step < 0 2628 && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step))) 2629 { 2630 /* Mark that it is not interleaving. */ 2631 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL; 2632 return true; 2633 } 2634 } 2635 2636 if (loop && nested_in_vect_loop_p (loop, stmt)) 2637 { 2638 if (dump_enabled_p ()) 2639 dump_printf_loc (MSG_NOTE, vect_location, 2640 "grouped access in outer loop.\n"); 2641 return false; 2642 } 2643 2644 2645 /* Assume this is a DR handled by non-constant strided load case. */ 2646 if (TREE_CODE (step) != INTEGER_CST) 2647 return (STMT_VINFO_STRIDED_P (stmt_info) 2648 && (!STMT_VINFO_GROUPED_ACCESS (stmt_info) 2649 || vect_analyze_group_access (dr))); 2650 2651 /* Not consecutive access - check if it's a part of interleaving group. */ 2652 return vect_analyze_group_access (dr); 2653 } 2654 2655 2656 2657 /* A helper function used in the comparator function to sort data 2658 references. T1 and T2 are two data references to be compared. 2659 The function returns -1, 0, or 1. */ 2660 2661 static int 2662 compare_tree (tree t1, tree t2) 2663 { 2664 int i, cmp; 2665 enum tree_code code; 2666 char tclass; 2667 2668 if (t1 == t2) 2669 return 0; 2670 if (t1 == NULL) 2671 return -1; 2672 if (t2 == NULL) 2673 return 1; 2674 2675 STRIP_NOPS (t1); 2676 STRIP_NOPS (t2); 2677 2678 if (TREE_CODE (t1) != TREE_CODE (t2)) 2679 return TREE_CODE (t1) < TREE_CODE (t2) ? 
-1 : 1; 2680 2681 code = TREE_CODE (t1); 2682 switch (code) 2683 { 2684 /* For const values, we can just use hash values for comparisons. */ 2685 case INTEGER_CST: 2686 case REAL_CST: 2687 case FIXED_CST: 2688 case STRING_CST: 2689 case COMPLEX_CST: 2690 case VECTOR_CST: 2691 { 2692 hashval_t h1 = iterative_hash_expr (t1, 0); 2693 hashval_t h2 = iterative_hash_expr (t2, 0); 2694 if (h1 != h2) 2695 return h1 < h2 ? -1 : 1; 2696 break; 2697 } 2698 2699 case SSA_NAME: 2700 cmp = compare_tree (SSA_NAME_VAR (t1), SSA_NAME_VAR (t2)); 2701 if (cmp != 0) 2702 return cmp; 2703 2704 if (SSA_NAME_VERSION (t1) != SSA_NAME_VERSION (t2)) 2705 return SSA_NAME_VERSION (t1) < SSA_NAME_VERSION (t2) ? -1 : 1; 2706 break; 2707 2708 default: 2709 tclass = TREE_CODE_CLASS (code); 2710 2711 /* For var-decl, we could compare their UIDs. */ 2712 if (tclass == tcc_declaration) 2713 { 2714 if (DECL_UID (t1) != DECL_UID (t2)) 2715 return DECL_UID (t1) < DECL_UID (t2) ? -1 : 1; 2716 break; 2717 } 2718 2719 /* For expressions with operands, compare their operands recursively. */ 2720 for (i = TREE_OPERAND_LENGTH (t1) - 1; i >= 0; --i) 2721 { 2722 cmp = compare_tree (TREE_OPERAND (t1, i), TREE_OPERAND (t2, i)); 2723 if (cmp != 0) 2724 return cmp; 2725 } 2726 } 2727 2728 return 0; 2729 } 2730 2731 2732 /* Compare two data-references DRA and DRB to group them into chunks 2733 suitable for grouping. */ 2734 2735 static int 2736 dr_group_sort_cmp (const void *dra_, const void *drb_) 2737 { 2738 data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_); 2739 data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_); 2740 int cmp; 2741 2742 /* Stabilize sort. */ 2743 if (dra == drb) 2744 return 0; 2745 2746 /* DRs in different loops never belong to the same group. */ 2747 loop_p loopa = gimple_bb (DR_STMT (dra))->loop_father; 2748 loop_p loopb = gimple_bb (DR_STMT (drb))->loop_father; 2749 if (loopa != loopb) 2750 return loopa->num < loopb->num ? -1 : 1; 2751 2752 /* Ordering of DRs according to base. */ 2753 if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0)) 2754 { 2755 cmp = compare_tree (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb)); 2756 if (cmp != 0) 2757 return cmp; 2758 } 2759 2760 /* And according to DR_OFFSET. */ 2761 if (!dr_equal_offsets_p (dra, drb)) 2762 { 2763 cmp = compare_tree (DR_OFFSET (dra), DR_OFFSET (drb)); 2764 if (cmp != 0) 2765 return cmp; 2766 } 2767 2768 /* Put reads before writes. */ 2769 if (DR_IS_READ (dra) != DR_IS_READ (drb)) 2770 return DR_IS_READ (dra) ? -1 : 1; 2771 2772 /* Then sort after access size. */ 2773 if (!operand_equal_p (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))), 2774 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))), 0)) 2775 { 2776 cmp = compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))), 2777 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)))); 2778 if (cmp != 0) 2779 return cmp; 2780 } 2781 2782 /* And after step. */ 2783 if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0)) 2784 { 2785 cmp = compare_tree (DR_STEP (dra), DR_STEP (drb)); 2786 if (cmp != 0) 2787 return cmp; 2788 } 2789 2790 /* Then sort after DR_INIT. In case of identical DRs sort after stmt UID. */ 2791 cmp = tree_int_cst_compare (DR_INIT (dra), DR_INIT (drb)); 2792 if (cmp == 0) 2793 return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1; 2794 return cmp; 2795 } 2796 2797 /* Function vect_analyze_data_ref_accesses. 2798 2799 Analyze the access pattern of all the data references in the loop. 
2800 2801 FORNOW: the only access pattern that is considered vectorizable is a 2802 simple step 1 (consecutive) access. 2803 2804 FORNOW: handle only arrays and pointer accesses. */ 2805 2806 bool 2807 vect_analyze_data_ref_accesses (vec_info *vinfo) 2808 { 2809 unsigned int i; 2810 vec<data_reference_p> datarefs = vinfo->datarefs; 2811 struct data_reference *dr; 2812 2813 if (dump_enabled_p ()) 2814 dump_printf_loc (MSG_NOTE, vect_location, 2815 "=== vect_analyze_data_ref_accesses ===\n"); 2816 2817 if (datarefs.is_empty ()) 2818 return true; 2819 2820 /* Sort the array of datarefs to make building the interleaving chains 2821 linear. Don't modify the original vector's order, it is needed for 2822 determining what dependencies are reversed. */ 2823 vec<data_reference_p> datarefs_copy = datarefs.copy (); 2824 datarefs_copy.qsort (dr_group_sort_cmp); 2825 2826 /* Build the interleaving chains. */ 2827 for (i = 0; i < datarefs_copy.length () - 1;) 2828 { 2829 data_reference_p dra = datarefs_copy[i]; 2830 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra)); 2831 stmt_vec_info lastinfo = NULL; 2832 if (! STMT_VINFO_VECTORIZABLE (stmtinfo_a)) 2833 { 2834 ++i; 2835 continue; 2836 } 2837 for (i = i + 1; i < datarefs_copy.length (); ++i) 2838 { 2839 data_reference_p drb = datarefs_copy[i]; 2840 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb)); 2841 if (! STMT_VINFO_VECTORIZABLE (stmtinfo_b)) 2842 break; 2843 2844 /* ??? Imperfect sorting (non-compatible types, non-modulo 2845 accesses, same accesses) can lead to a group to be artificially 2846 split here as we don't just skip over those. If it really 2847 matters we can push those to a worklist and re-iterate 2848 over them. The we can just skip ahead to the next DR here. */ 2849 2850 /* DRs in a different loop should not be put into the same 2851 interleaving group. */ 2852 if (gimple_bb (DR_STMT (dra))->loop_father 2853 != gimple_bb (DR_STMT (drb))->loop_father) 2854 break; 2855 2856 /* Check that the data-refs have same first location (except init) 2857 and they are both either store or load (not load and store, 2858 not masked loads or stores). */ 2859 if (DR_IS_READ (dra) != DR_IS_READ (drb) 2860 || !operand_equal_p (DR_BASE_ADDRESS (dra), 2861 DR_BASE_ADDRESS (drb), 0) 2862 || !dr_equal_offsets_p (dra, drb) 2863 || !gimple_assign_single_p (DR_STMT (dra)) 2864 || !gimple_assign_single_p (DR_STMT (drb))) 2865 break; 2866 2867 /* Check that the data-refs have the same constant size. */ 2868 tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))); 2869 tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))); 2870 if (!tree_fits_uhwi_p (sza) 2871 || !tree_fits_uhwi_p (szb) 2872 || !tree_int_cst_equal (sza, szb)) 2873 break; 2874 2875 /* Check that the data-refs have the same step. */ 2876 if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0)) 2877 break; 2878 2879 /* Do not place the same access in the interleaving chain twice. */ 2880 if (tree_int_cst_compare (DR_INIT (dra), DR_INIT (drb)) == 0) 2881 break; 2882 2883 /* Check the types are compatible. 2884 ??? We don't distinguish this during sorting. */ 2885 if (!types_compatible_p (TREE_TYPE (DR_REF (dra)), 2886 TREE_TYPE (DR_REF (drb)))) 2887 break; 2888 2889 /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb). 
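	     Illustrative values: for 4-byte int accesses a[i] and a[i + 2],
	     init_a is 0 and init_b is 8; type_size_a is 4, so
	     (init_b - init_a) % type_size_a == 0 and the two refs may belong
	     to one interleaving chain.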
*/ 2890 HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra)); 2891 HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb)); 2892 gcc_assert (init_a <= init_b); 2893 2894 /* If init_b == init_a + the size of the type * k, we have an 2895 interleaving, and DRA is accessed before DRB. */ 2896 HOST_WIDE_INT type_size_a = tree_to_uhwi (sza); 2897 if (type_size_a == 0 2898 || (init_b - init_a) % type_size_a != 0) 2899 break; 2900 2901 /* If we have a store, the accesses are adjacent. This splits 2902 groups into chunks we support (we don't support vectorization 2903 of stores with gaps). */ 2904 if (!DR_IS_READ (dra) 2905 && (init_b - (HOST_WIDE_INT) TREE_INT_CST_LOW 2906 (DR_INIT (datarefs_copy[i-1])) 2907 != type_size_a)) 2908 break; 2909 2910 /* If the step (if not zero or non-constant) is greater than the 2911 difference between data-refs' inits this splits groups into 2912 suitable sizes. */ 2913 if (tree_fits_shwi_p (DR_STEP (dra))) 2914 { 2915 HOST_WIDE_INT step = tree_to_shwi (DR_STEP (dra)); 2916 if (step != 0 && step <= (init_b - init_a)) 2917 break; 2918 } 2919 2920 if (dump_enabled_p ()) 2921 { 2922 dump_printf_loc (MSG_NOTE, vect_location, 2923 "Detected interleaving "); 2924 if (DR_IS_READ (dra)) 2925 dump_printf (MSG_NOTE, "load "); 2926 else 2927 dump_printf (MSG_NOTE, "store "); 2928 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra)); 2929 dump_printf (MSG_NOTE, " and "); 2930 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb)); 2931 dump_printf (MSG_NOTE, "\n"); 2932 } 2933 2934 /* Link the found element into the group list. */ 2935 if (!GROUP_FIRST_ELEMENT (stmtinfo_a)) 2936 { 2937 GROUP_FIRST_ELEMENT (stmtinfo_a) = DR_STMT (dra); 2938 lastinfo = stmtinfo_a; 2939 } 2940 GROUP_FIRST_ELEMENT (stmtinfo_b) = DR_STMT (dra); 2941 GROUP_NEXT_ELEMENT (lastinfo) = DR_STMT (drb); 2942 lastinfo = stmtinfo_b; 2943 } 2944 } 2945 2946 FOR_EACH_VEC_ELT (datarefs_copy, i, dr) 2947 if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) 2948 && !vect_analyze_data_ref_access (dr)) 2949 { 2950 if (dump_enabled_p ()) 2951 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 2952 "not vectorized: complicated access pattern.\n"); 2953 2954 if (is_a <bb_vec_info> (vinfo)) 2955 { 2956 /* Mark the statement as not vectorizable. */ 2957 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false; 2958 continue; 2959 } 2960 else 2961 { 2962 datarefs_copy.release (); 2963 return false; 2964 } 2965 } 2966 2967 datarefs_copy.release (); 2968 return true; 2969 } 2970 2971 2972 /* Operator == between two dr_with_seg_len objects. 2973 2974 This equality operator is used to make sure two data refs 2975 are the same one so that we will consider to combine the 2976 aliasing checks of those two pairs of data dependent data 2977 refs. */ 2978 2979 static bool 2980 operator == (const dr_with_seg_len& d1, 2981 const dr_with_seg_len& d2) 2982 { 2983 return operand_equal_p (DR_BASE_ADDRESS (d1.dr), 2984 DR_BASE_ADDRESS (d2.dr), 0) 2985 && compare_tree (DR_OFFSET (d1.dr), DR_OFFSET (d2.dr)) == 0 2986 && compare_tree (DR_INIT (d1.dr), DR_INIT (d2.dr)) == 0 2987 && compare_tree (d1.seg_len, d2.seg_len) == 0; 2988 } 2989 2990 /* Function comp_dr_with_seg_len_pair. 2991 2992 Comparison function for sorting objects of dr_with_seg_len_pair_t 2993 so that we can combine aliasing checks in one scan. 
*/ 2994 2995 static int 2996 comp_dr_with_seg_len_pair (const void *pa_, const void *pb_) 2997 { 2998 const dr_with_seg_len_pair_t* pa = (const dr_with_seg_len_pair_t *) pa_; 2999 const dr_with_seg_len_pair_t* pb = (const dr_with_seg_len_pair_t *) pb_; 3000 const dr_with_seg_len &a1 = pa->first, &a2 = pa->second; 3001 const dr_with_seg_len &b1 = pb->first, &b2 = pb->second; 3002 3003 /* For DR pairs (a, b) and (c, d), we only consider to merge the alias checks 3004 if a and c have the same basic address snd step, and b and d have the same 3005 address and step. Therefore, if any a&c or b&d don't have the same address 3006 and step, we don't care the order of those two pairs after sorting. */ 3007 int comp_res; 3008 3009 if ((comp_res = compare_tree (DR_BASE_ADDRESS (a1.dr), 3010 DR_BASE_ADDRESS (b1.dr))) != 0) 3011 return comp_res; 3012 if ((comp_res = compare_tree (DR_BASE_ADDRESS (a2.dr), 3013 DR_BASE_ADDRESS (b2.dr))) != 0) 3014 return comp_res; 3015 if ((comp_res = compare_tree (DR_STEP (a1.dr), DR_STEP (b1.dr))) != 0) 3016 return comp_res; 3017 if ((comp_res = compare_tree (DR_STEP (a2.dr), DR_STEP (b2.dr))) != 0) 3018 return comp_res; 3019 if ((comp_res = compare_tree (DR_OFFSET (a1.dr), DR_OFFSET (b1.dr))) != 0) 3020 return comp_res; 3021 if ((comp_res = compare_tree (DR_INIT (a1.dr), DR_INIT (b1.dr))) != 0) 3022 return comp_res; 3023 if ((comp_res = compare_tree (DR_OFFSET (a2.dr), DR_OFFSET (b2.dr))) != 0) 3024 return comp_res; 3025 if ((comp_res = compare_tree (DR_INIT (a2.dr), DR_INIT (b2.dr))) != 0) 3026 return comp_res; 3027 3028 return 0; 3029 } 3030 3031 /* Function vect_vfa_segment_size. 3032 3033 Create an expression that computes the size of segment 3034 that will be accessed for a data reference. The functions takes into 3035 account that realignment loads may access one more vector. 3036 3037 Input: 3038 DR: The data reference. 3039 LENGTH_FACTOR: segment length to consider. 3040 3041 Return an expression whose value is the size of segment which will be 3042 accessed by DR. */ 3043 3044 static tree 3045 vect_vfa_segment_size (struct data_reference *dr, tree length_factor) 3046 { 3047 tree segment_length; 3048 3049 if (integer_zerop (DR_STEP (dr))) 3050 segment_length = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr))); 3051 else 3052 segment_length = size_binop (MULT_EXPR, 3053 fold_convert (sizetype, DR_STEP (dr)), 3054 fold_convert (sizetype, length_factor)); 3055 3056 if (vect_supportable_dr_alignment (dr, false) 3057 == dr_explicit_realign_optimized) 3058 { 3059 tree vector_size = TYPE_SIZE_UNIT 3060 (STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr)))); 3061 3062 segment_length = size_binop (PLUS_EXPR, segment_length, vector_size); 3063 } 3064 return segment_length; 3065 } 3066 3067 /* Function vect_no_alias_p. 3068 3069 Given data references A and B with equal base and offset, the alias 3070 relation can be decided at compilation time, return TRUE if they do 3071 not alias to each other; return FALSE otherwise. SEGMENT_LENGTH_A 3072 and SEGMENT_LENGTH_B are the memory lengths accessed by A and B 3073 respectively. 
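   Illustrative values: with positive steps, DR_INIT (A) == 0 and
   SEGMENT_LENGTH_A == 16 give the byte range [0, 16), while
   DR_INIT (B) == 32 and SEGMENT_LENGTH_B == 16 give [32, 48); the ranges
   are disjoint, so the function returns true and no runtime check is
   needed for this pair.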
*/ 3074 3075 static bool 3076 vect_no_alias_p (struct data_reference *a, struct data_reference *b, 3077 tree segment_length_a, tree segment_length_b) 3078 { 3079 gcc_assert (TREE_CODE (DR_INIT (a)) == INTEGER_CST 3080 && TREE_CODE (DR_INIT (b)) == INTEGER_CST); 3081 if (tree_int_cst_equal (DR_INIT (a), DR_INIT (b))) 3082 return false; 3083 3084 tree seg_a_min = DR_INIT (a); 3085 tree seg_a_max = fold_build2 (PLUS_EXPR, TREE_TYPE (seg_a_min), 3086 seg_a_min, segment_length_a); 3087 /* For negative step, we need to adjust address range by TYPE_SIZE_UNIT 3088 bytes, e.g., int a[3] -> a[1] range is [a+4, a+16) instead of 3089 [a, a+12) */ 3090 if (tree_int_cst_compare (DR_STEP (a), size_zero_node) < 0) 3091 { 3092 tree unit_size = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (a))); 3093 seg_a_min = fold_build2 (PLUS_EXPR, TREE_TYPE (seg_a_max), 3094 seg_a_max, unit_size); 3095 seg_a_max = fold_build2 (PLUS_EXPR, TREE_TYPE (DR_INIT (a)), 3096 DR_INIT (a), unit_size); 3097 } 3098 tree seg_b_min = DR_INIT (b); 3099 tree seg_b_max = fold_build2 (PLUS_EXPR, TREE_TYPE (seg_b_min), 3100 seg_b_min, segment_length_b); 3101 if (tree_int_cst_compare (DR_STEP (b), size_zero_node) < 0) 3102 { 3103 tree unit_size = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (b))); 3104 seg_b_min = fold_build2 (PLUS_EXPR, TREE_TYPE (seg_b_max), 3105 seg_b_max, unit_size); 3106 seg_b_max = fold_build2 (PLUS_EXPR, TREE_TYPE (DR_INIT (b)), 3107 DR_INIT (b), unit_size); 3108 } 3109 3110 if (tree_int_cst_le (seg_a_max, seg_b_min) 3111 || tree_int_cst_le (seg_b_max, seg_a_min)) 3112 return true; 3113 3114 return false; 3115 } 3116 3117 /* Function vect_prune_runtime_alias_test_list. 3118 3119 Prune a list of ddrs to be tested at run-time by versioning for alias. 3120 Merge several alias checks into one if possible. 3121 Return FALSE if resulting list of ddrs is longer then allowed by 3122 PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE. */ 3123 3124 bool 3125 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo) 3126 { 3127 vec<ddr_p> may_alias_ddrs = 3128 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo); 3129 vec<dr_with_seg_len_pair_t>& comp_alias_ddrs = 3130 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo); 3131 int vect_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo); 3132 tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo); 3133 3134 ddr_p ddr; 3135 unsigned int i; 3136 tree length_factor; 3137 3138 if (dump_enabled_p ()) 3139 dump_printf_loc (MSG_NOTE, vect_location, 3140 "=== vect_prune_runtime_alias_test_list ===\n"); 3141 3142 if (may_alias_ddrs.is_empty ()) 3143 return true; 3144 3145 /* Basically, for each pair of dependent data refs store_ptr_0 3146 and load_ptr_0, we create an expression: 3147 3148 ((store_ptr_0 + store_segment_length_0) <= load_ptr_0) 3149 || (load_ptr_0 + load_segment_length_0) <= store_ptr_0)) 3150 3151 for aliasing checks. However, in some cases we can decrease 3152 the number of checks by combining two checks into one. For 3153 example, suppose we have another pair of data refs store_ptr_0 3154 and load_ptr_1, and if the following condition is satisfied: 3155 3156 load_ptr_0 < load_ptr_1 && 3157 load_ptr_1 - load_ptr_0 - load_segment_length_0 < store_segment_length_0 3158 3159 (this condition means, in each iteration of vectorized loop, 3160 the accessed memory of store_ptr_0 cannot be between the memory 3161 of load_ptr_0 and load_ptr_1.) 
3162 3163 we then can use only the following expression to finish the 3164 alising checks between store_ptr_0 & load_ptr_0 and 3165 store_ptr_0 & load_ptr_1: 3166 3167 ((store_ptr_0 + store_segment_length_0) <= load_ptr_0) 3168 || (load_ptr_1 + load_segment_length_1 <= store_ptr_0)) 3169 3170 Note that we only consider that load_ptr_0 and load_ptr_1 have the 3171 same basic address. */ 3172 3173 comp_alias_ddrs.create (may_alias_ddrs.length ()); 3174 3175 /* First, we collect all data ref pairs for aliasing checks. */ 3176 FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr) 3177 { 3178 int comp_res; 3179 struct data_reference *dr_a, *dr_b; 3180 gimple *dr_group_first_a, *dr_group_first_b; 3181 tree segment_length_a, segment_length_b; 3182 gimple *stmt_a, *stmt_b; 3183 3184 dr_a = DDR_A (ddr); 3185 stmt_a = DR_STMT (DDR_A (ddr)); 3186 dr_group_first_a = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_a)); 3187 if (dr_group_first_a) 3188 { 3189 stmt_a = dr_group_first_a; 3190 dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_a)); 3191 } 3192 3193 dr_b = DDR_B (ddr); 3194 stmt_b = DR_STMT (DDR_B (ddr)); 3195 dr_group_first_b = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_b)); 3196 if (dr_group_first_b) 3197 { 3198 stmt_b = dr_group_first_b; 3199 dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_b)); 3200 } 3201 3202 if (!operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0)) 3203 length_factor = scalar_loop_iters; 3204 else 3205 length_factor = size_int (vect_factor); 3206 segment_length_a = vect_vfa_segment_size (dr_a, length_factor); 3207 segment_length_b = vect_vfa_segment_size (dr_b, length_factor); 3208 3209 comp_res = compare_tree (DR_BASE_ADDRESS (dr_a), DR_BASE_ADDRESS (dr_b)); 3210 if (comp_res == 0) 3211 comp_res = compare_tree (DR_OFFSET (dr_a), DR_OFFSET (dr_b)); 3212 3213 /* Alias is known at compilation time. */ 3214 if (comp_res == 0 3215 && TREE_CODE (DR_STEP (dr_a)) == INTEGER_CST 3216 && TREE_CODE (DR_STEP (dr_b)) == INTEGER_CST 3217 && TREE_CODE (segment_length_a) == INTEGER_CST 3218 && TREE_CODE (segment_length_b) == INTEGER_CST) 3219 { 3220 if (vect_no_alias_p (dr_a, dr_b, segment_length_a, segment_length_b)) 3221 continue; 3222 3223 if (dump_enabled_p ()) 3224 dump_printf_loc (MSG_NOTE, vect_location, 3225 "not vectorized: compilation time alias.\n"); 3226 3227 return false; 3228 } 3229 3230 dr_with_seg_len_pair_t dr_with_seg_len_pair 3231 (dr_with_seg_len (dr_a, segment_length_a), 3232 dr_with_seg_len (dr_b, segment_length_b)); 3233 3234 /* Canonicalize pairs by sorting the two DR members. */ 3235 if (comp_res > 0) 3236 std::swap (dr_with_seg_len_pair.first, dr_with_seg_len_pair.second); 3237 3238 comp_alias_ddrs.safe_push (dr_with_seg_len_pair); 3239 } 3240 3241 /* Second, we sort the collected data ref pairs so that we can scan 3242 them once to combine all possible aliasing checks. */ 3243 comp_alias_ddrs.qsort (comp_dr_with_seg_len_pair); 3244 3245 /* Third, we scan the sorted dr pairs and check if we can combine 3246 alias checks of two neighboring dr pairs. */ 3247 for (size_t i = 1; i < comp_alias_ddrs.length (); ++i) 3248 { 3249 /* Deal with two ddrs (dr_a1, dr_b1) and (dr_a2, dr_b2). */ 3250 dr_with_seg_len *dr_a1 = &comp_alias_ddrs[i-1].first, 3251 *dr_b1 = &comp_alias_ddrs[i-1].second, 3252 *dr_a2 = &comp_alias_ddrs[i].first, 3253 *dr_b2 = &comp_alias_ddrs[i].second; 3254 3255 /* Remove duplicate data ref pairs. 
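	 Such duplicates arise, for instance, when two loads from the same
	 interleaving group each conflict with one store: both pairs were
	 replaced by the group's first element above, so they now compare
	 equal.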
*/ 3256 if (*dr_a1 == *dr_a2 && *dr_b1 == *dr_b2) 3257 { 3258 if (dump_enabled_p ()) 3259 { 3260 dump_printf_loc (MSG_NOTE, vect_location, 3261 "found equal ranges "); 3262 dump_generic_expr (MSG_NOTE, TDF_SLIM, 3263 DR_REF (dr_a1->dr)); 3264 dump_printf (MSG_NOTE, ", "); 3265 dump_generic_expr (MSG_NOTE, TDF_SLIM, 3266 DR_REF (dr_b1->dr)); 3267 dump_printf (MSG_NOTE, " and "); 3268 dump_generic_expr (MSG_NOTE, TDF_SLIM, 3269 DR_REF (dr_a2->dr)); 3270 dump_printf (MSG_NOTE, ", "); 3271 dump_generic_expr (MSG_NOTE, TDF_SLIM, 3272 DR_REF (dr_b2->dr)); 3273 dump_printf (MSG_NOTE, "\n"); 3274 } 3275 3276 comp_alias_ddrs.ordered_remove (i--); 3277 continue; 3278 } 3279 3280 if (*dr_a1 == *dr_a2 || *dr_b1 == *dr_b2) 3281 { 3282 /* We consider the case that DR_B1 and DR_B2 are same memrefs, 3283 and DR_A1 and DR_A2 are two consecutive memrefs. */ 3284 if (*dr_a1 == *dr_a2) 3285 { 3286 std::swap (dr_a1, dr_b1); 3287 std::swap (dr_a2, dr_b2); 3288 } 3289 3290 if (!operand_equal_p (DR_BASE_ADDRESS (dr_a1->dr), 3291 DR_BASE_ADDRESS (dr_a2->dr), 0) 3292 || !operand_equal_p (DR_OFFSET (dr_a1->dr), 3293 DR_OFFSET (dr_a2->dr), 0) 3294 || !tree_fits_shwi_p (DR_INIT (dr_a1->dr)) 3295 || !tree_fits_shwi_p (DR_INIT (dr_a2->dr))) 3296 continue; 3297 3298 /* Make sure dr_a1 starts left of dr_a2. */ 3299 if (tree_int_cst_lt (DR_INIT (dr_a2->dr), DR_INIT (dr_a1->dr))) 3300 std::swap (*dr_a1, *dr_a2); 3301 3302 bool do_remove = false; 3303 unsigned HOST_WIDE_INT diff 3304 = (tree_to_shwi (DR_INIT (dr_a2->dr)) 3305 - tree_to_shwi (DR_INIT (dr_a1->dr))); 3306 3307 /* If the left segment does not extend beyond the start of the 3308 right segment the new segment length is that of the right 3309 plus the segment distance. */ 3310 if (tree_fits_uhwi_p (dr_a1->seg_len) 3311 && compare_tree_int (dr_a1->seg_len, diff) <= 0) 3312 { 3313 dr_a1->seg_len = size_binop (PLUS_EXPR, dr_a2->seg_len, 3314 size_int (diff)); 3315 do_remove = true; 3316 } 3317 /* Generally the new segment length is the maximum of the 3318 left segment size and the right segment size plus the distance. 3319 ??? We can also build tree MAX_EXPR here but it's not clear this 3320 is profitable. */ 3321 else if (tree_fits_uhwi_p (dr_a1->seg_len) 3322 && tree_fits_uhwi_p (dr_a2->seg_len)) 3323 { 3324 unsigned HOST_WIDE_INT seg_len_a1 = tree_to_uhwi (dr_a1->seg_len); 3325 unsigned HOST_WIDE_INT seg_len_a2 = tree_to_uhwi (dr_a2->seg_len); 3326 dr_a1->seg_len = size_int (MAX (seg_len_a1, diff + seg_len_a2)); 3327 do_remove = true; 3328 } 3329 /* Now we check if the following condition is satisfied: 3330 3331 DIFF - SEGMENT_LENGTH_A < SEGMENT_LENGTH_B 3332 3333 where DIFF = DR_A2_INIT - DR_A1_INIT. However, 3334 SEGMENT_LENGTH_A or SEGMENT_LENGTH_B may not be constant so we 3335 have to make a best estimation. We can get the minimum value 3336 of SEGMENT_LENGTH_B as a constant, represented by MIN_SEG_LEN_B, 3337 then either of the following two conditions can guarantee the 3338 one above: 3339 3340 1: DIFF <= MIN_SEG_LEN_B 3341 2: DIFF - SEGMENT_LENGTH_A < MIN_SEG_LEN_B */ 3342 else 3343 { 3344 unsigned HOST_WIDE_INT min_seg_len_b 3345 = (tree_fits_uhwi_p (dr_b1->seg_len) 3346 ? 
tree_to_uhwi (dr_b1->seg_len) 3347 : vect_factor); 3348 3349 if (diff <= min_seg_len_b 3350 || (tree_fits_uhwi_p (dr_a1->seg_len) 3351 && diff - tree_to_uhwi (dr_a1->seg_len) < min_seg_len_b)) 3352 { 3353 dr_a1->seg_len = size_binop (PLUS_EXPR, 3354 dr_a2->seg_len, size_int (diff)); 3355 do_remove = true; 3356 } 3357 } 3358 3359 if (do_remove) 3360 { 3361 if (dump_enabled_p ()) 3362 { 3363 dump_printf_loc (MSG_NOTE, vect_location, 3364 "merging ranges for "); 3365 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a1->dr)); 3366 dump_printf (MSG_NOTE, ", "); 3367 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b1->dr)); 3368 dump_printf (MSG_NOTE, " and "); 3369 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a2->dr)); 3370 dump_printf (MSG_NOTE, ", "); 3371 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b2->dr)); 3372 dump_printf (MSG_NOTE, "\n"); 3373 } 3374 comp_alias_ddrs.ordered_remove (i--); 3375 } 3376 } 3377 } 3378 3379 dump_printf_loc (MSG_NOTE, vect_location, 3380 "improved number of alias checks from %d to %d\n", 3381 may_alias_ddrs.length (), comp_alias_ddrs.length ()); 3382 if ((int) comp_alias_ddrs.length () > 3383 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS)) 3384 { 3385 if (dump_enabled_p ()) 3386 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 3387 "number of versioning for alias " 3388 "run-time tests exceeds %d " 3389 "(--param vect-max-version-for-alias-checks)\n", 3390 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS)); 3391 return false; 3392 } 3393 3394 /* All alias checks have been resolved at compilation time. */ 3395 if (!comp_alias_ddrs.length ()) 3396 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).truncate (0); 3397 3398 return true; 3399 } 3400 3401 /* Return true if a non-affine read or write in STMT is suitable for a 3402 gather load or scatter store. Describe the operation in *INFO if so. */ 3403 3404 bool 3405 vect_check_gather_scatter (gimple *stmt, loop_vec_info loop_vinfo, 3406 gather_scatter_info *info) 3407 { 3408 HOST_WIDE_INT scale = 1, pbitpos, pbitsize; 3409 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); 3410 stmt_vec_info stmt_info = vinfo_for_stmt (stmt); 3411 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info); 3412 tree offtype = NULL_TREE; 3413 tree decl, base, off; 3414 machine_mode pmode; 3415 int punsignedp, reversep, pvolatilep = 0; 3416 3417 base = DR_REF (dr); 3418 /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF, 3419 see if we can use the def stmt of the address. */ 3420 if (is_gimple_call (stmt) 3421 && gimple_call_internal_p (stmt) 3422 && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD 3423 || gimple_call_internal_fn (stmt) == IFN_MASK_STORE) 3424 && TREE_CODE (base) == MEM_REF 3425 && TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME 3426 && integer_zerop (TREE_OPERAND (base, 1)) 3427 && !expr_invariant_in_loop_p (loop, TREE_OPERAND (base, 0))) 3428 { 3429 gimple *def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (base, 0)); 3430 if (is_gimple_assign (def_stmt) 3431 && gimple_assign_rhs_code (def_stmt) == ADDR_EXPR) 3432 base = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0); 3433 } 3434 3435 /* The gather and scatter builtins need address of the form 3436 loop_invariant + vector * {1, 2, 4, 8} 3437 or 3438 loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }. 3439 Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture 3440 of loop invariants/SSA_NAMEs defined in the loop, with casts, 3441 multiplications and additions in it. 
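     A typical candidate (illustrative) is an access like a[idx[i]]: its
     address is base_of_a + (sizetype) idx_value * 4 for 4-byte elements,
     where base_of_a is loop invariant and the sign-extended index vector
     supplies the varying part.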
To get a vector, we need 3442 a single SSA_NAME that will be defined in the loop and will 3443 contain everything that is not loop invariant and that can be 3444 vectorized. The following code attempts to find such a preexistng 3445 SSA_NAME OFF and put the loop invariants into a tree BASE 3446 that can be gimplified before the loop. */ 3447 base = get_inner_reference (base, &pbitsize, &pbitpos, &off, &pmode, 3448 &punsignedp, &reversep, &pvolatilep); 3449 gcc_assert (base && (pbitpos % BITS_PER_UNIT) == 0 && !reversep); 3450 3451 if (TREE_CODE (base) == MEM_REF) 3452 { 3453 if (!integer_zerop (TREE_OPERAND (base, 1))) 3454 { 3455 if (off == NULL_TREE) 3456 { 3457 offset_int moff = mem_ref_offset (base); 3458 off = wide_int_to_tree (sizetype, moff); 3459 } 3460 else 3461 off = size_binop (PLUS_EXPR, off, 3462 fold_convert (sizetype, TREE_OPERAND (base, 1))); 3463 } 3464 base = TREE_OPERAND (base, 0); 3465 } 3466 else 3467 base = build_fold_addr_expr (base); 3468 3469 if (off == NULL_TREE) 3470 off = size_zero_node; 3471 3472 /* If base is not loop invariant, either off is 0, then we start with just 3473 the constant offset in the loop invariant BASE and continue with base 3474 as OFF, otherwise give up. 3475 We could handle that case by gimplifying the addition of base + off 3476 into some SSA_NAME and use that as off, but for now punt. */ 3477 if (!expr_invariant_in_loop_p (loop, base)) 3478 { 3479 if (!integer_zerop (off)) 3480 return false; 3481 off = base; 3482 base = size_int (pbitpos / BITS_PER_UNIT); 3483 } 3484 /* Otherwise put base + constant offset into the loop invariant BASE 3485 and continue with OFF. */ 3486 else 3487 { 3488 base = fold_convert (sizetype, base); 3489 base = size_binop (PLUS_EXPR, base, size_int (pbitpos / BITS_PER_UNIT)); 3490 } 3491 3492 /* OFF at this point may be either a SSA_NAME or some tree expression 3493 from get_inner_reference. Try to peel off loop invariants from it 3494 into BASE as long as possible. 
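     For instance (illustrative), if OFF is ((sizetype) i_5 * 4) + 32 with
     i_5 defined in the loop, the loop below moves the invariant 32 into
     BASE, records the scale 4 and is left with OFF == i_5.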
*/ 3495 STRIP_NOPS (off); 3496 while (offtype == NULL_TREE) 3497 { 3498 enum tree_code code; 3499 tree op0, op1, add = NULL_TREE; 3500 3501 if (TREE_CODE (off) == SSA_NAME) 3502 { 3503 gimple *def_stmt = SSA_NAME_DEF_STMT (off); 3504 3505 if (expr_invariant_in_loop_p (loop, off)) 3506 return false; 3507 3508 if (gimple_code (def_stmt) != GIMPLE_ASSIGN) 3509 break; 3510 3511 op0 = gimple_assign_rhs1 (def_stmt); 3512 code = gimple_assign_rhs_code (def_stmt); 3513 op1 = gimple_assign_rhs2 (def_stmt); 3514 } 3515 else 3516 { 3517 if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS) 3518 return false; 3519 code = TREE_CODE (off); 3520 extract_ops_from_tree (off, &code, &op0, &op1); 3521 } 3522 switch (code) 3523 { 3524 case POINTER_PLUS_EXPR: 3525 case PLUS_EXPR: 3526 if (expr_invariant_in_loop_p (loop, op0)) 3527 { 3528 add = op0; 3529 off = op1; 3530 do_add: 3531 add = fold_convert (sizetype, add); 3532 if (scale != 1) 3533 add = size_binop (MULT_EXPR, add, size_int (scale)); 3534 base = size_binop (PLUS_EXPR, base, add); 3535 continue; 3536 } 3537 if (expr_invariant_in_loop_p (loop, op1)) 3538 { 3539 add = op1; 3540 off = op0; 3541 goto do_add; 3542 } 3543 break; 3544 case MINUS_EXPR: 3545 if (expr_invariant_in_loop_p (loop, op1)) 3546 { 3547 add = fold_convert (sizetype, op1); 3548 add = size_binop (MINUS_EXPR, size_zero_node, add); 3549 off = op0; 3550 goto do_add; 3551 } 3552 break; 3553 case MULT_EXPR: 3554 if (scale == 1 && tree_fits_shwi_p (op1)) 3555 { 3556 scale = tree_to_shwi (op1); 3557 off = op0; 3558 continue; 3559 } 3560 break; 3561 case SSA_NAME: 3562 off = op0; 3563 continue; 3564 CASE_CONVERT: 3565 if (!POINTER_TYPE_P (TREE_TYPE (op0)) 3566 && !INTEGRAL_TYPE_P (TREE_TYPE (op0))) 3567 break; 3568 if (TYPE_PRECISION (TREE_TYPE (op0)) 3569 == TYPE_PRECISION (TREE_TYPE (off))) 3570 { 3571 off = op0; 3572 continue; 3573 } 3574 if (TYPE_PRECISION (TREE_TYPE (op0)) 3575 < TYPE_PRECISION (TREE_TYPE (off))) 3576 { 3577 off = op0; 3578 offtype = TREE_TYPE (off); 3579 STRIP_NOPS (off); 3580 continue; 3581 } 3582 break; 3583 default: 3584 break; 3585 } 3586 break; 3587 } 3588 3589 /* If at the end OFF still isn't a SSA_NAME or isn't 3590 defined in the loop, punt. */ 3591 if (TREE_CODE (off) != SSA_NAME 3592 || expr_invariant_in_loop_p (loop, off)) 3593 return false; 3594 3595 if (offtype == NULL_TREE) 3596 offtype = TREE_TYPE (off); 3597 3598 if (DR_IS_READ (dr)) 3599 decl = targetm.vectorize.builtin_gather (STMT_VINFO_VECTYPE (stmt_info), 3600 offtype, scale); 3601 else 3602 decl = targetm.vectorize.builtin_scatter (STMT_VINFO_VECTYPE (stmt_info), 3603 offtype, scale); 3604 3605 if (decl == NULL_TREE) 3606 return false; 3607 3608 info->decl = decl; 3609 info->base = base; 3610 info->offset = off; 3611 info->offset_dt = vect_unknown_def_type; 3612 info->offset_vectype = NULL_TREE; 3613 info->scale = scale; 3614 return true; 3615 } 3616 3617 /* Function vect_analyze_data_refs. 3618 3619 Find all the data references in the loop or basic block. 3620 3621 The general structure of the analysis of data refs in the vectorizer is as 3622 follows: 3623 1- vect_analyze_data_refs(loop/bb): call 3624 compute_data_dependences_for_loop/bb to find and analyze all data-refs 3625 in the loop/bb and their dependences. 3626 2- vect_analyze_dependences(): apply dependence testing using ddrs. 3627 3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok. 3628 4- vect_analyze_drs_access(): check that ref_stmt.step is ok. 
3629 3630 */ 3631 3632 bool 3633 vect_analyze_data_refs (vec_info *vinfo, int *min_vf) 3634 { 3635 struct loop *loop = NULL; 3636 unsigned int i; 3637 struct data_reference *dr; 3638 tree scalar_type; 3639 3640 if (dump_enabled_p ()) 3641 dump_printf_loc (MSG_NOTE, vect_location, 3642 "=== vect_analyze_data_refs ===\n"); 3643 3644 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo)) 3645 loop = LOOP_VINFO_LOOP (loop_vinfo); 3646 3647 /* Go through the data-refs, check that the analysis succeeded. Update 3648 pointer from stmt_vec_info struct to DR and vectype. */ 3649 3650 vec<data_reference_p> datarefs = vinfo->datarefs; 3651 FOR_EACH_VEC_ELT (datarefs, i, dr) 3652 { 3653 gimple *stmt; 3654 stmt_vec_info stmt_info; 3655 tree base, offset, init; 3656 enum { SG_NONE, GATHER, SCATTER } gatherscatter = SG_NONE; 3657 bool simd_lane_access = false; 3658 int vf; 3659 3660 again: 3661 if (!dr || !DR_REF (dr)) 3662 { 3663 if (dump_enabled_p ()) 3664 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 3665 "not vectorized: unhandled data-ref\n"); 3666 return false; 3667 } 3668 3669 stmt = DR_STMT (dr); 3670 stmt_info = vinfo_for_stmt (stmt); 3671 3672 /* Discard clobbers from the dataref vector. We will remove 3673 clobber stmts during vectorization. */ 3674 if (gimple_clobber_p (stmt)) 3675 { 3676 free_data_ref (dr); 3677 if (i == datarefs.length () - 1) 3678 { 3679 datarefs.pop (); 3680 break; 3681 } 3682 datarefs.ordered_remove (i); 3683 dr = datarefs[i]; 3684 goto again; 3685 } 3686 3687 /* Check that analysis of the data-ref succeeded. */ 3688 if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr) 3689 || !DR_STEP (dr)) 3690 { 3691 bool maybe_gather 3692 = DR_IS_READ (dr) 3693 && !TREE_THIS_VOLATILE (DR_REF (dr)) 3694 && targetm.vectorize.builtin_gather != NULL; 3695 bool maybe_scatter 3696 = DR_IS_WRITE (dr) 3697 && !TREE_THIS_VOLATILE (DR_REF (dr)) 3698 && targetm.vectorize.builtin_scatter != NULL; 3699 bool maybe_simd_lane_access 3700 = is_a <loop_vec_info> (vinfo) && loop->simduid; 3701 3702 /* If target supports vector gather loads or scatter stores, or if 3703 this might be a SIMD lane access, see if they can't be used. */ 3704 if (is_a <loop_vec_info> (vinfo) 3705 && (maybe_gather || maybe_scatter || maybe_simd_lane_access) 3706 && !nested_in_vect_loop_p (loop, stmt)) 3707 { 3708 struct data_reference *newdr 3709 = create_data_ref (NULL, loop_containing_stmt (stmt), 3710 DR_REF (dr), stmt, maybe_scatter ? 
false : true); 3711 gcc_assert (newdr != NULL && DR_REF (newdr)); 3712 if (DR_BASE_ADDRESS (newdr) 3713 && DR_OFFSET (newdr) 3714 && DR_INIT (newdr) 3715 && DR_STEP (newdr) 3716 && integer_zerop (DR_STEP (newdr))) 3717 { 3718 if (maybe_simd_lane_access) 3719 { 3720 tree off = DR_OFFSET (newdr); 3721 STRIP_NOPS (off); 3722 if (TREE_CODE (DR_INIT (newdr)) == INTEGER_CST 3723 && TREE_CODE (off) == MULT_EXPR 3724 && tree_fits_uhwi_p (TREE_OPERAND (off, 1))) 3725 { 3726 tree step = TREE_OPERAND (off, 1); 3727 off = TREE_OPERAND (off, 0); 3728 STRIP_NOPS (off); 3729 if (CONVERT_EXPR_P (off) 3730 && TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off, 3731 0))) 3732 < TYPE_PRECISION (TREE_TYPE (off))) 3733 off = TREE_OPERAND (off, 0); 3734 if (TREE_CODE (off) == SSA_NAME) 3735 { 3736 gimple *def = SSA_NAME_DEF_STMT (off); 3737 tree reft = TREE_TYPE (DR_REF (newdr)); 3738 if (is_gimple_call (def) 3739 && gimple_call_internal_p (def) 3740 && (gimple_call_internal_fn (def) 3741 == IFN_GOMP_SIMD_LANE)) 3742 { 3743 tree arg = gimple_call_arg (def, 0); 3744 gcc_assert (TREE_CODE (arg) == SSA_NAME); 3745 arg = SSA_NAME_VAR (arg); 3746 if (arg == loop->simduid 3747 /* For now. */ 3748 && tree_int_cst_equal 3749 (TYPE_SIZE_UNIT (reft), 3750 step)) 3751 { 3752 DR_OFFSET (newdr) = ssize_int (0); 3753 DR_STEP (newdr) = step; 3754 DR_ALIGNED_TO (newdr) 3755 = size_int (BIGGEST_ALIGNMENT); 3756 dr = newdr; 3757 simd_lane_access = true; 3758 } 3759 } 3760 } 3761 } 3762 } 3763 if (!simd_lane_access && (maybe_gather || maybe_scatter)) 3764 { 3765 dr = newdr; 3766 if (maybe_gather) 3767 gatherscatter = GATHER; 3768 else 3769 gatherscatter = SCATTER; 3770 } 3771 } 3772 if (gatherscatter == SG_NONE && !simd_lane_access) 3773 free_data_ref (newdr); 3774 } 3775 3776 if (gatherscatter == SG_NONE && !simd_lane_access) 3777 { 3778 if (dump_enabled_p ()) 3779 { 3780 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 3781 "not vectorized: data ref analysis " 3782 "failed "); 3783 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0); 3784 } 3785 3786 if (is_a <bb_vec_info> (vinfo)) 3787 break; 3788 3789 return false; 3790 } 3791 } 3792 3793 if (TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST) 3794 { 3795 if (dump_enabled_p ()) 3796 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 3797 "not vectorized: base addr of dr is a " 3798 "constant\n"); 3799 3800 if (is_a <bb_vec_info> (vinfo)) 3801 break; 3802 3803 if (gatherscatter != SG_NONE || simd_lane_access) 3804 free_data_ref (dr); 3805 return false; 3806 } 3807 3808 if (TREE_THIS_VOLATILE (DR_REF (dr))) 3809 { 3810 if (dump_enabled_p ()) 3811 { 3812 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 3813 "not vectorized: volatile type "); 3814 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0); 3815 } 3816 3817 if (is_a <bb_vec_info> (vinfo)) 3818 break; 3819 3820 return false; 3821 } 3822 3823 if (stmt_can_throw_internal (stmt)) 3824 { 3825 if (dump_enabled_p ()) 3826 { 3827 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 3828 "not vectorized: statement can throw an " 3829 "exception "); 3830 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0); 3831 } 3832 3833 if (is_a <bb_vec_info> (vinfo)) 3834 break; 3835 3836 if (gatherscatter != SG_NONE || simd_lane_access) 3837 free_data_ref (dr); 3838 return false; 3839 } 3840 3841 if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF 3842 && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1))) 3843 { 3844 if (dump_enabled_p ()) 3845 { 3846 dump_printf_loc (MSG_MISSED_OPTIMIZATION, 
vect_location, 3847 "not vectorized: statement is bitfield " 3848 "access "); 3849 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0); 3850 } 3851 3852 if (is_a <bb_vec_info> (vinfo)) 3853 break; 3854 3855 if (gatherscatter != SG_NONE || simd_lane_access) 3856 free_data_ref (dr); 3857 return false; 3858 } 3859 3860 base = unshare_expr (DR_BASE_ADDRESS (dr)); 3861 offset = unshare_expr (DR_OFFSET (dr)); 3862 init = unshare_expr (DR_INIT (dr)); 3863 3864 if (is_gimple_call (stmt) 3865 && (!gimple_call_internal_p (stmt) 3866 || (gimple_call_internal_fn (stmt) != IFN_MASK_LOAD 3867 && gimple_call_internal_fn (stmt) != IFN_MASK_STORE))) 3868 { 3869 if (dump_enabled_p ()) 3870 { 3871 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 3872 "not vectorized: dr in a call "); 3873 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0); 3874 } 3875 3876 if (is_a <bb_vec_info> (vinfo)) 3877 break; 3878 3879 if (gatherscatter != SG_NONE || simd_lane_access) 3880 free_data_ref (dr); 3881 return false; 3882 } 3883 3884 /* Update DR field in stmt_vec_info struct. */ 3885 3886 /* If the dataref is in an inner-loop of the loop that is considered 3887 for vectorization, we also want to analyze the access relative to 3888 the outer-loop (DR contains information only relative to the 3889 inner-most enclosing loop). We do that by building a reference to the 3890 first location accessed by the inner-loop, and analyzing it relative to 3891 the outer-loop. */ 3892 if (loop && nested_in_vect_loop_p (loop, stmt)) 3893 { 3894 tree outer_step, outer_base, outer_init; 3895 HOST_WIDE_INT pbitsize, pbitpos; 3896 tree poffset; 3897 machine_mode pmode; 3898 int punsignedp, preversep, pvolatilep; 3899 affine_iv base_iv, offset_iv; 3900 tree dinit; 3901 3902 /* Build a reference to the first location accessed by the 3903 inner-loop: *(BASE+INIT). (The first location is actually 3904 BASE+INIT+OFFSET, but we add OFFSET separately later).
*/ 3905 tree inner_base = build_fold_indirect_ref 3906 (fold_build_pointer_plus (base, init)); 3907 3908 if (dump_enabled_p ()) 3909 { 3910 dump_printf_loc (MSG_NOTE, vect_location, 3911 "analyze in outer-loop: "); 3912 dump_generic_expr (MSG_NOTE, TDF_SLIM, inner_base); 3913 dump_printf (MSG_NOTE, "\n"); 3914 } 3915 3916 outer_base = get_inner_reference (inner_base, &pbitsize, &pbitpos, 3917 &poffset, &pmode, &punsignedp, 3918 &preversep, &pvolatilep); 3919 gcc_assert (outer_base != NULL_TREE); 3920 3921 if (pbitpos % BITS_PER_UNIT != 0) 3922 { 3923 if (dump_enabled_p ()) 3924 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 3925 "failed: bit offset alignment.\n"); 3926 return false; 3927 } 3928 3929 if (preversep) 3930 { 3931 if (dump_enabled_p ()) 3932 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 3933 "failed: reverse storage order.\n"); 3934 return false; 3935 } 3936 3937 outer_base = build_fold_addr_expr (outer_base); 3938 if (!simple_iv (loop, loop_containing_stmt (stmt), outer_base, 3939 &base_iv, false)) 3940 { 3941 if (dump_enabled_p ()) 3942 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 3943 "failed: evolution of base is not affine.\n"); 3944 return false; 3945 } 3946 3947 if (offset) 3948 { 3949 if (poffset) 3950 poffset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset), offset, 3951 poffset); 3952 else 3953 poffset = offset; 3954 } 3955 3956 if (!poffset) 3957 { 3958 offset_iv.base = ssize_int (0); 3959 offset_iv.step = ssize_int (0); 3960 } 3961 else if (!simple_iv (loop, loop_containing_stmt (stmt), poffset, 3962 &offset_iv, false)) 3963 { 3964 if (dump_enabled_p ()) 3965 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 3966 "evolution of offset is not affine.\n"); 3967 return false; 3968 } 3969 3970 outer_init = ssize_int (pbitpos / BITS_PER_UNIT); 3971 split_constant_offset (base_iv.base, &base_iv.base, &dinit); 3972 outer_init = size_binop (PLUS_EXPR, outer_init, dinit); 3973 split_constant_offset (offset_iv.base, &offset_iv.base, &dinit); 3974 outer_init = size_binop (PLUS_EXPR, outer_init, dinit); 3975 3976 outer_step = size_binop (PLUS_EXPR, 3977 fold_convert (ssizetype, base_iv.step), 3978 fold_convert (ssizetype, offset_iv.step)); 3979 3980 STMT_VINFO_DR_STEP (stmt_info) = outer_step; 3981 /* FIXME: Use canonicalize_base_object_address (base_iv.base); */ 3982 STMT_VINFO_DR_BASE_ADDRESS (stmt_info) = base_iv.base; 3983 STMT_VINFO_DR_INIT (stmt_info) = outer_init; 3984 STMT_VINFO_DR_OFFSET (stmt_info) = 3985 fold_convert (ssizetype, offset_iv.base); 3986 STMT_VINFO_DR_ALIGNED_TO (stmt_info) = 3987 size_int (highest_pow2_factor (offset_iv.base)); 3988 3989 if (dump_enabled_p ()) 3990 { 3991 dump_printf_loc (MSG_NOTE, vect_location, 3992 "\touter base_address: "); 3993 dump_generic_expr (MSG_NOTE, TDF_SLIM, 3994 STMT_VINFO_DR_BASE_ADDRESS (stmt_info)); 3995 dump_printf (MSG_NOTE, "\n\touter offset from base address: "); 3996 dump_generic_expr (MSG_NOTE, TDF_SLIM, 3997 STMT_VINFO_DR_OFFSET (stmt_info)); 3998 dump_printf (MSG_NOTE, 3999 "\n\touter constant offset from base address: "); 4000 dump_generic_expr (MSG_NOTE, TDF_SLIM, 4001 STMT_VINFO_DR_INIT (stmt_info)); 4002 dump_printf (MSG_NOTE, "\n\touter step: "); 4003 dump_generic_expr (MSG_NOTE, TDF_SLIM, 4004 STMT_VINFO_DR_STEP (stmt_info)); 4005 dump_printf (MSG_NOTE, "\n\touter aligned to: "); 4006 dump_generic_expr (MSG_NOTE, TDF_SLIM, 4007 STMT_VINFO_DR_ALIGNED_TO (stmt_info)); 4008 dump_printf (MSG_NOTE, "\n"); 4009 } 4010 } 4011 4012 if (STMT_VINFO_DATA_REF (stmt_info)) 4013 { 4014 
if (dump_enabled_p ()) 4015 { 4016 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 4017 "not vectorized: more than one data ref " 4018 "in stmt: "); 4019 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0); 4020 } 4021 4022 if (is_a <bb_vec_info> (vinfo)) 4023 break; 4024 4025 if (gatherscatter != SG_NONE || simd_lane_access) 4026 free_data_ref (dr); 4027 return false; 4028 } 4029 4030 STMT_VINFO_DATA_REF (stmt_info) = dr; 4031 if (simd_lane_access) 4032 { 4033 STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) = true; 4034 free_data_ref (datarefs[i]); 4035 datarefs[i] = dr; 4036 } 4037 4038 if (TREE_CODE (DR_BASE_ADDRESS (dr)) == ADDR_EXPR 4039 && VAR_P (TREE_OPERAND (DR_BASE_ADDRESS (dr), 0)) 4040 && DECL_NONALIASED (TREE_OPERAND (DR_BASE_ADDRESS (dr), 0))) 4041 { 4042 if (dump_enabled_p ()) 4043 { 4044 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 4045 "not vectorized: base object not addressable " 4046 "for stmt: "); 4047 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0); 4048 } 4049 if (is_a <bb_vec_info> (vinfo)) 4050 { 4051 /* In BB vectorization the ref can still participate 4052 in dependence analysis, we just can't vectorize it. */ 4053 STMT_VINFO_VECTORIZABLE (stmt_info) = false; 4054 continue; 4055 } 4056 return false; 4057 } 4058 4059 /* Set vectype for STMT. */ 4060 scalar_type = TREE_TYPE (DR_REF (dr)); 4061 STMT_VINFO_VECTYPE (stmt_info) 4062 = get_vectype_for_scalar_type (scalar_type); 4063 if (!STMT_VINFO_VECTYPE (stmt_info)) 4064 { 4065 if (dump_enabled_p ()) 4066 { 4067 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 4068 "not vectorized: no vectype for stmt: "); 4069 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0); 4070 dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: "); 4071 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS, 4072 scalar_type); 4073 dump_printf (MSG_MISSED_OPTIMIZATION, "\n"); 4074 } 4075 4076 if (is_a <bb_vec_info> (vinfo)) 4077 { 4078 /* No vector type is fine, the ref can still participate 4079 in dependence analysis, we just can't vectorize it. */ 4080 STMT_VINFO_VECTORIZABLE (stmt_info) = false; 4081 continue; 4082 } 4083 4084 if (gatherscatter != SG_NONE || simd_lane_access) 4085 { 4086 STMT_VINFO_DATA_REF (stmt_info) = NULL; 4087 if (gatherscatter != SG_NONE) 4088 free_data_ref (dr); 4089 } 4090 return false; 4091 } 4092 else 4093 { 4094 if (dump_enabled_p ()) 4095 { 4096 dump_printf_loc (MSG_NOTE, vect_location, 4097 "got vectype for stmt: "); 4098 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0); 4099 dump_generic_expr (MSG_NOTE, TDF_SLIM, 4100 STMT_VINFO_VECTYPE (stmt_info)); 4101 dump_printf (MSG_NOTE, "\n"); 4102 } 4103 } 4104 4105 /* Adjust the minimal vectorization factor according to the 4106 vector type. */ 4107 vf = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)); 4108 if (vf > *min_vf) 4109 *min_vf = vf; 4110 4111 if (gatherscatter != SG_NONE) 4112 { 4113 gather_scatter_info gs_info; 4114 if (!vect_check_gather_scatter (stmt, as_a <loop_vec_info> (vinfo), 4115 &gs_info) 4116 || !get_vectype_for_scalar_type (TREE_TYPE (gs_info.offset))) 4117 { 4118 STMT_VINFO_DATA_REF (stmt_info) = NULL; 4119 free_data_ref (dr); 4120 if (dump_enabled_p ()) 4121 { 4122 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 4123 (gatherscatter == GATHER) ? 
4124 "not vectorized: not suitable for gather " 4125 "load " : 4126 "not vectorized: not suitable for scatter " 4127 "store "); 4128 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0); 4129 } 4130 return false; 4131 } 4132 4133 free_data_ref (datarefs[i]); 4134 datarefs[i] = dr; 4135 STMT_VINFO_GATHER_SCATTER_P (stmt_info) = gatherscatter; 4136 } 4137 4138 else if (is_a <loop_vec_info> (vinfo) 4139 && TREE_CODE (DR_STEP (dr)) != INTEGER_CST) 4140 { 4141 if (nested_in_vect_loop_p (loop, stmt)) 4142 { 4143 if (dump_enabled_p ()) 4144 { 4145 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 4146 "not vectorized: not suitable for strided " 4147 "load "); 4148 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0); 4149 } 4150 return false; 4151 } 4152 STMT_VINFO_STRIDED_P (stmt_info) = true; 4153 } 4154 } 4155 4156 /* If we stopped analysis at the first dataref we could not analyze 4157 when trying to vectorize a basic-block mark the rest of the datarefs 4158 as not vectorizable and truncate the vector of datarefs. That 4159 avoids spending useless time in analyzing their dependence. */ 4160 if (i != datarefs.length ()) 4161 { 4162 gcc_assert (is_a <bb_vec_info> (vinfo)); 4163 for (unsigned j = i; j < datarefs.length (); ++j) 4164 { 4165 data_reference_p dr = datarefs[j]; 4166 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false; 4167 free_data_ref (dr); 4168 } 4169 datarefs.truncate (i); 4170 } 4171 4172 return true; 4173 } 4174 4175 4176 /* Function vect_get_new_vect_var. 4177 4178 Returns a name for a new variable. The current naming scheme appends the 4179 prefix "vect_" or "vect_p" (depending on the value of VAR_KIND) to 4180 the name of vectorizer generated variables, and appends that to NAME if 4181 provided. */ 4182 4183 tree 4184 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name) 4185 { 4186 const char *prefix; 4187 tree new_vect_var; 4188 4189 switch (var_kind) 4190 { 4191 case vect_simple_var: 4192 prefix = "vect"; 4193 break; 4194 case vect_scalar_var: 4195 prefix = "stmp"; 4196 break; 4197 case vect_mask_var: 4198 prefix = "mask"; 4199 break; 4200 case vect_pointer_var: 4201 prefix = "vectp"; 4202 break; 4203 default: 4204 gcc_unreachable (); 4205 } 4206 4207 if (name) 4208 { 4209 char* tmp = concat (prefix, "_", name, NULL); 4210 new_vect_var = create_tmp_reg (type, tmp); 4211 free (tmp); 4212 } 4213 else 4214 new_vect_var = create_tmp_reg (type, prefix); 4215 4216 return new_vect_var; 4217 } 4218 4219 /* Like vect_get_new_vect_var but return an SSA name. */ 4220 4221 tree 4222 vect_get_new_ssa_name (tree type, enum vect_var_kind var_kind, const char *name) 4223 { 4224 const char *prefix; 4225 tree new_vect_var; 4226 4227 switch (var_kind) 4228 { 4229 case vect_simple_var: 4230 prefix = "vect"; 4231 break; 4232 case vect_scalar_var: 4233 prefix = "stmp"; 4234 break; 4235 case vect_pointer_var: 4236 prefix = "vectp"; 4237 break; 4238 default: 4239 gcc_unreachable (); 4240 } 4241 4242 if (name) 4243 { 4244 char* tmp = concat (prefix, "_", name, NULL); 4245 new_vect_var = make_temp_ssa_name (type, NULL, tmp); 4246 free (tmp); 4247 } 4248 else 4249 new_vect_var = make_temp_ssa_name (type, NULL, prefix); 4250 4251 return new_vect_var; 4252 } 4253 4254 /* Duplicate ptr info and set alignment/misaligment on NAME from DR. 
*/ 4255 4256 static void 4257 vect_duplicate_ssa_name_ptr_info (tree name, data_reference *dr, 4258 stmt_vec_info stmt_info) 4259 { 4260 duplicate_ssa_name_ptr_info (name, DR_PTR_INFO (dr)); 4261 unsigned int align = TYPE_ALIGN_UNIT (STMT_VINFO_VECTYPE (stmt_info)); 4262 int misalign = DR_MISALIGNMENT (dr); 4263 if (misalign == -1) 4264 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name)); 4265 else 4266 set_ptr_info_alignment (SSA_NAME_PTR_INFO (name), align, misalign); 4267 } 4268 4269 /* Function vect_create_addr_base_for_vector_ref. 4270 4271 Create an expression that computes the address of the first memory location 4272 that will be accessed for a data reference. 4273 4274 Input: 4275 STMT: The statement containing the data reference. 4276 NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list. 4277 OFFSET: Optional. If supplied, it is added to the initial address. 4278 LOOP: Specify relative to which loop-nest the address should be computed. 4279 For example, when the dataref is in an inner-loop nested in an 4280 outer-loop that is now being vectorized, LOOP can be either the 4281 outer-loop, or the inner-loop. The first memory location accessed 4282 by the following dataref ('in' points to short): 4283 4284 for (i=0; i<N; i++) 4285 for (j=0; j<M; j++) 4286 s += in[i+j] 4287 4288 is as follows: 4289 if LOOP=i_loop: &in (relative to i_loop) 4290 if LOOP=j_loop: &in+i*2B (relative to j_loop) 4291 BYTE_OFFSET: Optional, defaulted to NULL. If supplied, it is added to the 4292 initial address. Unlike OFFSET, which is the number of elements to 4293 be added, BYTE_OFFSET is measured in bytes. 4294 4295 Output: 4296 1. Return an SSA_NAME whose value is the address of the memory location of 4297 the first vector of the data reference. 4298 2. If new_stmt_list is not NULL_TREE after return then the caller must insert 4299 these statement(s) which define the returned SSA_NAME. 4300 4301 FORNOW: We are only handling array accesses with step 1.
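Ignoring gimplification details, the address built below is roughly

   DR_BASE_ADDRESS (dr) + DR_OFFSET (dr) + DR_INIT (dr)
     + OFFSET * sizeof (element) + BYTE_OFFSET

in the loop-vectorization case (the STMT_VINFO_DR_* variants of these
fields are used instead when the dataref sits in an inner-loop relative
to the vectorized loop, and for basic-block vectorization the address of
DR_REF itself is taken); the per-iteration advance by DR_STEP is handled
separately, in vect_create_data_ref_ptr.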
*/ 4302 4303 tree 4304 vect_create_addr_base_for_vector_ref (gimple *stmt, 4305 gimple_seq *new_stmt_list, 4306 tree offset, 4307 struct loop *loop, 4308 tree byte_offset) 4309 { 4310 stmt_vec_info stmt_info = vinfo_for_stmt (stmt); 4311 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info); 4312 tree data_ref_base; 4313 const char *base_name; 4314 tree addr_base; 4315 tree dest; 4316 gimple_seq seq = NULL; 4317 tree base_offset; 4318 tree init; 4319 tree vect_ptr_type; 4320 tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr))); 4321 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); 4322 4323 if (loop_vinfo && loop && loop != (gimple_bb (stmt))->loop_father) 4324 { 4325 struct loop *outer_loop = LOOP_VINFO_LOOP (loop_vinfo); 4326 4327 gcc_assert (nested_in_vect_loop_p (outer_loop, stmt)); 4328 4329 data_ref_base = unshare_expr (STMT_VINFO_DR_BASE_ADDRESS (stmt_info)); 4330 base_offset = unshare_expr (STMT_VINFO_DR_OFFSET (stmt_info)); 4331 init = unshare_expr (STMT_VINFO_DR_INIT (stmt_info)); 4332 } 4333 else 4334 { 4335 data_ref_base = unshare_expr (DR_BASE_ADDRESS (dr)); 4336 base_offset = unshare_expr (DR_OFFSET (dr)); 4337 init = unshare_expr (DR_INIT (dr)); 4338 } 4339 4340 if (loop_vinfo) 4341 base_name = get_name (data_ref_base); 4342 else 4343 { 4344 base_offset = ssize_int (0); 4345 init = ssize_int (0); 4346 base_name = get_name (DR_REF (dr)); 4347 } 4348 4349 /* Create base_offset */ 4350 base_offset = size_binop (PLUS_EXPR, 4351 fold_convert (sizetype, base_offset), 4352 fold_convert (sizetype, init)); 4353 4354 if (offset) 4355 { 4356 offset = fold_build2 (MULT_EXPR, sizetype, 4357 fold_convert (sizetype, offset), step); 4358 base_offset = fold_build2 (PLUS_EXPR, sizetype, 4359 base_offset, offset); 4360 } 4361 if (byte_offset) 4362 { 4363 byte_offset = fold_convert (sizetype, byte_offset); 4364 base_offset = fold_build2 (PLUS_EXPR, sizetype, 4365 base_offset, byte_offset); 4366 } 4367 4368 /* base + base_offset */ 4369 if (loop_vinfo) 4370 addr_base = fold_build_pointer_plus (data_ref_base, base_offset); 4371 else 4372 { 4373 addr_base = build1 (ADDR_EXPR, 4374 build_pointer_type (TREE_TYPE (DR_REF (dr))), 4375 unshare_expr (DR_REF (dr))); 4376 } 4377 4378 vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info)); 4379 dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name); 4380 addr_base = force_gimple_operand (addr_base, &seq, true, dest); 4381 gimple_seq_add_seq (new_stmt_list, seq); 4382 4383 if (DR_PTR_INFO (dr) 4384 && TREE_CODE (addr_base) == SSA_NAME 4385 && !SSA_NAME_PTR_INFO (addr_base)) 4386 { 4387 vect_duplicate_ssa_name_ptr_info (addr_base, dr, stmt_info); 4388 if (offset || byte_offset) 4389 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr_base)); 4390 } 4391 4392 if (dump_enabled_p ()) 4393 { 4394 dump_printf_loc (MSG_NOTE, vect_location, "created "); 4395 dump_generic_expr (MSG_NOTE, TDF_SLIM, addr_base); 4396 dump_printf (MSG_NOTE, "\n"); 4397 } 4398 4399 return addr_base; 4400 } 4401 4402 4403 /* Function vect_create_data_ref_ptr. 4404 4405 Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first 4406 location accessed in the loop by STMT, along with the def-use update 4407 chain to appropriately advance the pointer through the loop iterations. 4408 Also set aliasing information for the pointer. This pointer is used by 4409 the callers to this function to create a memory reference expression for 4410 vector load/store access. 4411 4412 Input: 4413 1. STMT: a stmt that references memory. 
Expected to be of the form 4414 GIMPLE_ASSIGN <name, data-ref> or 4415 GIMPLE_ASSIGN <data-ref, name>. 4416 2. AGGR_TYPE: the type of the reference, which should be either a vector 4417 or an array. 4418 3. AT_LOOP: the loop where the vector memref is to be created. 4419 4. OFFSET (optional): an offset to be added to the initial address accessed 4420 by the data-ref in STMT. 4421 5. BSI: location where the new stmts are to be placed if there is no loop 4422 6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain 4423 pointing to the initial address. 4424 7. BYTE_OFFSET (optional, defaults to NULL): a byte offset to be added 4425 to the initial address accessed by the data-ref in STMT. This is 4426 similar to OFFSET, but OFFSET is counted in elements, while BYTE_OFFSET 4427 in bytes. 4428 4429 Output: 4430 1. Declare a new ptr to vector_type, and have it point to the base of the 4431 data reference (initial addressed accessed by the data reference). 4432 For example, for vector of type V8HI, the following code is generated: 4433 4434 v8hi *ap; 4435 ap = (v8hi *)initial_address; 4436 4437 if OFFSET is not supplied: 4438 initial_address = &a[init]; 4439 if OFFSET is supplied: 4440 initial_address = &a[init + OFFSET]; 4441 if BYTE_OFFSET is supplied: 4442 initial_address = &a[init] + BYTE_OFFSET; 4443 4444 Return the initial_address in INITIAL_ADDRESS. 4445 4446 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also 4447 update the pointer in each iteration of the loop. 4448 4449 Return the increment stmt that updates the pointer in PTR_INCR. 4450 4451 3. Set INV_P to true if the access pattern of the data reference in the 4452 vectorized loop is invariant. Set it to false otherwise. 4453 4454 4. Return the pointer. */ 4455 4456 tree 4457 vect_create_data_ref_ptr (gimple *stmt, tree aggr_type, struct loop *at_loop, 4458 tree offset, tree *initial_address, 4459 gimple_stmt_iterator *gsi, gimple **ptr_incr, 4460 bool only_init, bool *inv_p, tree byte_offset) 4461 { 4462 const char *base_name; 4463 stmt_vec_info stmt_info = vinfo_for_stmt (stmt); 4464 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); 4465 struct loop *loop = NULL; 4466 bool nested_in_vect_loop = false; 4467 struct loop *containing_loop = NULL; 4468 tree aggr_ptr_type; 4469 tree aggr_ptr; 4470 tree new_temp; 4471 gimple_seq new_stmt_list = NULL; 4472 edge pe = NULL; 4473 basic_block new_bb; 4474 tree aggr_ptr_init; 4475 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info); 4476 tree aptr; 4477 gimple_stmt_iterator incr_gsi; 4478 bool insert_after; 4479 tree indx_before_incr, indx_after_incr; 4480 gimple *incr; 4481 tree step; 4482 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info); 4483 4484 gcc_assert (TREE_CODE (aggr_type) == ARRAY_TYPE 4485 || TREE_CODE (aggr_type) == VECTOR_TYPE); 4486 4487 if (loop_vinfo) 4488 { 4489 loop = LOOP_VINFO_LOOP (loop_vinfo); 4490 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt); 4491 containing_loop = (gimple_bb (stmt))->loop_father; 4492 pe = loop_preheader_edge (loop); 4493 } 4494 else 4495 { 4496 gcc_assert (bb_vinfo); 4497 only_init = true; 4498 *ptr_incr = NULL; 4499 } 4500 4501 /* Check the step (evolution) of the load in LOOP, and record 4502 whether it's invariant. 
*/ 4503 if (nested_in_vect_loop) 4504 step = STMT_VINFO_DR_STEP (stmt_info); 4505 else 4506 step = DR_STEP (STMT_VINFO_DATA_REF (stmt_info)); 4507 4508 if (integer_zerop (step)) 4509 *inv_p = true; 4510 else 4511 *inv_p = false; 4512 4513 /* Create an expression for the first address accessed by this load 4514 in LOOP. */ 4515 base_name = get_name (DR_BASE_ADDRESS (dr)); 4516 4517 if (dump_enabled_p ()) 4518 { 4519 tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr)); 4520 dump_printf_loc (MSG_NOTE, vect_location, 4521 "create %s-pointer variable to type: ", 4522 get_tree_code_name (TREE_CODE (aggr_type))); 4523 dump_generic_expr (MSG_NOTE, TDF_SLIM, aggr_type); 4524 if (TREE_CODE (dr_base_type) == ARRAY_TYPE) 4525 dump_printf (MSG_NOTE, " vectorizing an array ref: "); 4526 else if (TREE_CODE (dr_base_type) == VECTOR_TYPE) 4527 dump_printf (MSG_NOTE, " vectorizing a vector ref: "); 4528 else if (TREE_CODE (dr_base_type) == RECORD_TYPE) 4529 dump_printf (MSG_NOTE, " vectorizing a record based array ref: "); 4530 else 4531 dump_printf (MSG_NOTE, " vectorizing a pointer ref: "); 4532 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_BASE_OBJECT (dr)); 4533 dump_printf (MSG_NOTE, "\n"); 4534 } 4535 4536 /* (1) Create the new aggregate-pointer variable. 4537 Vector and array types inherit the alias set of their component 4538 type by default so we need to use a ref-all pointer if the data 4539 reference does not conflict with the created aggregated data 4540 reference because it is not addressable. */ 4541 bool need_ref_all = false; 4542 if (!alias_sets_conflict_p (get_alias_set (aggr_type), 4543 get_alias_set (DR_REF (dr)))) 4544 need_ref_all = true; 4545 /* Likewise for any of the data references in the stmt group. */ 4546 else if (STMT_VINFO_GROUP_SIZE (stmt_info) > 1) 4547 { 4548 gimple *orig_stmt = STMT_VINFO_GROUP_FIRST_ELEMENT (stmt_info); 4549 do 4550 { 4551 stmt_vec_info sinfo = vinfo_for_stmt (orig_stmt); 4552 struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo); 4553 if (!alias_sets_conflict_p (get_alias_set (aggr_type), 4554 get_alias_set (DR_REF (sdr)))) 4555 { 4556 need_ref_all = true; 4557 break; 4558 } 4559 orig_stmt = STMT_VINFO_GROUP_NEXT_ELEMENT (sinfo); 4560 } 4561 while (orig_stmt); 4562 } 4563 aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode, 4564 need_ref_all); 4565 aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name); 4566 4567 4568 /* Note: If the dataref is in an inner-loop nested in LOOP, and we are 4569 vectorizing LOOP (i.e., outer-loop vectorization), we need to create two 4570 def-use update cycles for the pointer: one relative to the outer-loop 4571 (LOOP), which is what steps (3) and (4) below do. The other is relative 4572 to the inner-loop (which is the inner-most loop containing the dataref), 4573 and this is done be step (5) below. 4574 4575 When vectorizing inner-most loops, the vectorized loop (LOOP) is also the 4576 inner-most loop, and so steps (3),(4) work the same, and step (5) is 4577 redundant. Steps (3),(4) create the following: 4578 4579 vp0 = &base_addr; 4580 LOOP: vp1 = phi(vp0,vp2) 4581 ... 4582 ... 4583 vp2 = vp1 + step 4584 goto LOOP 4585 4586 If there is an inner-loop nested in loop, then step (5) will also be 4587 applied, and an additional update in the inner-loop will be created: 4588 4589 vp0 = &base_addr; 4590 LOOP: vp1 = phi(vp0,vp2) 4591 ... 4592 inner: vp3 = phi(vp1,vp4) 4593 vp4 = vp3 + inner_step 4594 if () goto inner 4595 ... 
4596 vp2 = vp1 + step 4597 if () goto LOOP */ 4598 4599 /* (2) Calculate the initial address of the aggregate-pointer, and set 4600 the aggregate-pointer to point to it before the loop. */ 4601 4602 /* Create: (&(base[init_val+offset]+byte_offset) in the loop preheader. */ 4603 4604 new_temp = vect_create_addr_base_for_vector_ref (stmt, &new_stmt_list, 4605 offset, loop, byte_offset); 4606 if (new_stmt_list) 4607 { 4608 if (pe) 4609 { 4610 new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list); 4611 gcc_assert (!new_bb); 4612 } 4613 else 4614 gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT); 4615 } 4616 4617 *initial_address = new_temp; 4618 aggr_ptr_init = new_temp; 4619 4620 /* (3) Handle the updating of the aggregate-pointer inside the loop. 4621 This is needed when ONLY_INIT is false, and also when AT_LOOP is the 4622 inner-loop nested in LOOP (during outer-loop vectorization). */ 4623 4624 /* No update in loop is required. */ 4625 if (only_init && (!loop_vinfo || at_loop == loop)) 4626 aptr = aggr_ptr_init; 4627 else 4628 { 4629 /* The step of the aggregate pointer is the type size. */ 4630 tree iv_step = TYPE_SIZE_UNIT (aggr_type); 4631 /* One exception to the above is when the scalar step of the load in 4632 LOOP is zero. In this case the step here is also zero. */ 4633 if (*inv_p) 4634 iv_step = size_zero_node; 4635 else if (tree_int_cst_sgn (step) == -1) 4636 iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step); 4637 4638 standard_iv_increment_position (loop, &incr_gsi, &insert_after); 4639 4640 create_iv (aggr_ptr_init, 4641 fold_convert (aggr_ptr_type, iv_step), 4642 aggr_ptr, loop, &incr_gsi, insert_after, 4643 &indx_before_incr, &indx_after_incr); 4644 incr = gsi_stmt (incr_gsi); 4645 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo)); 4646 4647 /* Copy the points-to information if it exists. */ 4648 if (DR_PTR_INFO (dr)) 4649 { 4650 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr, stmt_info); 4651 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr, stmt_info); 4652 } 4653 if (ptr_incr) 4654 *ptr_incr = incr; 4655 4656 aptr = indx_before_incr; 4657 } 4658 4659 if (!nested_in_vect_loop || only_init) 4660 return aptr; 4661 4662 4663 /* (4) Handle the updating of the aggregate-pointer inside the inner-loop 4664 nested in LOOP, if exists. */ 4665 4666 gcc_assert (nested_in_vect_loop); 4667 if (!only_init) 4668 { 4669 standard_iv_increment_position (containing_loop, &incr_gsi, 4670 &insert_after); 4671 create_iv (aptr, fold_convert (aggr_ptr_type, DR_STEP (dr)), aggr_ptr, 4672 containing_loop, &incr_gsi, insert_after, &indx_before_incr, 4673 &indx_after_incr); 4674 incr = gsi_stmt (incr_gsi); 4675 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo)); 4676 4677 /* Copy the points-to information if it exists. */ 4678 if (DR_PTR_INFO (dr)) 4679 { 4680 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr, stmt_info); 4681 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr, stmt_info); 4682 } 4683 if (ptr_incr) 4684 *ptr_incr = incr; 4685 4686 return indx_before_incr; 4687 } 4688 else 4689 gcc_unreachable (); 4690 } 4691 4692 4693 /* Function bump_vector_ptr 4694 4695 Increment a pointer (to a vector type) by vector-size. If requested, 4696 i.e. 
if PTR-INCR is given, then also connect the new increment stmt 4697 to the existing def-use update-chain of the pointer, by modifying 4698 the PTR_INCR as illustrated below: 4699 4700 The pointer def-use update-chain before this function: 4701 DATAREF_PTR = phi (p_0, p_2) 4702 .... 4703 PTR_INCR: p_2 = DATAREF_PTR + step 4704 4705 The pointer def-use update-chain after this function: 4706 DATAREF_PTR = phi (p_0, p_2) 4707 .... 4708 NEW_DATAREF_PTR = DATAREF_PTR + BUMP 4709 .... 4710 PTR_INCR: p_2 = NEW_DATAREF_PTR + step 4711 4712 Input: 4713 DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated 4714 in the loop. 4715 PTR_INCR - optional. The stmt that updates the pointer in each iteration of 4716 the loop. The increment amount across iterations is expected 4717 to be vector_size. 4718 BSI - location where the new update stmt is to be placed. 4719 STMT - the original scalar memory-access stmt that is being vectorized. 4720 BUMP - optional. The offset by which to bump the pointer. If not given, 4721 the offset is assumed to be vector_size. 4722 4723 Output: Return NEW_DATAREF_PTR as illustrated above. 4724 4725 */ 4726 4727 tree 4728 bump_vector_ptr (tree dataref_ptr, gimple *ptr_incr, gimple_stmt_iterator *gsi, 4729 gimple *stmt, tree bump) 4730 { 4731 stmt_vec_info stmt_info = vinfo_for_stmt (stmt); 4732 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info); 4733 tree vectype = STMT_VINFO_VECTYPE (stmt_info); 4734 tree update = TYPE_SIZE_UNIT (vectype); 4735 gassign *incr_stmt; 4736 ssa_op_iter iter; 4737 use_operand_p use_p; 4738 tree new_dataref_ptr; 4739 4740 if (bump) 4741 update = bump; 4742 4743 if (TREE_CODE (dataref_ptr) == SSA_NAME) 4744 new_dataref_ptr = copy_ssa_name (dataref_ptr); 4745 else 4746 new_dataref_ptr = make_ssa_name (TREE_TYPE (dataref_ptr)); 4747 incr_stmt = gimple_build_assign (new_dataref_ptr, POINTER_PLUS_EXPR, 4748 dataref_ptr, update); 4749 vect_finish_stmt_generation (stmt, incr_stmt, gsi); 4750 4751 /* Copy the points-to information if it exists. */ 4752 if (DR_PTR_INFO (dr)) 4753 { 4754 duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr)); 4755 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr)); 4756 } 4757 4758 if (!ptr_incr) 4759 return new_dataref_ptr; 4760 4761 /* Update the vector-pointer's cross-iteration increment. */ 4762 FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE) 4763 { 4764 tree use = USE_FROM_PTR (use_p); 4765 4766 if (use == dataref_ptr) 4767 SET_USE (use_p, new_dataref_ptr); 4768 else 4769 gcc_assert (tree_int_cst_compare (use, update) == 0); 4770 } 4771 4772 return new_dataref_ptr; 4773 } 4774 4775 4776 /* Function vect_create_destination_var. 4777 4778 Create a new temporary of type VECTYPE. */ 4779 4780 tree 4781 vect_create_destination_var (tree scalar_dest, tree vectype) 4782 { 4783 tree vec_dest; 4784 const char *name; 4785 char *new_name; 4786 tree type; 4787 enum vect_var_kind kind; 4788 4789 kind = vectype 4790 ? VECTOR_BOOLEAN_TYPE_P (vectype) 4791 ? vect_mask_var 4792 : vect_simple_var 4793 : vect_scalar_var; 4794 type = vectype ? 
vectype : TREE_TYPE (scalar_dest); 4795 4796 gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME); 4797 4798 name = get_name (scalar_dest); 4799 if (name) 4800 new_name = xasprintf ("%s_%u", name, SSA_NAME_VERSION (scalar_dest)); 4801 else 4802 new_name = xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest)); 4803 vec_dest = vect_get_new_vect_var (type, kind, new_name); 4804 free (new_name); 4805 4806 return vec_dest; 4807 } 4808 4809 /* Function vect_grouped_store_supported. 4810 4811 Returns TRUE if interleave high and interleave low permutations 4812 are supported, and FALSE otherwise. */ 4813 4814 bool 4815 vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count) 4816 { 4817 machine_mode mode = TYPE_MODE (vectype); 4818 4819 /* vect_permute_store_chain requires the group size to be equal to 3 or 4820 be a power of two. */ 4821 if (count != 3 && exact_log2 (count) == -1) 4822 { 4823 if (dump_enabled_p ()) 4824 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 4825 "the size of the group of accesses" 4826 " is not a power of 2 or not equal to 3\n"); 4827 return false; 4828 } 4829 4830 /* Check that the permutation is supported. */ 4831 if (VECTOR_MODE_P (mode)) 4832 { 4833 unsigned int i, nelt = GET_MODE_NUNITS (mode); 4834 unsigned char *sel = XALLOCAVEC (unsigned char, nelt); 4835 4836 if (count == 3) 4837 { 4838 unsigned int j0 = 0, j1 = 0, j2 = 0; 4839 unsigned int i, j; 4840 4841 for (j = 0; j < 3; j++) 4842 { 4843 int nelt0 = ((3 - j) * nelt) % 3; 4844 int nelt1 = ((3 - j) * nelt + 1) % 3; 4845 int nelt2 = ((3 - j) * nelt + 2) % 3; 4846 for (i = 0; i < nelt; i++) 4847 { 4848 if (3 * i + nelt0 < nelt) 4849 sel[3 * i + nelt0] = j0++; 4850 if (3 * i + nelt1 < nelt) 4851 sel[3 * i + nelt1] = nelt + j1++; 4852 if (3 * i + nelt2 < nelt) 4853 sel[3 * i + nelt2] = 0; 4854 } 4855 if (!can_vec_perm_p (mode, false, sel)) 4856 { 4857 if (dump_enabled_p ()) 4858 dump_printf (MSG_MISSED_OPTIMIZATION, 4859 "permutation op not supported by target.\n"); 4860 return false; 4861 } 4862 4863 for (i = 0; i < nelt; i++) 4864 { 4865 if (3 * i + nelt0 < nelt) 4866 sel[3 * i + nelt0] = 3 * i + nelt0; 4867 if (3 * i + nelt1 < nelt) 4868 sel[3 * i + nelt1] = 3 * i + nelt1; 4869 if (3 * i + nelt2 < nelt) 4870 sel[3 * i + nelt2] = nelt + j2++; 4871 } 4872 if (!can_vec_perm_p (mode, false, sel)) 4873 { 4874 if (dump_enabled_p ()) 4875 dump_printf (MSG_MISSED_OPTIMIZATION, 4876 "permutation op not supported by target.\n"); 4877 return false; 4878 } 4879 } 4880 return true; 4881 } 4882 else 4883 { 4884 /* If length is not equal to 3 then only power of 2 is supported. */ 4885 gcc_assert (pow2p_hwi (count)); 4886 4887 for (i = 0; i < nelt / 2; i++) 4888 { 4889 sel[i * 2] = i; 4890 sel[i * 2 + 1] = i + nelt; 4891 } 4892 if (can_vec_perm_p (mode, false, sel)) 4893 { 4894 for (i = 0; i < nelt; i++) 4895 sel[i] += nelt / 2; 4896 if (can_vec_perm_p (mode, false, sel)) 4897 return true; 4898 } 4899 } 4900 } 4901 4902 if (dump_enabled_p ()) 4903 dump_printf (MSG_MISSED_OPTIMIZATION, 4904 "permutation op not supported by target.\n"); 4905 return false; 4906 } 4907 4908 4909 /* Return TRUE if vec_store_lanes is available for COUNT vectors of 4910 type VECTYPE. */ 4911 4912 bool 4913 vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count) 4914 { 4915 return vect_lanes_optab_supported_p ("vec_store_lanes", 4916 vec_store_lanes_optab, 4917 vectype, count); 4918 } 4919 4920 4921 /* Function vect_permute_store_chain.
4922 4923 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be 4924 a power of 2 or equal to 3, generate interleave_high/low stmts to reorder 4925 the data correctly for the stores. Return the final references for stores 4926 in RESULT_CHAIN. 4927 4928 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8. 4929 The input is 4 vectors each containing 8 elements. We assign a number to 4930 each element, the input sequence is: 4931 4932 1st vec: 0 1 2 3 4 5 6 7 4933 2nd vec: 8 9 10 11 12 13 14 15 4934 3rd vec: 16 17 18 19 20 21 22 23 4935 4th vec: 24 25 26 27 28 29 30 31 4936 4937 The output sequence should be: 4938 4939 1st vec: 0 8 16 24 1 9 17 25 4940 2nd vec: 2 10 18 26 3 11 19 27 4941 3rd vec: 4 12 20 28 5 13 21 29 4942 4th vec: 6 14 22 30 7 15 23 31 4943 4944 i.e., we interleave the contents of the four vectors in their order. 4945 4946 We use interleave_high/low instructions to create such output. The input of 4947 each interleave_high/low operation is two vectors: 4948 1st vec 2nd vec 4949 0 1 2 3 4 5 6 7 4950 the even elements of the result vector are obtained left-to-right from the 4951 high/low elements of the first vector. The odd elements of the result are 4952 obtained left-to-right from the high/low elements of the second vector. 4953 The output of interleave_high will be: 0 4 1 5 4954 and of interleave_low: 2 6 3 7 4955 4956 4957 The permutation is done in log LENGTH stages. In each stage interleave_high 4958 and interleave_low stmts are created for each pair of vectors in DR_CHAIN, 4959 where the first argument is taken from the first half of DR_CHAIN and the 4960 second argument from its second half. 4961 In our example, 4962 4963 I1: interleave_high (1st vec, 3rd vec) 4964 I2: interleave_low (1st vec, 3rd vec) 4965 I3: interleave_high (2nd vec, 4th vec) 4966 I4: interleave_low (2nd vec, 4th vec) 4967 4968 The output for the first stage is: 4969 4970 I1: 0 16 1 17 2 18 3 19 4971 I2: 4 20 5 21 6 22 7 23 4972 I3: 8 24 9 25 10 26 11 27 4973 I4: 12 28 13 29 14 30 15 31 4974 4975 The output of the second stage, i.e. the final result is: 4976 4977 I1: 0 8 16 24 1 9 17 25 4978 I2: 2 10 18 26 3 11 19 27 4979 I3: 4 12 20 28 5 13 21 29 4980 I4: 6 14 22 30 7 15 23 31.
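As a hypothetical source-level example (not taken from any testcase),
a store group of LENGTH 4 such as

     for (i = 0; i < n; i++)
       {
         out[4 * i + 0] = a[i];
         out[4 * i + 1] = b[i];
         out[4 * i + 2] = c[i];
         out[4 * i + 3] = d[i];
       }

yields, per vectorized iteration, one vector of a-values, one of
b-values, one of c-values and one of d-values; these are the four input
vectors above, and they must be interleaved as shown before being
stored to four consecutive vector-sized chunks of out[].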
*/ 4981 4982 void 4983 vect_permute_store_chain (vec<tree> dr_chain, 4984 unsigned int length, 4985 gimple *stmt, 4986 gimple_stmt_iterator *gsi, 4987 vec<tree> *result_chain) 4988 { 4989 tree vect1, vect2, high, low; 4990 gimple *perm_stmt; 4991 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt)); 4992 tree perm_mask_low, perm_mask_high; 4993 tree data_ref; 4994 tree perm3_mask_low, perm3_mask_high; 4995 unsigned int i, n, log_length = exact_log2 (length); 4996 unsigned int j, nelt = TYPE_VECTOR_SUBPARTS (vectype); 4997 unsigned char *sel = XALLOCAVEC (unsigned char, nelt); 4998 4999 result_chain->quick_grow (length); 5000 memcpy (result_chain->address (), dr_chain.address (), 5001 length * sizeof (tree)); 5002 5003 if (length == 3) 5004 { 5005 unsigned int j0 = 0, j1 = 0, j2 = 0; 5006 5007 for (j = 0; j < 3; j++) 5008 { 5009 int nelt0 = ((3 - j) * nelt) % 3; 5010 int nelt1 = ((3 - j) * nelt + 1) % 3; 5011 int nelt2 = ((3 - j) * nelt + 2) % 3; 5012 5013 for (i = 0; i < nelt; i++) 5014 { 5015 if (3 * i + nelt0 < nelt) 5016 sel[3 * i + nelt0] = j0++; 5017 if (3 * i + nelt1 < nelt) 5018 sel[3 * i + nelt1] = nelt + j1++; 5019 if (3 * i + nelt2 < nelt) 5020 sel[3 * i + nelt2] = 0; 5021 } 5022 perm3_mask_low = vect_gen_perm_mask_checked (vectype, sel); 5023 5024 for (i = 0; i < nelt; i++) 5025 { 5026 if (3 * i + nelt0 < nelt) 5027 sel[3 * i + nelt0] = 3 * i + nelt0; 5028 if (3 * i + nelt1 < nelt) 5029 sel[3 * i + nelt1] = 3 * i + nelt1; 5030 if (3 * i + nelt2 < nelt) 5031 sel[3 * i + nelt2] = nelt + j2++; 5032 } 5033 perm3_mask_high = vect_gen_perm_mask_checked (vectype, sel); 5034 5035 vect1 = dr_chain[0]; 5036 vect2 = dr_chain[1]; 5037 5038 /* Create interleaving stmt: 5039 low = VEC_PERM_EXPR <vect1, vect2, 5040 {j, nelt, *, j + 1, nelt + j + 1, *, 5041 j + 2, nelt + j + 2, *, ...}> */ 5042 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low"); 5043 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1, 5044 vect2, perm3_mask_low); 5045 vect_finish_stmt_generation (stmt, perm_stmt, gsi); 5046 5047 vect1 = data_ref; 5048 vect2 = dr_chain[2]; 5049 /* Create interleaving stmt: 5050 low = VEC_PERM_EXPR <vect1, vect2, 5051 {0, 1, nelt + j, 3, 4, nelt + j + 1, 5052 6, 7, nelt + j + 2, ...}> */ 5053 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high"); 5054 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1, 5055 vect2, perm3_mask_high); 5056 vect_finish_stmt_generation (stmt, perm_stmt, gsi); 5057 (*result_chain)[j] = data_ref; 5058 } 5059 } 5060 else 5061 { 5062 /* If length is not equal to 3 then only power of 2 is supported. 
*/ 5063 gcc_assert (pow2p_hwi (length)); 5064 5065 for (i = 0, n = nelt / 2; i < n; i++) 5066 { 5067 sel[i * 2] = i; 5068 sel[i * 2 + 1] = i + nelt; 5069 } 5070 perm_mask_high = vect_gen_perm_mask_checked (vectype, sel); 5071 5072 for (i = 0; i < nelt; i++) 5073 sel[i] += nelt / 2; 5074 perm_mask_low = vect_gen_perm_mask_checked (vectype, sel); 5075 5076 for (i = 0, n = log_length; i < n; i++) 5077 { 5078 for (j = 0; j < length/2; j++) 5079 { 5080 vect1 = dr_chain[j]; 5081 vect2 = dr_chain[j+length/2]; 5082 5083 /* Create interleaving stmt: 5084 high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1, 5085 ...}> */ 5086 high = make_temp_ssa_name (vectype, NULL, "vect_inter_high"); 5087 perm_stmt = gimple_build_assign (high, VEC_PERM_EXPR, vect1, 5088 vect2, perm_mask_high); 5089 vect_finish_stmt_generation (stmt, perm_stmt, gsi); 5090 (*result_chain)[2*j] = high; 5091 5092 /* Create interleaving stmt: 5093 low = VEC_PERM_EXPR <vect1, vect2, 5094 {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1, 5095 ...}> */ 5096 low = make_temp_ssa_name (vectype, NULL, "vect_inter_low"); 5097 perm_stmt = gimple_build_assign (low, VEC_PERM_EXPR, vect1, 5098 vect2, perm_mask_low); 5099 vect_finish_stmt_generation (stmt, perm_stmt, gsi); 5100 (*result_chain)[2*j+1] = low; 5101 } 5102 memcpy (dr_chain.address (), result_chain->address (), 5103 length * sizeof (tree)); 5104 } 5105 } 5106 } 5107 5108 /* Function vect_setup_realignment 5109 5110 This function is called when vectorizing an unaligned load using 5111 the dr_explicit_realign[_optimized] scheme. 5112 This function generates the following code at the loop prolog: 5113 5114 p = initial_addr; 5115 x msq_init = *(floor(p)); # prolog load 5116 realignment_token = call target_builtin; 5117 loop: 5118 x msq = phi (msq_init, ---) 5119 5120 The stmts marked with x are generated only for the case of 5121 dr_explicit_realign_optimized. 5122 5123 The code above sets up a new (vector) pointer, pointing to the first 5124 location accessed by STMT, and a "floor-aligned" load using that pointer. 5125 It also generates code to compute the "realignment-token" (if the relevant 5126 target hook was defined), and creates a phi-node at the loop-header bb 5127 whose arguments are the result of the prolog-load (created by this 5128 function) and the result of a load that takes place in the loop (to be 5129 created by the caller to this function). 5130 5131 For the case of dr_explicit_realign_optimized: 5132 The caller to this function uses the phi-result (msq) to create the 5133 realignment code inside the loop, and sets up the missing phi argument, 5134 as follows: 5135 loop: 5136 msq = phi (msq_init, lsq) 5137 lsq = *(floor(p')); # load in loop 5138 result = realign_load (msq, lsq, realignment_token); 5139 5140 For the case of dr_explicit_realign: 5141 loop: 5142 msq = *(floor(p)); # load in loop 5143 p' = p + (VS-1); 5144 lsq = *(floor(p')); # load in loop 5145 result = realign_load (msq, lsq, realignment_token); 5146 5147 Input: 5148 STMT - (scalar) load stmt to be vectorized. This load accesses 5149 a memory location that may be unaligned. 5150 BSI - place where new code is to be inserted. 5151 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes 5152 is used. 5153 5154 Output: 5155 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load 5156 target hook, if defined. 5157 Return value - the result of the loop-header phi node. 
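As a rough illustration with made-up numbers: for 16-byte vectors and a
load address p = 16*k + 5, floor(p) is 16*k, so msq holds the bytes
[16*k, 16*k + 16) and lsq the bytes [16*k + 16, 16*k + 32); realign_load,
steered by the realignment token that encodes the misalignment of 5
bytes, combines the two aligned vectors into the 16 unaligned bytes
starting at p.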
*/ 5158 5159 tree 5160 vect_setup_realignment (gimple *stmt, gimple_stmt_iterator *gsi, 5161 tree *realignment_token, 5162 enum dr_alignment_support alignment_support_scheme, 5163 tree init_addr, 5164 struct loop **at_loop) 5165 { 5166 stmt_vec_info stmt_info = vinfo_for_stmt (stmt); 5167 tree vectype = STMT_VINFO_VECTYPE (stmt_info); 5168 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); 5169 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info); 5170 struct loop *loop = NULL; 5171 edge pe = NULL; 5172 tree scalar_dest = gimple_assign_lhs (stmt); 5173 tree vec_dest; 5174 gimple *inc; 5175 tree ptr; 5176 tree data_ref; 5177 basic_block new_bb; 5178 tree msq_init = NULL_TREE; 5179 tree new_temp; 5180 gphi *phi_stmt; 5181 tree msq = NULL_TREE; 5182 gimple_seq stmts = NULL; 5183 bool inv_p; 5184 bool compute_in_loop = false; 5185 bool nested_in_vect_loop = false; 5186 struct loop *containing_loop = (gimple_bb (stmt))->loop_father; 5187 struct loop *loop_for_initial_load = NULL; 5188 5189 if (loop_vinfo) 5190 { 5191 loop = LOOP_VINFO_LOOP (loop_vinfo); 5192 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt); 5193 } 5194 5195 gcc_assert (alignment_support_scheme == dr_explicit_realign 5196 || alignment_support_scheme == dr_explicit_realign_optimized); 5197 5198 /* We need to generate three things: 5199 1. the misalignment computation 5200 2. the extra vector load (for the optimized realignment scheme). 5201 3. the phi node for the two vectors from which the realignment is 5202 done (for the optimized realignment scheme). */ 5203 5204 /* 1. Determine where to generate the misalignment computation. 5205 5206 If INIT_ADDR is NULL_TREE, this indicates that the misalignment 5207 calculation will be generated by this function, outside the loop (in the 5208 preheader). Otherwise, INIT_ADDR had already been computed for us by the 5209 caller, inside the loop. 5210 5211 Background: If the misalignment remains fixed throughout the iterations of 5212 the loop, then both realignment schemes are applicable, and also the 5213 misalignment computation can be done outside LOOP. This is because we are 5214 vectorizing LOOP, and so the memory accesses in LOOP advance in steps that 5215 are a multiple of VS (the Vector Size), and therefore the misalignment in 5216 different vectorized LOOP iterations is always the same. 5217 The problem arises only if the memory access is in an inner-loop nested 5218 inside LOOP, which is now being vectorized using outer-loop vectorization. 5219 This is the only case when the misalignment of the memory access may not 5220 remain fixed throughout the iterations of the inner-loop (as explained in 5221 detail in vect_supportable_dr_alignment). In this case, not only is the 5222 optimized realignment scheme not applicable, but also the misalignment 5223 computation (and generation of the realignment token that is passed to 5224 REALIGN_LOAD) have to be done inside the loop. 5225 5226 In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode 5227 or not, which in turn determines if the misalignment is computed inside 5228 the inner-loop, or outside LOOP. */ 5229 5230 if (init_addr != NULL_TREE || !loop_vinfo) 5231 { 5232 compute_in_loop = true; 5233 gcc_assert (alignment_support_scheme == dr_explicit_realign); 5234 } 5235 5236 5237 /* 2. Determine where to generate the extra vector load. 
5238 5239 For the optimized realignment scheme, instead of generating two vector 5240 loads in each iteration, we generate a single extra vector load in the 5241 preheader of the loop, and in each iteration reuse the result of the 5242 vector load from the previous iteration. In case the memory access is in 5243 an inner-loop nested inside LOOP, which is now being vectorized using 5244 outer-loop vectorization, we need to determine whether this initial vector 5245 load should be generated at the preheader of the inner-loop, or can be 5246 generated at the preheader of LOOP. If the memory access has no evolution 5247 in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has 5248 to be generated inside LOOP (in the preheader of the inner-loop). */ 5249 5250 if (nested_in_vect_loop) 5251 { 5252 tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info); 5253 bool invariant_in_outerloop = 5254 (tree_int_cst_compare (outerloop_step, size_zero_node) == 0); 5255 loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner); 5256 } 5257 else 5258 loop_for_initial_load = loop; 5259 if (at_loop) 5260 *at_loop = loop_for_initial_load; 5261 5262 if (loop_for_initial_load) 5263 pe = loop_preheader_edge (loop_for_initial_load); 5264 5265 /* 3. For the case of the optimized realignment, create the first vector 5266 load at the loop preheader. */ 5267 5268 if (alignment_support_scheme == dr_explicit_realign_optimized) 5269 { 5270 /* Create msq_init = *(floor(p1)) in the loop preheader */ 5271 gassign *new_stmt; 5272 5273 gcc_assert (!compute_in_loop); 5274 vec_dest = vect_create_destination_var (scalar_dest, vectype); 5275 ptr = vect_create_data_ref_ptr (stmt, vectype, loop_for_initial_load, 5276 NULL_TREE, &init_addr, NULL, &inc, 5277 true, &inv_p); 5278 if (TREE_CODE (ptr) == SSA_NAME) 5279 new_temp = copy_ssa_name (ptr); 5280 else 5281 new_temp = make_ssa_name (TREE_TYPE (ptr)); 5282 new_stmt = gimple_build_assign 5283 (new_temp, BIT_AND_EXPR, ptr, 5284 build_int_cst (TREE_TYPE (ptr), 5285 -(HOST_WIDE_INT)TYPE_ALIGN_UNIT (vectype))); 5286 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt); 5287 gcc_assert (!new_bb); 5288 data_ref 5289 = build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp, 5290 build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0)); 5291 new_stmt = gimple_build_assign (vec_dest, data_ref); 5292 new_temp = make_ssa_name (vec_dest, new_stmt); 5293 gimple_assign_set_lhs (new_stmt, new_temp); 5294 if (pe) 5295 { 5296 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt); 5297 gcc_assert (!new_bb); 5298 } 5299 else 5300 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT); 5301 5302 msq_init = gimple_assign_lhs (new_stmt); 5303 } 5304 5305 /* 4. Create realignment token using a target builtin, if available. 5306 It is done either inside the containing loop, or before LOOP (as 5307 determined above). */ 5308 5309 if (targetm.vectorize.builtin_mask_for_load) 5310 { 5311 gcall *new_stmt; 5312 tree builtin_decl; 5313 5314 /* Compute INIT_ADDR - the initial addressed accessed by this memref. */ 5315 if (!init_addr) 5316 { 5317 /* Generate the INIT_ADDR computation outside LOOP. 
*/ 5318 init_addr = vect_create_addr_base_for_vector_ref (stmt, &stmts, 5319 NULL_TREE, loop); 5320 if (loop) 5321 { 5322 pe = loop_preheader_edge (loop); 5323 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts); 5324 gcc_assert (!new_bb); 5325 } 5326 else 5327 gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT); 5328 } 5329 5330 builtin_decl = targetm.vectorize.builtin_mask_for_load (); 5331 new_stmt = gimple_build_call (builtin_decl, 1, init_addr); 5332 vec_dest = 5333 vect_create_destination_var (scalar_dest, 5334 gimple_call_return_type (new_stmt)); 5335 new_temp = make_ssa_name (vec_dest, new_stmt); 5336 gimple_call_set_lhs (new_stmt, new_temp); 5337 5338 if (compute_in_loop) 5339 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT); 5340 else 5341 { 5342 /* Generate the misalignment computation outside LOOP. */ 5343 pe = loop_preheader_edge (loop); 5344 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt); 5345 gcc_assert (!new_bb); 5346 } 5347 5348 *realignment_token = gimple_call_lhs (new_stmt); 5349 5350 /* The result of the CALL_EXPR to this builtin is determined from 5351 the value of the parameter and no global variables are touched 5352 which makes the builtin a "const" function. Requiring the 5353 builtin to have the "const" attribute makes it unnecessary 5354 to call mark_call_clobbered. */ 5355 gcc_assert (TREE_READONLY (builtin_decl)); 5356 } 5357 5358 if (alignment_support_scheme == dr_explicit_realign) 5359 return msq; 5360 5361 gcc_assert (!compute_in_loop); 5362 gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized); 5363 5364 5365 /* 5. Create msq = phi <msq_init, lsq> in loop */ 5366 5367 pe = loop_preheader_edge (containing_loop); 5368 vec_dest = vect_create_destination_var (scalar_dest, vectype); 5369 msq = make_ssa_name (vec_dest); 5370 phi_stmt = create_phi_node (msq, containing_loop->header); 5371 add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION); 5372 5373 return msq; 5374 } 5375 5376 5377 /* Function vect_grouped_load_supported. 5378 5379 COUNT is the size of the load group (the number of statements plus the 5380 number of gaps). SINGLE_ELEMENT_P is true if there is actually 5381 only one statement, with a gap of COUNT - 1. 5382 5383 Returns true if a suitable permute exists. */ 5384 5385 bool 5386 vect_grouped_load_supported (tree vectype, bool single_element_p, 5387 unsigned HOST_WIDE_INT count) 5388 { 5389 machine_mode mode = TYPE_MODE (vectype); 5390 5391 /* If this is single-element interleaving with an element distance 5392 that leaves unused vector loads around punt - we at least create 5393 very sub-optimal code in that case (and blow up memory, 5394 see PR65518). */ 5395 if (single_element_p && count > TYPE_VECTOR_SUBPARTS (vectype)) 5396 { 5397 if (dump_enabled_p ()) 5398 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 5399 "single-element interleaving not supported " 5400 "for not adjacent vector loads\n"); 5401 return false; 5402 } 5403 5404 /* vect_permute_load_chain requires the group size to be equal to 3 or 5405 be a power of two. */ 5406 if (count != 3 && exact_log2 (count) == -1) 5407 { 5408 if (dump_enabled_p ()) 5409 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 5410 "the size of the group of accesses" 5411 " is not a power of 2 or not equal to 3\n"); 5412 return false; 5413 } 5414 5415 /* Check that the permutation is supported. 
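     As a worked instance (these are only the selector values the code below
     constructs, not generated code), take nelt == 8 and count == 3; the
     k == 0 iteration checks:

       // gather elements 0/3/6 of vec0 and 1/4/7 of vec1
       sel = { 0, 3, 6, 9, 12, 15, 0, 0 };   // trailing zeros are filler
       // keep those six, then take elements 2 and 5 of vec2
       sel = { 0, 1, 2, 3, 4, 5, 10, 13 };

     which together produce the first de-interleaved vector
     {0 3 6 9 12 15 18 21} for inputs {0..7}, {8..15}, {16..23}.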
*/ 5416 if (VECTOR_MODE_P (mode)) 5417 { 5418 unsigned int i, j, nelt = GET_MODE_NUNITS (mode); 5419 unsigned char *sel = XALLOCAVEC (unsigned char, nelt); 5420 5421 if (count == 3) 5422 { 5423 unsigned int k; 5424 for (k = 0; k < 3; k++) 5425 { 5426 for (i = 0; i < nelt; i++) 5427 if (3 * i + k < 2 * nelt) 5428 sel[i] = 3 * i + k; 5429 else 5430 sel[i] = 0; 5431 if (!can_vec_perm_p (mode, false, sel)) 5432 { 5433 if (dump_enabled_p ()) 5434 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 5435 "shuffle of 3 loads is not supported by" 5436 " target\n"); 5437 return false; 5438 } 5439 for (i = 0, j = 0; i < nelt; i++) 5440 if (3 * i + k < 2 * nelt) 5441 sel[i] = i; 5442 else 5443 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++); 5444 if (!can_vec_perm_p (mode, false, sel)) 5445 { 5446 if (dump_enabled_p ()) 5447 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 5448 "shuffle of 3 loads is not supported by" 5449 " target\n"); 5450 return false; 5451 } 5452 } 5453 return true; 5454 } 5455 else 5456 { 5457 /* If length is not equal to 3 then only power of 2 is supported. */ 5458 gcc_assert (pow2p_hwi (count)); 5459 for (i = 0; i < nelt; i++) 5460 sel[i] = i * 2; 5461 if (can_vec_perm_p (mode, false, sel)) 5462 { 5463 for (i = 0; i < nelt; i++) 5464 sel[i] = i * 2 + 1; 5465 if (can_vec_perm_p (mode, false, sel)) 5466 return true; 5467 } 5468 } 5469 } 5470 5471 if (dump_enabled_p ()) 5472 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 5473 "extract even/odd not supported by target\n"); 5474 return false; 5475 } 5476 5477 /* Return TRUE if vec_load_lanes is available for COUNT vectors of 5478 type VECTYPE. */ 5479 5480 bool 5481 vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count) 5482 { 5483 return vect_lanes_optab_supported_p ("vec_load_lanes", 5484 vec_load_lanes_optab, 5485 vectype, count); 5486 } 5487 5488 /* Function vect_permute_load_chain. 5489 5490 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be 5491 a power of 2 or equal to 3, generate extract_even/odd stmts to reorder 5492 the input data correctly. Return the final references for loads in 5493 RESULT_CHAIN. 5494 5495 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8. 5496 The input is 4 vectors each containing 8 elements. We assign a number to each 5497 element, the input sequence is: 5498 5499 1st vec: 0 1 2 3 4 5 6 7 5500 2nd vec: 8 9 10 11 12 13 14 15 5501 3rd vec: 16 17 18 19 20 21 22 23 5502 4th vec: 24 25 26 27 28 29 30 31 5503 5504 The output sequence should be: 5505 5506 1st vec: 0 4 8 12 16 20 24 28 5507 2nd vec: 1 5 9 13 17 21 25 29 5508 3rd vec: 2 6 10 14 18 22 26 30 5509 4th vec: 3 7 11 15 19 23 27 31 5510 5511 i.e., the first output vector should contain the first elements of each 5512 interleaving group, etc. 5513 5514 We use extract_even/odd instructions to create such output. The input of 5515 each extract_even/odd operation is two vectors 5516 1st vec 2nd vec 5517 0 1 2 3 4 5 6 7 5518 5519 and the output is the vector of extracted even/odd elements. The output of 5520 extract_even will be: 0 2 4 6 5521 and of extract_odd: 1 3 5 7 5522 5523 5524 The permutation is done in log LENGTH stages. In each stage extract_even 5525 and extract_odd stmts are created for each pair of vectors in DR_CHAIN in 5526 their order. 
In our example, 5527 5528 E1: extract_even (1st vec, 2nd vec) 5529 E2: extract_odd (1st vec, 2nd vec) 5530 E3: extract_even (3rd vec, 4th vec) 5531 E4: extract_odd (3rd vec, 4th vec) 5532 5533 The output for the first stage will be: 5534 5535 E1: 0 2 4 6 8 10 12 14 5536 E2: 1 3 5 7 9 11 13 15 5537 E3: 16 18 20 22 24 26 28 30 5538 E4: 17 19 21 23 25 27 29 31 5539 5540 In order to proceed and create the correct sequence for the next stage (or 5541 for the correct output, if the second stage is the last one, as in our 5542 example), we first put the output of extract_even operation and then the 5543 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN). 5544 The input for the second stage is: 5545 5546 1st vec (E1): 0 2 4 6 8 10 12 14 5547 2nd vec (E3): 16 18 20 22 24 26 28 30 5548 3rd vec (E2): 1 3 5 7 9 11 13 15 5549 4th vec (E4): 17 19 21 23 25 27 29 31 5550 5551 The output of the second stage: 5552 5553 E1: 0 4 8 12 16 20 24 28 5554 E2: 2 6 10 14 18 22 26 30 5555 E3: 1 5 9 13 17 21 25 29 5556 E4: 3 7 11 15 19 23 27 31 5557 5558 And RESULT_CHAIN after reordering: 5559 5560 1st vec (E1): 0 4 8 12 16 20 24 28 5561 2nd vec (E3): 1 5 9 13 17 21 25 29 5562 3rd vec (E2): 2 6 10 14 18 22 26 30 5563 4th vec (E4): 3 7 11 15 19 23 27 31. */ 5564 5565 static void 5566 vect_permute_load_chain (vec<tree> dr_chain, 5567 unsigned int length, 5568 gimple *stmt, 5569 gimple_stmt_iterator *gsi, 5570 vec<tree> *result_chain) 5571 { 5572 tree data_ref, first_vect, second_vect; 5573 tree perm_mask_even, perm_mask_odd; 5574 tree perm3_mask_low, perm3_mask_high; 5575 gimple *perm_stmt; 5576 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt)); 5577 unsigned int i, j, log_length = exact_log2 (length); 5578 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype); 5579 unsigned char *sel = XALLOCAVEC (unsigned char, nelt); 5580 5581 result_chain->quick_grow (length); 5582 memcpy (result_chain->address (), dr_chain.address (), 5583 length * sizeof (tree)); 5584 5585 if (length == 3) 5586 { 5587 unsigned int k; 5588 5589 for (k = 0; k < 3; k++) 5590 { 5591 for (i = 0; i < nelt; i++) 5592 if (3 * i + k < 2 * nelt) 5593 sel[i] = 3 * i + k; 5594 else 5595 sel[i] = 0; 5596 perm3_mask_low = vect_gen_perm_mask_checked (vectype, sel); 5597 5598 for (i = 0, j = 0; i < nelt; i++) 5599 if (3 * i + k < 2 * nelt) 5600 sel[i] = i; 5601 else 5602 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++); 5603 5604 perm3_mask_high = vect_gen_perm_mask_checked (vectype, sel); 5605 5606 first_vect = dr_chain[0]; 5607 second_vect = dr_chain[1]; 5608 5609 /* Create interleaving stmt (low part of): 5610 low = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k, 5611 ...}> */ 5612 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low"); 5613 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect, 5614 second_vect, perm3_mask_low); 5615 vect_finish_stmt_generation (stmt, perm_stmt, gsi); 5616 5617 /* Create interleaving stmt (high part of): 5618 high = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k, 5619 ...}> */ 5620 first_vect = data_ref; 5621 second_vect = dr_chain[2]; 5622 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high"); 5623 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect, 5624 second_vect, perm3_mask_high); 5625 vect_finish_stmt_generation (stmt, perm_stmt, gsi); 5626 (*result_chain)[k] = data_ref; 5627 } 5628 } 5629 else 5630 { 5631 /* If length is not equal to 3 then only power of 2 is supported. 
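	     As a worked instance (mask values only, not generated code), for
	     nelt == 8 the two masks built below are:

	       perm_mask_even = { 0, 2, 4, 6, 8, 10, 12, 14 };  // even elements
	       perm_mask_odd  = { 1, 3, 5, 7, 9, 11, 13, 15 };  // odd elements

	     and for length == 4 the j-loop below pairs (vec0, vec1) and
	     (vec2, vec3) in each of the log2(4) == 2 stages, exactly as
	     traced in the function comment above.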
*/
5632       gcc_assert (pow2p_hwi (length));
5633 
5634       for (i = 0; i < nelt; ++i)
5635         sel[i] = i * 2;
5636       perm_mask_even = vect_gen_perm_mask_checked (vectype, sel);
5637 
5638       for (i = 0; i < nelt; ++i)
5639         sel[i] = i * 2 + 1;
5640       perm_mask_odd = vect_gen_perm_mask_checked (vectype, sel);
5641 
5642       for (i = 0; i < log_length; i++)
5643         {
5644           for (j = 0; j < length; j += 2)
5645             {
5646               first_vect = dr_chain[j];
5647               second_vect = dr_chain[j+1];
5648 
5649               /* data_ref = permute_even (first_data_ref, second_data_ref);  */
5650               data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
5651               perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5652                                                first_vect, second_vect,
5653                                                perm_mask_even);
5654               vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5655               (*result_chain)[j/2] = data_ref;
5656 
5657               /* data_ref = permute_odd (first_data_ref, second_data_ref);  */
5658               data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
5659               perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5660                                                first_vect, second_vect,
5661                                                perm_mask_odd);
5662               vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5663               (*result_chain)[j/2+length/2] = data_ref;
5664             }
5665           memcpy (dr_chain.address (), result_chain->address (),
5666                   length * sizeof (tree));
5667         }
5668     }
5669 }
5670 
5671 /* Function vect_shift_permute_load_chain.
5672 
5673    Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate a
5674    sequence of stmts to reorder the input data accordingly.
5675    Return the final references for loads in RESULT_CHAIN.
5676    Return true if successful, false otherwise.
5677 
5678    E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
5679    The input is 3 vectors each containing 8 elements.  We assign a
5680    number to each element, the input sequence is:
5681 
5682    1st vec:   0  1  2  3  4  5  6  7
5683    2nd vec:   8  9 10 11 12 13 14 15
5684    3rd vec:  16 17 18 19 20 21 22 23
5685 
5686    The output sequence should be:
5687 
5688    1st vec:   0  3  6  9 12 15 18 21
5689    2nd vec:   1  4  7 10 13 16 19 22
5690    3rd vec:   2  5  8 11 14 17 20 23
5691 
5692    We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.
5693 
5694    First we shuffle all 3 vectors to get correct elements order:
5695 
5696      1st vec:  ( 0  3  6) ( 1  4  7) ( 2  5)
5697      2nd vec:  ( 8 11 14) ( 9 12 15) (10 13)
5698      3rd vec:  (16 19 22) (17 20 23) (18 21)
5699 
5700    Next we unite and shift vector 3 times:
5701 
5702    1st step:
5703      shift right by 6 the concatenation of:
5704      "1st vec" and "2nd vec"
5705        ( 0  3  6) ( 1  4  7) |( 2  5) _ ( 8 11 14) ( 9 12 15)| (10 13)
5706      "2nd vec" and "3rd vec"
5707        ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
5708      "3rd vec" and "1st vec"
5709        (16 19 22) (17 20 23) |(18 21) _ ( 0  3  6) ( 1  4  7)| ( 2  5)
5710                              | New vectors                   |
5711 
5712    So that now new vectors are:
5713 
5714      1st vec:  ( 2  5) ( 8 11 14) ( 9 12 15)
5715      2nd vec:  (10 13) (16 19 22) (17 20 23)
5716      3rd vec:  (18 21) ( 0  3  6) ( 1  4  7)
5717 
5718    2nd step:
5719      shift right by 5 the concatenation of:
5720      "1st vec" and "3rd vec"
5721        ( 2  5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0  3  6)| ( 1  4  7)
5722      "2nd vec" and "1st vec"
5723        (10 13) (16 19 22) |(17 20 23) _ ( 2  5) ( 8 11 14)| ( 9 12 15)
5724      "3rd vec" and "2nd vec"
5725        (18 21) ( 0  3  6) |( 1  4  7) _ (10 13) (16 19 22)| (17 20 23)
5726                           | New vectors                   |
5727 
5728    So that now new vectors are:
5729 
5730      1st vec:  ( 9 12 15) (18 21) ( 0  3  6)
5731      2nd vec:  (17 20 23) ( 2  5) ( 8 11 14)
5732      3rd vec:  ( 1  4  7) (10 13) (16 19 22)   READY
5733 
5734    3rd step:
5735      shift right by 5 the concatenation of:
5736      "1st vec" and "1st vec"
5737        ( 9 12 15) (18 21) |( 0  3  6) _ ( 9 12 15) (18 21)| ( 0  3  6)
5738      shift right by 3 the concatenation of:
5739      "2nd vec" and "2nd vec"
5740        (17 20 23) |( 2  5) ( 8 11 14) _ (17 20 23)| ( 2  5) ( 8 11 14)
5741                   | New vectors                   |
5742 
5743    So that now all vectors are READY:
5744      1st vec:  ( 0  3  6) ( 9 12 15) (18 21)
5745      2nd vec:  ( 2  5) ( 8 11 14) (17 20 23)
5746      3rd vec:  ( 1  4  7) (10 13) (16 19 22)
5747 
5748    This algorithm is faster than the one in vect_permute_load_chain if:
5749      1. "shift of a concatenation" is faster than general permutation.
5750         This is usually so.
5751      2. The TARGET machine can't execute vector instructions in parallel.
5752         This is because each step of the algorithm depends on the previous
5753         one.  The algorithm in vect_permute_load_chain is much more parallel.
5754 
5755    The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.
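
   Illustrative note (a sketch, not a description of GCC internals beyond
   this function): each "shift right by N of a concatenation" above is a
   single two-input VEC_PERM_EXPR whose selector is just N consecutive
   indices.  E.g. for 8-element short vectors, a shift right by 6 of the
   concatenation of vec_a and vec_b corresponds to the GNU C sketch

     typedef short v8hi __attribute__ ((vector_size (16)));
     v8hi vec_a, vec_b;   // the two vectors being "concatenated"
     // take elements 6..13 of the 16-element concatenation
     v8hi shifted = __builtin_shuffle (vec_a, vec_b,
                                       (v8hi) { 6, 7, 8, 9, 10, 11, 12, 13 });

   which is what the shift1_mask ... shift4_mask constants built in the
   function body encode.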
5756 */ 5757 5758 static bool 5759 vect_shift_permute_load_chain (vec<tree> dr_chain, 5760 unsigned int length, 5761 gimple *stmt, 5762 gimple_stmt_iterator *gsi, 5763 vec<tree> *result_chain) 5764 { 5765 tree vect[3], vect_shift[3], data_ref, first_vect, second_vect; 5766 tree perm2_mask1, perm2_mask2, perm3_mask; 5767 tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask; 5768 gimple *perm_stmt; 5769 5770 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt)); 5771 unsigned int i; 5772 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype); 5773 unsigned char *sel = XALLOCAVEC (unsigned char, nelt); 5774 stmt_vec_info stmt_info = vinfo_for_stmt (stmt); 5775 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); 5776 5777 result_chain->quick_grow (length); 5778 memcpy (result_chain->address (), dr_chain.address (), 5779 length * sizeof (tree)); 5780 5781 if (pow2p_hwi (length) && LOOP_VINFO_VECT_FACTOR (loop_vinfo) > 4) 5782 { 5783 unsigned int j, log_length = exact_log2 (length); 5784 for (i = 0; i < nelt / 2; ++i) 5785 sel[i] = i * 2; 5786 for (i = 0; i < nelt / 2; ++i) 5787 sel[nelt / 2 + i] = i * 2 + 1; 5788 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel)) 5789 { 5790 if (dump_enabled_p ()) 5791 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 5792 "shuffle of 2 fields structure is not \ 5793 supported by target\n"); 5794 return false; 5795 } 5796 perm2_mask1 = vect_gen_perm_mask_checked (vectype, sel); 5797 5798 for (i = 0; i < nelt / 2; ++i) 5799 sel[i] = i * 2 + 1; 5800 for (i = 0; i < nelt / 2; ++i) 5801 sel[nelt / 2 + i] = i * 2; 5802 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel)) 5803 { 5804 if (dump_enabled_p ()) 5805 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 5806 "shuffle of 2 fields structure is not \ 5807 supported by target\n"); 5808 return false; 5809 } 5810 perm2_mask2 = vect_gen_perm_mask_checked (vectype, sel); 5811 5812 /* Generating permutation constant to shift all elements. 5813 For vector length 8 it is {4 5 6 7 8 9 10 11}. */ 5814 for (i = 0; i < nelt; i++) 5815 sel[i] = nelt / 2 + i; 5816 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel)) 5817 { 5818 if (dump_enabled_p ()) 5819 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 5820 "shift permutation is not supported by target\n"); 5821 return false; 5822 } 5823 shift1_mask = vect_gen_perm_mask_checked (vectype, sel); 5824 5825 /* Generating permutation constant to select vector from 2. 5826 For vector length 8 it is {0 1 2 3 12 13 14 15}. 
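	 Worked instance for the whole length == 2 step with nelt == 8
	 (values only, not generated code): with inputs A = {0 1 2 3 4 5 6 7}
	 and B = {8 9 10 11 12 13 14 15},

	   perm2_mask1 {0 2 4 6 1 3 5 7}    turns A into {0 2 4 6 1 3 5 7}
	   perm2_mask2 {1 3 5 7 0 2 4 6}    turns B into {9 11 13 15 8 10 12 14}
	   shift1_mask {4 5 6 7 8 9 10 11}  then yields the odd elements
	     {1 3 5 7 9 11 13 15}
	   select_mask {0 1 2 3 12 13 14 15}  yields the even elements
	     {0 2 4 6 8 10 12 14}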
*/ 5827 for (i = 0; i < nelt / 2; i++) 5828 sel[i] = i; 5829 for (i = nelt / 2; i < nelt; i++) 5830 sel[i] = nelt + i; 5831 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel)) 5832 { 5833 if (dump_enabled_p ()) 5834 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 5835 "select is not supported by target\n"); 5836 return false; 5837 } 5838 select_mask = vect_gen_perm_mask_checked (vectype, sel); 5839 5840 for (i = 0; i < log_length; i++) 5841 { 5842 for (j = 0; j < length; j += 2) 5843 { 5844 first_vect = dr_chain[j]; 5845 second_vect = dr_chain[j + 1]; 5846 5847 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2"); 5848 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, 5849 first_vect, first_vect, 5850 perm2_mask1); 5851 vect_finish_stmt_generation (stmt, perm_stmt, gsi); 5852 vect[0] = data_ref; 5853 5854 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2"); 5855 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, 5856 second_vect, second_vect, 5857 perm2_mask2); 5858 vect_finish_stmt_generation (stmt, perm_stmt, gsi); 5859 vect[1] = data_ref; 5860 5861 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift"); 5862 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, 5863 vect[0], vect[1], shift1_mask); 5864 vect_finish_stmt_generation (stmt, perm_stmt, gsi); 5865 (*result_chain)[j/2 + length/2] = data_ref; 5866 5867 data_ref = make_temp_ssa_name (vectype, NULL, "vect_select"); 5868 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, 5869 vect[0], vect[1], select_mask); 5870 vect_finish_stmt_generation (stmt, perm_stmt, gsi); 5871 (*result_chain)[j/2] = data_ref; 5872 } 5873 memcpy (dr_chain.address (), result_chain->address (), 5874 length * sizeof (tree)); 5875 } 5876 return true; 5877 } 5878 if (length == 3 && LOOP_VINFO_VECT_FACTOR (loop_vinfo) > 2) 5879 { 5880 unsigned int k = 0, l = 0; 5881 5882 /* Generating permutation constant to get all elements in rigth order. 5883 For vector length 8 it is {0 3 6 1 4 7 2 5}. */ 5884 for (i = 0; i < nelt; i++) 5885 { 5886 if (3 * k + (l % 3) >= nelt) 5887 { 5888 k = 0; 5889 l += (3 - (nelt % 3)); 5890 } 5891 sel[i] = 3 * k + (l % 3); 5892 k++; 5893 } 5894 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel)) 5895 { 5896 if (dump_enabled_p ()) 5897 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 5898 "shuffle of 3 fields structure is not \ 5899 supported by target\n"); 5900 return false; 5901 } 5902 perm3_mask = vect_gen_perm_mask_checked (vectype, sel); 5903 5904 /* Generating permutation constant to shift all elements. 5905 For vector length 8 it is {6 7 8 9 10 11 12 13}. */ 5906 for (i = 0; i < nelt; i++) 5907 sel[i] = 2 * (nelt / 3) + (nelt % 3) + i; 5908 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel)) 5909 { 5910 if (dump_enabled_p ()) 5911 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 5912 "shift permutation is not supported by target\n"); 5913 return false; 5914 } 5915 shift1_mask = vect_gen_perm_mask_checked (vectype, sel); 5916 5917 /* Generating permutation constant to shift all elements. 5918 For vector length 8 it is {5 6 7 8 9 10 11 12}. 
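	 (That is, sel[i] = 2 * (nelt / 3) + 1 + i = 5 + i for nelt == 8: a
	 shift right by 5 of the two concatenated operands, matching the
	 "2nd step" of the worked example in the function comment.)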
*/ 5919 for (i = 0; i < nelt; i++) 5920 sel[i] = 2 * (nelt / 3) + 1 + i; 5921 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel)) 5922 { 5923 if (dump_enabled_p ()) 5924 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 5925 "shift permutation is not supported by target\n"); 5926 return false; 5927 } 5928 shift2_mask = vect_gen_perm_mask_checked (vectype, sel); 5929 5930 /* Generating permutation constant to shift all elements. 5931 For vector length 8 it is {3 4 5 6 7 8 9 10}. */ 5932 for (i = 0; i < nelt; i++) 5933 sel[i] = (nelt / 3) + (nelt % 3) / 2 + i; 5934 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel)) 5935 { 5936 if (dump_enabled_p ()) 5937 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 5938 "shift permutation is not supported by target\n"); 5939 return false; 5940 } 5941 shift3_mask = vect_gen_perm_mask_checked (vectype, sel); 5942 5943 /* Generating permutation constant to shift all elements. 5944 For vector length 8 it is {5 6 7 8 9 10 11 12}. */ 5945 for (i = 0; i < nelt; i++) 5946 sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i; 5947 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel)) 5948 { 5949 if (dump_enabled_p ()) 5950 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 5951 "shift permutation is not supported by target\n"); 5952 return false; 5953 } 5954 shift4_mask = vect_gen_perm_mask_checked (vectype, sel); 5955 5956 for (k = 0; k < 3; k++) 5957 { 5958 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3"); 5959 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, 5960 dr_chain[k], dr_chain[k], 5961 perm3_mask); 5962 vect_finish_stmt_generation (stmt, perm_stmt, gsi); 5963 vect[k] = data_ref; 5964 } 5965 5966 for (k = 0; k < 3; k++) 5967 { 5968 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1"); 5969 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, 5970 vect[k % 3], vect[(k + 1) % 3], 5971 shift1_mask); 5972 vect_finish_stmt_generation (stmt, perm_stmt, gsi); 5973 vect_shift[k] = data_ref; 5974 } 5975 5976 for (k = 0; k < 3; k++) 5977 { 5978 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2"); 5979 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, 5980 vect_shift[(4 - k) % 3], 5981 vect_shift[(3 - k) % 3], 5982 shift2_mask); 5983 vect_finish_stmt_generation (stmt, perm_stmt, gsi); 5984 vect[k] = data_ref; 5985 } 5986 5987 (*result_chain)[3 - (nelt % 3)] = vect[2]; 5988 5989 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3"); 5990 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[0], 5991 vect[0], shift3_mask); 5992 vect_finish_stmt_generation (stmt, perm_stmt, gsi); 5993 (*result_chain)[nelt % 3] = data_ref; 5994 5995 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4"); 5996 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[1], 5997 vect[1], shift4_mask); 5998 vect_finish_stmt_generation (stmt, perm_stmt, gsi); 5999 (*result_chain)[0] = data_ref; 6000 return true; 6001 } 6002 return false; 6003 } 6004 6005 /* Function vect_transform_grouped_load. 6006 6007 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements 6008 to perform their permutation and ascribe the result vectorized statements to 6009 the scalar statements. 6010 */ 6011 6012 void 6013 vect_transform_grouped_load (gimple *stmt, vec<tree> dr_chain, int size, 6014 gimple_stmt_iterator *gsi) 6015 { 6016 machine_mode mode; 6017 vec<tree> result_chain = vNULL; 6018 6019 /* DR_CHAIN contains input data-refs that are a part of the interleaving. 
6020    RESULT_CHAIN is the output of vect_permute_load_chain; it contains permuted
6021    vectors that are ready for vector computation.  */
6022   result_chain.create (size);
6023 
6024   /* If the reassociation width for the vector type is 2 or greater, the
6025      target machine can execute 2 or more vector instructions in parallel.
6026      Otherwise try to get the chain for the load group using
6026      vect_shift_permute_load_chain.  */
6027   mode = TYPE_MODE (STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt)));
6028   if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1
6029       || pow2p_hwi (size)
6030       || !vect_shift_permute_load_chain (dr_chain, size, stmt,
6031                                          gsi, &result_chain))
6032     vect_permute_load_chain (dr_chain, size, stmt, gsi, &result_chain);
6033   vect_record_grouped_load_vectors (stmt, result_chain);
6034   result_chain.release ();
6035 }
6036 
6037 /* RESULT_CHAIN contains the output of a group of grouped loads that were
6038    generated as part of the vectorization of STMT.  Assign the statement
6039    for each vector to the associated scalar statement.  */
6040 
6041 void
6042 vect_record_grouped_load_vectors (gimple *stmt, vec<tree> result_chain)
6043 {
6044   gimple *first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt));
6045   gimple *next_stmt, *new_stmt;
6046   unsigned int i, gap_count;
6047   tree tmp_data_ref;
6048 
6049   /* Put a permuted data-ref in the VECTORIZED_STMT field.
6050      Since we scan the chain starting from its first node, their order
6051      corresponds to the order of data-refs in RESULT_CHAIN.  */
6052   next_stmt = first_stmt;
6053   gap_count = 1;
6054   FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
6055     {
6056       if (!next_stmt)
6057         break;
6058 
6059       /* Skip the gaps.  Loads created for the gaps will be removed by the
6060          dead code elimination pass later.  No need to check for the first
6061          stmt in the group, since it always exists.
6062          GROUP_GAP is the number of steps in elements from the previous
6063          access (if there is no gap GROUP_GAP is 1).  We skip loads that
6064          correspond to the gaps.  */
6065       if (next_stmt != first_stmt
6066           && gap_count < GROUP_GAP (vinfo_for_stmt (next_stmt)))
6067         {
6068           gap_count++;
6069           continue;
6070         }
6071 
6072       while (next_stmt)
6073         {
6074           new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
6075           /* We assume that if VEC_STMT is not NULL, this is a case of multiple
6076              copies, and we put the new vector statement in the first available
6077              RELATED_STMT.  */
6078           if (!STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)))
6079             STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)) = new_stmt;
6080           else
6081             {
6082               if (!GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
6083                 {
6084                   gimple *prev_stmt =
6085                     STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt));
6086                   gimple *rel_stmt =
6087                     STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt));
6088                   while (rel_stmt)
6089                     {
6090                       prev_stmt = rel_stmt;
6091                       rel_stmt =
6092                         STMT_VINFO_RELATED_STMT (vinfo_for_stmt (rel_stmt));
6093                     }
6094 
6095                   STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt)) =
6096                     new_stmt;
6097                 }
6098             }
6099 
6100           next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
6101           gap_count = 1;
6102           /* If NEXT_STMT accesses the same DR as the previous statement,
6103              put the same TMP_DATA_REF as its vectorized statement; otherwise
6104              get the next data-ref from RESULT_CHAIN.  */
6105           if (!next_stmt || !GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
6106             break;
6107         }
6108     }
6109 }
6110 
6111 /* Function vect_can_force_dr_alignment_p.
6112 
6113    Return whether the alignment of DECL can be forced to an ALIGNMENT-bit
6114    boundary.
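
   Illustrative note (a sketch of the two cases handled below, not a quote
   from GCC documentation): at the source level the distinction corresponds
   to declarations such as

     static float file_scope_buf[1024]
       __attribute__ ((aligned (32)));    // limited by MAX_OFILE_ALIGNMENT

     void
     f (void)
     {
       float stack_buf[256]
         __attribute__ ((aligned (32)));  // limited by MAX_STACK_ALIGNMENT
       use (stack_buf);                   // hypothetical call, keeps the
                                          // array live in this sketch
     }

   except that when this predicate returns true the vectorizer raises
   DECL_ALIGN itself instead of relying on user attributes.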
*/
6115 
6116 bool
6117 vect_can_force_dr_alignment_p (const_tree decl, unsigned int alignment)
6118 {
6119   if (!VAR_P (decl))
6120     return false;
6121 
6122   if (decl_in_symtab_p (decl)
6123       && !symtab_node::get (decl)->can_increase_alignment_p ())
6124     return false;
6125 
6126   if (TREE_STATIC (decl))
6127     return (alignment <= MAX_OFILE_ALIGNMENT);
6128   else
6129     return (alignment <= MAX_STACK_ALIGNMENT);
6130 }
6131 
6132 
6133 /* Return whether the data reference DR is supported with respect to its
6134    alignment.
6135    If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even
6136    if it is aligned, i.e., check if it is possible to vectorize it with
6137    different alignment.  */
6138 
6139 enum dr_alignment_support
6140 vect_supportable_dr_alignment (struct data_reference *dr,
6141                                bool check_aligned_accesses)
6142 {
6143   gimple *stmt = DR_STMT (dr);
6144   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
6145   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6146   machine_mode mode = TYPE_MODE (vectype);
6147   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6148   struct loop *vect_loop = NULL;
6149   bool nested_in_vect_loop = false;
6150 
6151   if (aligned_access_p (dr) && !check_aligned_accesses)
6152     return dr_aligned;
6153 
6154   /* For now assume all conditional loads/stores support unaligned
6155      access without any special code.  */
6156   if (is_gimple_call (stmt)
6157       && gimple_call_internal_p (stmt)
6158       && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
6159           || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
6160     return dr_unaligned_supported;
6161 
6162   if (loop_vinfo)
6163     {
6164       vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
6165       nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt);
6166     }
6167 
6168   /* Possibly unaligned access.  */
6169 
6170   /* We can choose between using the implicit realignment scheme (generating
6171      a misaligned_move stmt) and the explicit realignment scheme (generating
6172      aligned loads with a REALIGN_LOAD).  There are two variants to the
6173      explicit realignment scheme: optimized, and unoptimized.
6174      We can optimize the realignment only if the step between consecutive
6175      vector loads is equal to the vector size.  Since the vector memory
6176      accesses advance in steps of VS (Vector Size) in the vectorized loop, it
6177      is guaranteed that the misalignment amount remains the same throughout the
6178      execution of the vectorized loop.  Therefore, we can create the
6179      "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
6180      at the loop preheader.
6181 
6182      However, in the case of outer-loop vectorization, when vectorizing a
6183      memory access in the inner-loop nested within the LOOP that is now being
6184      vectorized, while it is guaranteed that the misalignment of the
6185      vectorized memory access will remain the same in different outer-loop
6186      iterations, it is *not* guaranteed that it will remain the same throughout
6187      the execution of the inner-loop.  This is because the inner-loop advances
6188      with the original scalar step (and not in steps of VS).  If the inner-loop
6189      step happens to be a multiple of VS, then the misalignment remains fixed
6190      and we can use the optimized realignment scheme.
For example: 6191 6192 for (i=0; i<N; i++) 6193 for (j=0; j<M; j++) 6194 s += a[i+j]; 6195 6196 When vectorizing the i-loop in the above example, the step between 6197 consecutive vector loads is 1, and so the misalignment does not remain 6198 fixed across the execution of the inner-loop, and the realignment cannot 6199 be optimized (as illustrated in the following pseudo vectorized loop): 6200 6201 for (i=0; i<N; i+=4) 6202 for (j=0; j<M; j++){ 6203 vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...} 6204 // when j is {0,1,2,3,4,5,6,7,...} respectively. 6205 // (assuming that we start from an aligned address). 6206 } 6207 6208 We therefore have to use the unoptimized realignment scheme: 6209 6210 for (i=0; i<N; i+=4) 6211 for (j=k; j<M; j+=4) 6212 vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming 6213 // that the misalignment of the initial address is 6214 // 0). 6215 6216 The loop can then be vectorized as follows: 6217 6218 for (k=0; k<4; k++){ 6219 rt = get_realignment_token (&vp[k]); 6220 for (i=0; i<N; i+=4){ 6221 v1 = vp[i+k]; 6222 for (j=k; j<M; j+=4){ 6223 v2 = vp[i+j+VS-1]; 6224 va = REALIGN_LOAD <v1,v2,rt>; 6225 vs += va; 6226 v1 = v2; 6227 } 6228 } 6229 } */ 6230 6231 if (DR_IS_READ (dr)) 6232 { 6233 bool is_packed = false; 6234 tree type = (TREE_TYPE (DR_REF (dr))); 6235 6236 if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing 6237 && (!targetm.vectorize.builtin_mask_for_load 6238 || targetm.vectorize.builtin_mask_for_load ())) 6239 { 6240 tree vectype = STMT_VINFO_VECTYPE (stmt_info); 6241 6242 /* If we are doing SLP then the accesses need not have the 6243 same alignment, instead it depends on the SLP group size. */ 6244 if (loop_vinfo 6245 && STMT_SLP_TYPE (stmt_info) 6246 && (LOOP_VINFO_VECT_FACTOR (loop_vinfo) 6247 * GROUP_SIZE (vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info))) 6248 % TYPE_VECTOR_SUBPARTS (vectype) != 0)) 6249 ; 6250 else if (!loop_vinfo 6251 || (nested_in_vect_loop 6252 && (TREE_INT_CST_LOW (DR_STEP (dr)) 6253 != GET_MODE_SIZE (TYPE_MODE (vectype))))) 6254 return dr_explicit_realign; 6255 else 6256 return dr_explicit_realign_optimized; 6257 } 6258 if (!known_alignment_for_access_p (dr)) 6259 is_packed = not_size_aligned (DR_REF (dr)); 6260 6261 if (targetm.vectorize.support_vector_misalignment 6262 (mode, type, DR_MISALIGNMENT (dr), is_packed)) 6263 /* Can't software pipeline the loads, but can at least do them. */ 6264 return dr_unaligned_supported; 6265 } 6266 else 6267 { 6268 bool is_packed = false; 6269 tree type = (TREE_TYPE (DR_REF (dr))); 6270 6271 if (!known_alignment_for_access_p (dr)) 6272 is_packed = not_size_aligned (DR_REF (dr)); 6273 6274 if (targetm.vectorize.support_vector_misalignment 6275 (mode, type, DR_MISALIGNMENT (dr), is_packed)) 6276 return dr_unaligned_supported; 6277 } 6278 6279 /* Unsupported. */ 6280 return dr_unaligned_unsupported; 6281 } 6282