1 /* Data References Analysis and Manipulation Utilities for Vectorization. 2 Copyright (C) 2003-2017 Free Software Foundation, Inc. 3 Contributed by Dorit Naishlos <dorit@il.ibm.com> 4 and Ira Rosen <irar@il.ibm.com> 5 6 This file is part of GCC. 7 8 GCC is free software; you can redistribute it and/or modify it under 9 the terms of the GNU General Public License as published by the Free 10 Software Foundation; either version 3, or (at your option) any later 11 version. 12 13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY 14 WARRANTY; without even the implied warranty of MERCHANTABILITY or 15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 16 for more details. 17 18 You should have received a copy of the GNU General Public License 19 along with GCC; see the file COPYING3. If not see 20 <http://www.gnu.org/licenses/>. */ 21 22 #include "config.h" 23 #include "system.h" 24 #include "coretypes.h" 25 #include "backend.h" 26 #include "target.h" 27 #include "rtl.h" 28 #include "tree.h" 29 #include "gimple.h" 30 #include "predict.h" 31 #include "memmodel.h" 32 #include "tm_p.h" 33 #include "ssa.h" 34 #include "optabs-tree.h" 35 #include "cgraph.h" 36 #include "dumpfile.h" 37 #include "alias.h" 38 #include "fold-const.h" 39 #include "stor-layout.h" 40 #include "tree-eh.h" 41 #include "gimplify.h" 42 #include "gimple-iterator.h" 43 #include "gimplify-me.h" 44 #include "tree-ssa-loop-ivopts.h" 45 #include "tree-ssa-loop-manip.h" 46 #include "tree-ssa-loop.h" 47 #include "cfgloop.h" 48 #include "tree-scalar-evolution.h" 49 #include "tree-vectorizer.h" 50 #include "expr.h" 51 #include "builtins.h" 52 #include "params.h" 53 54 /* Return true if load- or store-lanes optab OPTAB is implemented for 55 COUNT vectors of type VECTYPE. NAME is the name of OPTAB. */ 56 57 static bool 58 vect_lanes_optab_supported_p (const char *name, convert_optab optab, 59 tree vectype, unsigned HOST_WIDE_INT count) 60 { 61 machine_mode mode, array_mode; 62 bool limit_p; 63 64 mode = TYPE_MODE (vectype); 65 limit_p = !targetm.array_mode_supported_p (mode, count); 66 array_mode = mode_for_size (count * GET_MODE_BITSIZE (mode), 67 MODE_INT, limit_p); 68 69 if (array_mode == BLKmode) 70 { 71 if (dump_enabled_p ()) 72 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 73 "no array mode for %s[" HOST_WIDE_INT_PRINT_DEC "]\n", 74 GET_MODE_NAME (mode), count); 75 return false; 76 } 77 78 if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing) 79 { 80 if (dump_enabled_p ()) 81 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 82 "cannot use %s<%s><%s>\n", name, 83 GET_MODE_NAME (array_mode), GET_MODE_NAME (mode)); 84 return false; 85 } 86 87 if (dump_enabled_p ()) 88 dump_printf_loc (MSG_NOTE, vect_location, 89 "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode), 90 GET_MODE_NAME (mode)); 91 92 return true; 93 } 94 95 96 /* Return the smallest scalar part of STMT. 97 This is used to determine the vectype of the stmt. We generally set the 98 vectype according to the type of the result (lhs). For stmts whose 99 result-type is different than the type of the arguments (e.g., demotion, 100 promotion), vectype will be reset appropriately (later). Note that we have 101 to visit the smallest datatype in this function, because that determines the 102 VF. If the smallest datatype in the loop is present only as the rhs of a 103 promotion operation - we'd miss it. 
104 Such a case, where a variable of this datatype does not appear in the lhs 105 anywhere in the loop, can only occur if it's an invariant: e.g.: 106 'int_x = (int) short_inv', which we'd expect to have been optimized away by 107 invariant motion. However, we cannot rely on invariant motion to always 108 take invariants out of the loop, and so in the case of promotion we also 109 have to check the rhs. 110 LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding 111 types. */ 112 113 tree 114 vect_get_smallest_scalar_type (gimple *stmt, HOST_WIDE_INT *lhs_size_unit, 115 HOST_WIDE_INT *rhs_size_unit) 116 { 117 tree scalar_type = gimple_expr_type (stmt); 118 HOST_WIDE_INT lhs, rhs; 119 120 lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type)); 121 122 if (is_gimple_assign (stmt) 123 && (gimple_assign_cast_p (stmt) 124 || gimple_assign_rhs_code (stmt) == WIDEN_MULT_EXPR 125 || gimple_assign_rhs_code (stmt) == WIDEN_LSHIFT_EXPR 126 || gimple_assign_rhs_code (stmt) == FLOAT_EXPR)) 127 { 128 tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (stmt)); 129 130 rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type)); 131 if (rhs < lhs) 132 scalar_type = rhs_type; 133 } 134 135 *lhs_size_unit = lhs; 136 *rhs_size_unit = rhs; 137 return scalar_type; 138 } 139 140 141 /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be 142 tested at run-time. Return TRUE if DDR was successfully inserted. 143 Return false if versioning is not supported. */ 144 145 static bool 146 vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo) 147 { 148 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); 149 150 if ((unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS) == 0) 151 return false; 152 153 if (dump_enabled_p ()) 154 { 155 dump_printf_loc (MSG_NOTE, vect_location, 156 "mark for run-time aliasing test between "); 157 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_A (ddr))); 158 dump_printf (MSG_NOTE, " and "); 159 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_B (ddr))); 160 dump_printf (MSG_NOTE, "\n"); 161 } 162 163 if (optimize_loop_nest_for_size_p (loop)) 164 { 165 if (dump_enabled_p ()) 166 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 167 "versioning not supported when optimizing" 168 " for size.\n"); 169 return false; 170 } 171 172 /* FORNOW: We don't support versioning with outer-loop vectorization. */ 173 if (loop->inner) 174 { 175 if (dump_enabled_p ()) 176 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 177 "versioning not yet supported for outer-loops.\n"); 178 return false; 179 } 180 181 /* FORNOW: We don't support creating runtime alias tests for non-constant 182 step. */ 183 if (TREE_CODE (DR_STEP (DDR_A (ddr))) != INTEGER_CST 184 || TREE_CODE (DR_STEP (DDR_B (ddr))) != INTEGER_CST) 185 { 186 if (dump_enabled_p ()) 187 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 188 "versioning not yet supported for non-constant " 189 "step\n"); 190 return false; 191 } 192 193 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr); 194 return true; 195 } 196 197 198 /* Return true if we know that the order of vectorized STMT_A and 199 vectorized STMT_B will be the same as the order of STMT_A and STMT_B. 200 At least one of the statements is a write. */ 201 202 static bool 203 vect_preserves_scalar_order_p (gimple *stmt_a, gimple *stmt_b) 204 { 205 stmt_vec_info stmtinfo_a = vinfo_for_stmt (stmt_a); 206 stmt_vec_info stmtinfo_b = vinfo_for_stmt (stmt_b); 207 208 /* Single statements are always kept in their original order. 
*/ 209 if (!STMT_VINFO_GROUPED_ACCESS (stmtinfo_a) 210 && !STMT_VINFO_GROUPED_ACCESS (stmtinfo_b)) 211 return true; 212 213 /* STMT_A and STMT_B belong to overlapping groups. All loads in a 214 group are emitted at the position of the last scalar load and all 215 stores in a group are emitted at the position of the last scalar store. 216 Compute that position and check whether the resulting order matches 217 the current one. */ 218 gimple *last_a = GROUP_FIRST_ELEMENT (stmtinfo_a); 219 if (last_a) 220 for (gimple *s = GROUP_NEXT_ELEMENT (vinfo_for_stmt (last_a)); s; 221 s = GROUP_NEXT_ELEMENT (vinfo_for_stmt (s))) 222 last_a = get_later_stmt (last_a, s); 223 else 224 last_a = stmt_a; 225 gimple *last_b = GROUP_FIRST_ELEMENT (stmtinfo_b); 226 if (last_b) 227 for (gimple *s = GROUP_NEXT_ELEMENT (vinfo_for_stmt (last_b)); s; 228 s = GROUP_NEXT_ELEMENT (vinfo_for_stmt (s))) 229 last_b = get_later_stmt (last_b, s); 230 else 231 last_b = stmt_b; 232 return ((get_later_stmt (last_a, last_b) == last_a) 233 == (get_later_stmt (stmt_a, stmt_b) == stmt_a)); 234 } 235 236 237 /* Function vect_analyze_data_ref_dependence. 238 239 Return TRUE if there (might) exist a dependence between a memory-reference 240 DRA and a memory-reference DRB. When versioning for alias may check a 241 dependence at run-time, return FALSE. Adjust *MAX_VF according to 242 the data dependence. */ 243 244 static bool 245 vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr, 246 loop_vec_info loop_vinfo, int *max_vf) 247 { 248 unsigned int i; 249 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); 250 struct data_reference *dra = DDR_A (ddr); 251 struct data_reference *drb = DDR_B (ddr); 252 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra)); 253 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb)); 254 lambda_vector dist_v; 255 unsigned int loop_depth; 256 257 /* In loop analysis all data references should be vectorizable. */ 258 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a) 259 || !STMT_VINFO_VECTORIZABLE (stmtinfo_b)) 260 gcc_unreachable (); 261 262 /* Independent data accesses. */ 263 if (DDR_ARE_DEPENDENT (ddr) == chrec_known) 264 return false; 265 266 if (dra == drb 267 || (DR_IS_READ (dra) && DR_IS_READ (drb))) 268 return false; 269 270 /* We do not have to consider dependences between accesses that belong 271 to the same group. */ 272 if (GROUP_FIRST_ELEMENT (stmtinfo_a) 273 && GROUP_FIRST_ELEMENT (stmtinfo_a) == GROUP_FIRST_ELEMENT (stmtinfo_b)) 274 return false; 275 276 /* Even if we have an anti-dependence then, as the vectorized loop covers at 277 least two scalar iterations, there is always also a true dependence. 278 As the vectorizer does not re-order loads and stores we can ignore 279 the anti-dependence if TBAA can disambiguate both DRs similar to the 280 case with known negative distance anti-dependences (positive 281 distance anti-dependences would violate TBAA constraints). */ 282 if (((DR_IS_READ (dra) && DR_IS_WRITE (drb)) 283 || (DR_IS_WRITE (dra) && DR_IS_READ (drb))) 284 && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)), 285 get_alias_set (DR_REF (drb)))) 286 return false; 287 288 /* Unknown data dependence. */ 289 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know) 290 { 291 /* If user asserted safelen consecutive iterations can be 292 executed concurrently, assume independence. 
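     (As an illustrative source-level sketch, not taken from this file: a
      loop annotated with

        #pragma omp simd safelen(8)
        for (i = 0; i < n; i++)
          a[i] = a[i + k] + 1;

      promises that up to 8 consecutive iterations may be executed
      concurrently; that promise is what ends up in loop->safelen and is
      used below to cap *MAX_VF.)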
*/ 293 if (loop->safelen >= 2) 294 { 295 if (loop->safelen < *max_vf) 296 *max_vf = loop->safelen; 297 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false; 298 return false; 299 } 300 301 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a) 302 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b)) 303 { 304 if (dump_enabled_p ()) 305 { 306 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 307 "versioning for alias not supported for: " 308 "can't determine dependence between "); 309 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, 310 DR_REF (dra)); 311 dump_printf (MSG_MISSED_OPTIMIZATION, " and "); 312 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, 313 DR_REF (drb)); 314 dump_printf (MSG_MISSED_OPTIMIZATION, "\n"); 315 } 316 return true; 317 } 318 319 if (dump_enabled_p ()) 320 { 321 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 322 "versioning for alias required: " 323 "can't determine dependence between "); 324 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, 325 DR_REF (dra)); 326 dump_printf (MSG_MISSED_OPTIMIZATION, " and "); 327 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, 328 DR_REF (drb)); 329 dump_printf (MSG_MISSED_OPTIMIZATION, "\n"); 330 } 331 332 /* Add to list of ddrs that need to be tested at run-time. */ 333 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo); 334 } 335 336 /* Known data dependence. */ 337 if (DDR_NUM_DIST_VECTS (ddr) == 0) 338 { 339 /* If user asserted safelen consecutive iterations can be 340 executed concurrently, assume independence. */ 341 if (loop->safelen >= 2) 342 { 343 if (loop->safelen < *max_vf) 344 *max_vf = loop->safelen; 345 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false; 346 return false; 347 } 348 349 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a) 350 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b)) 351 { 352 if (dump_enabled_p ()) 353 { 354 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 355 "versioning for alias not supported for: " 356 "bad dist vector for "); 357 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, 358 DR_REF (dra)); 359 dump_printf (MSG_MISSED_OPTIMIZATION, " and "); 360 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, 361 DR_REF (drb)); 362 dump_printf (MSG_MISSED_OPTIMIZATION, "\n"); 363 } 364 return true; 365 } 366 367 if (dump_enabled_p ()) 368 { 369 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 370 "versioning for alias required: " 371 "bad dist vector for "); 372 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra)); 373 dump_printf (MSG_MISSED_OPTIMIZATION, " and "); 374 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb)); 375 dump_printf (MSG_MISSED_OPTIMIZATION, "\n"); 376 } 377 /* Add to list of ddrs that need to be tested at run-time. 
*/ 378 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo); 379 } 380 381 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr)); 382 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v) 383 { 384 int dist = dist_v[loop_depth]; 385 386 if (dump_enabled_p ()) 387 dump_printf_loc (MSG_NOTE, vect_location, 388 "dependence distance = %d.\n", dist); 389 390 if (dist == 0) 391 { 392 if (dump_enabled_p ()) 393 { 394 dump_printf_loc (MSG_NOTE, vect_location, 395 "dependence distance == 0 between "); 396 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra)); 397 dump_printf (MSG_NOTE, " and "); 398 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb)); 399 dump_printf (MSG_MISSED_OPTIMIZATION, "\n"); 400 } 401 402 /* When we perform grouped accesses and perform implicit CSE 403 by detecting equal accesses and doing disambiguation with 404 runtime alias tests like for 405 .. = a[i]; 406 .. = a[i+1]; 407 a[i] = ..; 408 a[i+1] = ..; 409 *p = ..; 410 .. = a[i]; 411 .. = a[i+1]; 412 where we will end up loading { a[i], a[i+1] } once, make 413 sure that inserting group loads before the first load and 414 stores after the last store will do the right thing. 415 Similar for groups like 416 a[i] = ...; 417 ... = a[i]; 418 a[i+1] = ...; 419 where loads from the group interleave with the store. */ 420 if (!vect_preserves_scalar_order_p (DR_STMT (dra), DR_STMT (drb))) 421 { 422 if (dump_enabled_p ()) 423 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 424 "READ_WRITE dependence in interleaving." 425 "\n"); 426 return true; 427 } 428 429 unsigned int step_prec = TYPE_PRECISION (TREE_TYPE (DR_STEP (dra))); 430 if (loop->safelen < 2 431 && !expr_not_equal_to (DR_STEP (dra), wi::zero (step_prec))) 432 { 433 if (dump_enabled_p ()) 434 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 435 "step could be zero.\n"); 436 return true; 437 } 438 439 continue; 440 } 441 442 if (dist > 0 && DDR_REVERSED_P (ddr)) 443 { 444 /* If DDR_REVERSED_P the order of the data-refs in DDR was 445 reversed (to make distance vector positive), and the actual 446 distance is negative. */ 447 if (dump_enabled_p ()) 448 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 449 "dependence distance negative.\n"); 450 /* Record a negative dependence distance to later limit the 451 amount of stmt copying / unrolling we can perform. 452 Only need to handle read-after-write dependence. */ 453 if (DR_IS_READ (drb) 454 && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0 455 || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist)) 456 STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist; 457 continue; 458 } 459 460 if (abs (dist) >= 2 461 && abs (dist) < *max_vf) 462 { 463 /* The dependence distance requires reduction of the maximal 464 vectorization factor. */ 465 *max_vf = abs (dist); 466 if (dump_enabled_p ()) 467 dump_printf_loc (MSG_NOTE, vect_location, 468 "adjusting maximal vectorization factor to %i\n", 469 *max_vf); 470 } 471 472 if (abs (dist) >= *max_vf) 473 { 474 /* Dependence distance does not create dependence, as far as 475 vectorization is concerned, in this case. 
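	 For example (illustrative numbers only): if the dependence distance
	 is 4 and the maximal vectorization factor has already been limited
	 to 4, the two conflicting scalar accesses are at least one full
	 vector iteration apart, so executing the vector iterations in order
	 still honors the dependence.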
*/ 476 if (dump_enabled_p ()) 477 dump_printf_loc (MSG_NOTE, vect_location, 478 "dependence distance >= VF.\n"); 479 continue; 480 } 481 482 if (dump_enabled_p ()) 483 { 484 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 485 "not vectorized, possible dependence " 486 "between data-refs "); 487 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra)); 488 dump_printf (MSG_NOTE, " and "); 489 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb)); 490 dump_printf (MSG_NOTE, "\n"); 491 } 492 493 return true; 494 } 495 496 return false; 497 } 498 499 /* Function vect_analyze_data_ref_dependences. 500 501 Examine all the data references in the loop, and make sure there do not 502 exist any data dependences between them. Set *MAX_VF according to 503 the maximum vectorization factor the data dependences allow. */ 504 505 bool 506 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo, int *max_vf) 507 { 508 unsigned int i; 509 struct data_dependence_relation *ddr; 510 511 if (dump_enabled_p ()) 512 dump_printf_loc (MSG_NOTE, vect_location, 513 "=== vect_analyze_data_ref_dependences ===\n"); 514 515 LOOP_VINFO_DDRS (loop_vinfo) 516 .create (LOOP_VINFO_DATAREFS (loop_vinfo).length () 517 * LOOP_VINFO_DATAREFS (loop_vinfo).length ()); 518 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true; 519 /* We need read-read dependences to compute STMT_VINFO_SAME_ALIGN_REFS. */ 520 if (!compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo), 521 &LOOP_VINFO_DDRS (loop_vinfo), 522 LOOP_VINFO_LOOP_NEST (loop_vinfo), true)) 523 return false; 524 525 /* For epilogues we either have no aliases or alias versioning 526 was applied to original loop. Therefore we may just get max_vf 527 using VF of original loop. */ 528 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo)) 529 *max_vf = LOOP_VINFO_ORIG_VECT_FACTOR (loop_vinfo); 530 else 531 FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr) 532 if (vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf)) 533 return false; 534 535 return true; 536 } 537 538 539 /* Function vect_slp_analyze_data_ref_dependence. 540 541 Return TRUE if there (might) exist a dependence between a memory-reference 542 DRA and a memory-reference DRB. When versioning for alias may check a 543 dependence at run-time, return FALSE. Adjust *MAX_VF according to 544 the data dependence. */ 545 546 static bool 547 vect_slp_analyze_data_ref_dependence (struct data_dependence_relation *ddr) 548 { 549 struct data_reference *dra = DDR_A (ddr); 550 struct data_reference *drb = DDR_B (ddr); 551 552 /* We need to check dependences of statements marked as unvectorizable 553 as well, they still can prohibit vectorization. */ 554 555 /* Independent data accesses. */ 556 if (DDR_ARE_DEPENDENT (ddr) == chrec_known) 557 return false; 558 559 if (dra == drb) 560 return false; 561 562 /* Read-read is OK. */ 563 if (DR_IS_READ (dra) && DR_IS_READ (drb)) 564 return false; 565 566 /* If dra and drb are part of the same interleaving chain consider 567 them independent. */ 568 if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (DR_STMT (dra))) 569 && (GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dra))) 570 == GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (drb))))) 571 return false; 572 573 /* Unknown data dependence. 
*/ 574 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know) 575 { 576 if (dump_enabled_p ()) 577 { 578 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 579 "can't determine dependence between "); 580 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra)); 581 dump_printf (MSG_MISSED_OPTIMIZATION, " and "); 582 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb)); 583 dump_printf (MSG_MISSED_OPTIMIZATION, "\n"); 584 } 585 } 586 else if (dump_enabled_p ()) 587 { 588 dump_printf_loc (MSG_NOTE, vect_location, 589 "determined dependence between "); 590 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra)); 591 dump_printf (MSG_NOTE, " and "); 592 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb)); 593 dump_printf (MSG_NOTE, "\n"); 594 } 595 596 return true; 597 } 598 599 600 /* Analyze dependences involved in the transform of SLP NODE. STORES 601 contain the vector of scalar stores of this instance if we are 602 disambiguating the loads. */ 603 604 static bool 605 vect_slp_analyze_node_dependences (slp_instance instance, slp_tree node, 606 vec<gimple *> stores, gimple *last_store) 607 { 608 /* This walks over all stmts involved in the SLP load/store done 609 in NODE verifying we can sink them up to the last stmt in the 610 group. */ 611 gimple *last_access = vect_find_last_scalar_stmt_in_slp (node); 612 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k) 613 { 614 gimple *access = SLP_TREE_SCALAR_STMTS (node)[k]; 615 if (access == last_access) 616 continue; 617 data_reference *dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (access)); 618 for (gimple_stmt_iterator gsi = gsi_for_stmt (access); 619 gsi_stmt (gsi) != last_access; gsi_next (&gsi)) 620 { 621 gimple *stmt = gsi_stmt (gsi); 622 if (! gimple_vuse (stmt) 623 || (DR_IS_READ (dr_a) && ! gimple_vdef (stmt))) 624 continue; 625 626 /* If we couldn't record a (single) data reference for this 627 stmt we have to give up. */ 628 /* ??? Here and below if dependence analysis fails we can resort 629 to the alias oracle which can handle more kinds of stmts. */ 630 data_reference *dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt)); 631 if (!dr_b) 632 return false; 633 634 bool dependent = false; 635 /* If we run into a store of this same instance (we've just 636 marked those) then delay dependence checking until we run 637 into the last store because this is where it will have 638 been sunk to (and we verify if we can do that as well). */ 639 if (gimple_visited_p (stmt)) 640 { 641 if (stmt != last_store) 642 continue; 643 unsigned i; 644 gimple *store; 645 FOR_EACH_VEC_ELT (stores, i, store) 646 { 647 data_reference *store_dr 648 = STMT_VINFO_DATA_REF (vinfo_for_stmt (store)); 649 ddr_p ddr = initialize_data_dependence_relation 650 (dr_a, store_dr, vNULL); 651 dependent = vect_slp_analyze_data_ref_dependence (ddr); 652 free_dependence_relation (ddr); 653 if (dependent) 654 break; 655 } 656 } 657 else 658 { 659 ddr_p ddr = initialize_data_dependence_relation (dr_a, 660 dr_b, vNULL); 661 dependent = vect_slp_analyze_data_ref_dependence (ddr); 662 free_dependence_relation (ddr); 663 } 664 if (dependent) 665 return false; 666 } 667 } 668 return true; 669 } 670 671 672 /* Function vect_analyze_data_ref_dependences. 673 674 Examine all the data references in the basic-block, and make sure there 675 do not exist any data dependences between them. Set *MAX_VF according to 676 the maximum vectorization factor the data dependences allow. 
*/ 677 678 bool 679 vect_slp_analyze_instance_dependence (slp_instance instance) 680 { 681 if (dump_enabled_p ()) 682 dump_printf_loc (MSG_NOTE, vect_location, 683 "=== vect_slp_analyze_instance_dependence ===\n"); 684 685 /* The stores of this instance are at the root of the SLP tree. */ 686 slp_tree store = SLP_INSTANCE_TREE (instance); 687 if (! STMT_VINFO_DATA_REF (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (store)[0]))) 688 store = NULL; 689 690 /* Verify we can sink stores to the vectorized stmt insert location. */ 691 gimple *last_store = NULL; 692 if (store) 693 { 694 if (! vect_slp_analyze_node_dependences (instance, store, vNULL, NULL)) 695 return false; 696 697 /* Mark stores in this instance and remember the last one. */ 698 last_store = vect_find_last_scalar_stmt_in_slp (store); 699 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k) 700 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k], true); 701 } 702 703 bool res = true; 704 705 /* Verify we can sink loads to the vectorized stmt insert location, 706 special-casing stores of this instance. */ 707 slp_tree load; 708 unsigned int i; 709 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load) 710 if (! vect_slp_analyze_node_dependences (instance, load, 711 store 712 ? SLP_TREE_SCALAR_STMTS (store) 713 : vNULL, last_store)) 714 { 715 res = false; 716 break; 717 } 718 719 /* Unset the visited flag. */ 720 if (store) 721 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k) 722 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k], false); 723 724 return res; 725 } 726 727 /* Function vect_compute_data_ref_alignment 728 729 Compute the misalignment of the data reference DR. 730 731 Output: 732 1. If during the misalignment computation it is found that the data reference 733 cannot be vectorized then false is returned. 734 2. DR_MISALIGNMENT (DR) is defined. 735 736 FOR NOW: No analysis is actually performed. Misalignment is calculated 737 only for trivial cases. TODO. */ 738 739 bool 740 vect_compute_data_ref_alignment (struct data_reference *dr) 741 { 742 gimple *stmt = DR_STMT (dr); 743 stmt_vec_info stmt_info = vinfo_for_stmt (stmt); 744 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); 745 struct loop *loop = NULL; 746 tree ref = DR_REF (dr); 747 tree vectype; 748 tree base, base_addr; 749 tree misalign = NULL_TREE; 750 tree aligned_to; 751 tree step; 752 unsigned HOST_WIDE_INT alignment; 753 754 if (dump_enabled_p ()) 755 dump_printf_loc (MSG_NOTE, vect_location, 756 "vect_compute_data_ref_alignment:\n"); 757 758 if (loop_vinfo) 759 loop = LOOP_VINFO_LOOP (loop_vinfo); 760 761 /* Initialize misalignment to unknown. */ 762 SET_DR_MISALIGNMENT (dr, -1); 763 764 if (tree_fits_shwi_p (DR_STEP (dr))) 765 misalign = DR_INIT (dr); 766 aligned_to = DR_ALIGNED_TO (dr); 767 base_addr = DR_BASE_ADDRESS (dr); 768 vectype = STMT_VINFO_VECTYPE (stmt_info); 769 770 /* In case the dataref is in an inner-loop of the loop that is being 771 vectorized (LOOP), we use the base and misalignment information 772 relative to the outer-loop (LOOP). This is ok only if the misalignment 773 stays the same throughout the execution of the inner-loop, which is why 774 we have to check that the stride of the dataref in the inner-loop evenly 775 divides by the vector size. 
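     For example (hypothetical numbers): with a 16-byte vector type, an
     inner-loop DR_STEP of 32 bytes advances the access by two whole vectors
     per inner iteration, so the misalignment relative to the outer loop is
     the same in every inner iteration; a step of 20 bytes would not have
     that property and the misalignment would have to be treated as unknown.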
  */
  if (loop && nested_in_vect_loop_p (loop, stmt))
    {
      tree step = DR_STEP (dr);

      if (tree_fits_shwi_p (step)
	  && tree_to_shwi (step) % GET_MODE_SIZE (TYPE_MODE (vectype)) == 0)
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_NOTE, vect_location,
                             "inner step divides the vector-size.\n");
	  misalign = STMT_VINFO_DR_INIT (stmt_info);
	  aligned_to = STMT_VINFO_DR_ALIGNED_TO (stmt_info);
	  base_addr = STMT_VINFO_DR_BASE_ADDRESS (stmt_info);
        }
      else
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
	                     "inner step doesn't divide the vector-size.\n");
	  misalign = NULL_TREE;
	}
    }

  /* Similarly we can only use base and misalignment information relative to
     an innermost loop if the misalignment stays the same throughout the
     execution of the loop.  As above, this is the case if the stride of
     the dataref, multiplied by the vectorization factor, is a multiple of
     the vector size.  */
  else
    {
      tree step = DR_STEP (dr);
      unsigned vf = loop ? LOOP_VINFO_VECT_FACTOR (loop_vinfo) : 1;

      if (tree_fits_shwi_p (step)
	  && ((tree_to_shwi (step) * vf)
	      % GET_MODE_SIZE (TYPE_MODE (vectype)) != 0))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
	                     "step doesn't divide the vector-size.\n");
	  misalign = NULL_TREE;
	}
    }

  /* To look at the alignment of the base we have to preserve an inner
     MEM_REF as that carries alignment information of the actual access.  */
  base = ref;
  while (handled_component_p (base))
    base = TREE_OPERAND (base, 0);
  unsigned int base_alignment = 0;
  unsigned HOST_WIDE_INT base_bitpos;
  get_object_alignment_1 (base, &base_alignment, &base_bitpos);
  /* As data-ref analysis strips the MEM_REF down to its base operand
     to form DR_BASE_ADDRESS and adds the offset to DR_INIT we have to
     adjust things to make base_alignment valid as the alignment of
     DR_BASE_ADDRESS.  */
  if (TREE_CODE (base) == MEM_REF)
    {
      /* Note all this only works if DR_BASE_ADDRESS is the same as
	 MEM_REF operand zero, otherwise DR/SCEV analysis might have factored
	 in other offsets.  We need to rework DR to compute the alignment
	 of DR_BASE_ADDRESS as long as all information is still available.  */
      if (operand_equal_p (TREE_OPERAND (base, 0), base_addr, 0))
	{
	  base_bitpos -= mem_ref_offset (base).to_short_addr () * BITS_PER_UNIT;
	  base_bitpos &= (base_alignment - 1);
	}
      else
	base_bitpos = BITS_PER_UNIT;
    }
  if (base_bitpos != 0)
    base_alignment = base_bitpos & -base_bitpos;
  /* Also look at the alignment of the base address DR analysis
     computed.
*/ 849 unsigned int base_addr_alignment = get_pointer_alignment (base_addr); 850 if (base_addr_alignment > base_alignment) 851 base_alignment = base_addr_alignment; 852 853 if (base_alignment >= TYPE_ALIGN (TREE_TYPE (vectype))) 854 DR_VECT_AUX (dr)->base_element_aligned = true; 855 856 alignment = TYPE_ALIGN_UNIT (vectype); 857 858 if ((compare_tree_int (aligned_to, alignment) < 0) 859 || !misalign) 860 { 861 if (dump_enabled_p ()) 862 { 863 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 864 "Unknown alignment for access: "); 865 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref); 866 dump_printf (MSG_MISSED_OPTIMIZATION, "\n"); 867 } 868 return true; 869 } 870 871 if (base_alignment < TYPE_ALIGN (vectype)) 872 { 873 base = base_addr; 874 if (TREE_CODE (base) == ADDR_EXPR) 875 base = TREE_OPERAND (base, 0); 876 if (!vect_can_force_dr_alignment_p (base, TYPE_ALIGN (vectype))) 877 { 878 if (dump_enabled_p ()) 879 { 880 dump_printf_loc (MSG_NOTE, vect_location, 881 "can't force alignment of ref: "); 882 dump_generic_expr (MSG_NOTE, TDF_SLIM, ref); 883 dump_printf (MSG_NOTE, "\n"); 884 } 885 return true; 886 } 887 888 if (DECL_USER_ALIGN (base)) 889 { 890 if (dump_enabled_p ()) 891 { 892 dump_printf_loc (MSG_NOTE, vect_location, 893 "not forcing alignment of user-aligned " 894 "variable: "); 895 dump_generic_expr (MSG_NOTE, TDF_SLIM, base); 896 dump_printf (MSG_NOTE, "\n"); 897 } 898 return true; 899 } 900 901 /* Force the alignment of the decl. 902 NOTE: This is the only change to the code we make during 903 the analysis phase, before deciding to vectorize the loop. */ 904 if (dump_enabled_p ()) 905 { 906 dump_printf_loc (MSG_NOTE, vect_location, "force alignment of "); 907 dump_generic_expr (MSG_NOTE, TDF_SLIM, ref); 908 dump_printf (MSG_NOTE, "\n"); 909 } 910 911 DR_VECT_AUX (dr)->base_decl = base; 912 DR_VECT_AUX (dr)->base_misaligned = true; 913 DR_VECT_AUX (dr)->base_element_aligned = true; 914 } 915 916 if (loop && nested_in_vect_loop_p (loop, stmt)) 917 step = STMT_VINFO_DR_STEP (stmt_info); 918 else 919 step = DR_STEP (dr); 920 /* If this is a backward running DR then first access in the larger 921 vectype actually is N-1 elements before the address in the DR. 922 Adjust misalign accordingly. */ 923 if (tree_int_cst_sgn (step) < 0) 924 { 925 tree offset = ssize_int (TYPE_VECTOR_SUBPARTS (vectype) - 1); 926 /* DR_STEP(dr) is the same as -TYPE_SIZE of the scalar type, 927 otherwise we wouldn't be here. */ 928 offset = fold_build2 (MULT_EXPR, ssizetype, offset, step); 929 /* PLUS because STEP was negative. */ 930 misalign = size_binop (PLUS_EXPR, misalign, offset); 931 } 932 933 SET_DR_MISALIGNMENT (dr, 934 wi::mod_floor (misalign, alignment, SIGNED).to_uhwi ()); 935 936 if (dump_enabled_p ()) 937 { 938 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 939 "misalign = %d bytes of ref ", DR_MISALIGNMENT (dr)); 940 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref); 941 dump_printf (MSG_MISSED_OPTIMIZATION, "\n"); 942 } 943 944 return true; 945 } 946 947 948 /* Function vect_update_misalignment_for_peel 949 950 DR - the data reference whose misalignment is to be adjusted. 951 DR_PEEL - the data reference whose misalignment is being made 952 zero in the vector loop by the peel. 953 NPEEL - the number of iterations in the peel loop if the misalignment 954 of DR_PEEL is known at compile time. 
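   As an illustration (made-up numbers): for a 4-byte element accessed
   through a 16-byte vector type, peeling NPEEL = 3 iterations advances the
   access by 12 bytes, so a known misalignment of 4 becomes
   (4 + 3 * 4) & 15 == 0 and DR ends up aligned in the vector loop.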
*/ 955 956 static void 957 vect_update_misalignment_for_peel (struct data_reference *dr, 958 struct data_reference *dr_peel, int npeel) 959 { 960 unsigned int i; 961 vec<dr_p> same_align_drs; 962 struct data_reference *current_dr; 963 int dr_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr)))); 964 int dr_peel_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr_peel)))); 965 stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr)); 966 stmt_vec_info peel_stmt_info = vinfo_for_stmt (DR_STMT (dr_peel)); 967 968 /* For interleaved data accesses the step in the loop must be multiplied by 969 the size of the interleaving group. */ 970 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)) 971 dr_size *= GROUP_SIZE (vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info))); 972 if (STMT_VINFO_GROUPED_ACCESS (peel_stmt_info)) 973 dr_peel_size *= GROUP_SIZE (peel_stmt_info); 974 975 /* It can be assumed that the data refs with the same alignment as dr_peel 976 are aligned in the vector loop. */ 977 same_align_drs 978 = STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (DR_STMT (dr_peel))); 979 FOR_EACH_VEC_ELT (same_align_drs, i, current_dr) 980 { 981 if (current_dr != dr) 982 continue; 983 gcc_assert (DR_MISALIGNMENT (dr) / dr_size == 984 DR_MISALIGNMENT (dr_peel) / dr_peel_size); 985 SET_DR_MISALIGNMENT (dr, 0); 986 return; 987 } 988 989 if (known_alignment_for_access_p (dr) 990 && known_alignment_for_access_p (dr_peel)) 991 { 992 bool negative = tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0; 993 int misal = DR_MISALIGNMENT (dr); 994 tree vectype = STMT_VINFO_VECTYPE (stmt_info); 995 misal += negative ? -npeel * dr_size : npeel * dr_size; 996 misal &= (TYPE_ALIGN (vectype) / BITS_PER_UNIT) - 1; 997 SET_DR_MISALIGNMENT (dr, misal); 998 return; 999 } 1000 1001 if (dump_enabled_p ()) 1002 dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment to -1.\n"); 1003 SET_DR_MISALIGNMENT (dr, -1); 1004 } 1005 1006 1007 /* Function verify_data_ref_alignment 1008 1009 Return TRUE if DR can be handled with respect to alignment. */ 1010 1011 static bool 1012 verify_data_ref_alignment (data_reference_p dr) 1013 { 1014 enum dr_alignment_support supportable_dr_alignment 1015 = vect_supportable_dr_alignment (dr, false); 1016 if (!supportable_dr_alignment) 1017 { 1018 if (dump_enabled_p ()) 1019 { 1020 if (DR_IS_READ (dr)) 1021 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 1022 "not vectorized: unsupported unaligned load."); 1023 else 1024 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 1025 "not vectorized: unsupported unaligned " 1026 "store."); 1027 1028 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, 1029 DR_REF (dr)); 1030 dump_printf (MSG_MISSED_OPTIMIZATION, "\n"); 1031 } 1032 return false; 1033 } 1034 1035 if (supportable_dr_alignment != dr_aligned && dump_enabled_p ()) 1036 dump_printf_loc (MSG_NOTE, vect_location, 1037 "Vectorizing an unaligned access.\n"); 1038 1039 return true; 1040 } 1041 1042 /* Function vect_verify_datarefs_alignment 1043 1044 Return TRUE if all data references in the loop can be 1045 handled with respect to alignment. 
*/ 1046 1047 bool 1048 vect_verify_datarefs_alignment (loop_vec_info vinfo) 1049 { 1050 vec<data_reference_p> datarefs = vinfo->datarefs; 1051 struct data_reference *dr; 1052 unsigned int i; 1053 1054 FOR_EACH_VEC_ELT (datarefs, i, dr) 1055 { 1056 gimple *stmt = DR_STMT (dr); 1057 stmt_vec_info stmt_info = vinfo_for_stmt (stmt); 1058 1059 if (!STMT_VINFO_RELEVANT_P (stmt_info)) 1060 continue; 1061 1062 /* For interleaving, only the alignment of the first access matters. */ 1063 if (STMT_VINFO_GROUPED_ACCESS (stmt_info) 1064 && GROUP_FIRST_ELEMENT (stmt_info) != stmt) 1065 continue; 1066 1067 /* Strided accesses perform only component accesses, alignment is 1068 irrelevant for them. */ 1069 if (STMT_VINFO_STRIDED_P (stmt_info) 1070 && !STMT_VINFO_GROUPED_ACCESS (stmt_info)) 1071 continue; 1072 1073 if (! verify_data_ref_alignment (dr)) 1074 return false; 1075 } 1076 1077 return true; 1078 } 1079 1080 /* Given an memory reference EXP return whether its alignment is less 1081 than its size. */ 1082 1083 static bool 1084 not_size_aligned (tree exp) 1085 { 1086 if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp)))) 1087 return true; 1088 1089 return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp))) 1090 > get_object_alignment (exp)); 1091 } 1092 1093 /* Function vector_alignment_reachable_p 1094 1095 Return true if vector alignment for DR is reachable by peeling 1096 a few loop iterations. Return false otherwise. */ 1097 1098 static bool 1099 vector_alignment_reachable_p (struct data_reference *dr) 1100 { 1101 gimple *stmt = DR_STMT (dr); 1102 stmt_vec_info stmt_info = vinfo_for_stmt (stmt); 1103 tree vectype = STMT_VINFO_VECTYPE (stmt_info); 1104 1105 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)) 1106 { 1107 /* For interleaved access we peel only if number of iterations in 1108 the prolog loop ({VF - misalignment}), is a multiple of the 1109 number of the interleaved accesses. */ 1110 int elem_size, mis_in_elements; 1111 int nelements = TYPE_VECTOR_SUBPARTS (vectype); 1112 1113 /* FORNOW: handle only known alignment. */ 1114 if (!known_alignment_for_access_p (dr)) 1115 return false; 1116 1117 elem_size = GET_MODE_SIZE (TYPE_MODE (vectype)) / nelements; 1118 mis_in_elements = DR_MISALIGNMENT (dr) / elem_size; 1119 1120 if ((nelements - mis_in_elements) % GROUP_SIZE (stmt_info)) 1121 return false; 1122 } 1123 1124 /* If misalignment is known at the compile time then allow peeling 1125 only if natural alignment is reachable through peeling. */ 1126 if (known_alignment_for_access_p (dr) && !aligned_access_p (dr)) 1127 { 1128 HOST_WIDE_INT elmsize = 1129 int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype))); 1130 if (dump_enabled_p ()) 1131 { 1132 dump_printf_loc (MSG_NOTE, vect_location, 1133 "data size =" HOST_WIDE_INT_PRINT_DEC, elmsize); 1134 dump_printf (MSG_NOTE, 1135 ". misalignment = %d.\n", DR_MISALIGNMENT (dr)); 1136 } 1137 if (DR_MISALIGNMENT (dr) % elmsize) 1138 { 1139 if (dump_enabled_p ()) 1140 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 1141 "data size does not divide the misalignment.\n"); 1142 return false; 1143 } 1144 } 1145 1146 if (!known_alignment_for_access_p (dr)) 1147 { 1148 tree type = TREE_TYPE (DR_REF (dr)); 1149 bool is_packed = not_size_aligned (DR_REF (dr)); 1150 if (dump_enabled_p ()) 1151 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 1152 "Unknown misalignment, %snaturally aligned\n", 1153 is_packed ? 
"not " : ""); 1154 return targetm.vectorize.vector_alignment_reachable (type, is_packed); 1155 } 1156 1157 return true; 1158 } 1159 1160 1161 /* Calculate the cost of the memory access represented by DR. */ 1162 1163 static void 1164 vect_get_data_access_cost (struct data_reference *dr, 1165 unsigned int *inside_cost, 1166 unsigned int *outside_cost, 1167 stmt_vector_for_cost *body_cost_vec) 1168 { 1169 gimple *stmt = DR_STMT (dr); 1170 stmt_vec_info stmt_info = vinfo_for_stmt (stmt); 1171 int nunits = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)); 1172 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); 1173 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo); 1174 int ncopies = vf / nunits; 1175 1176 if (DR_IS_READ (dr)) 1177 vect_get_load_cost (dr, ncopies, true, inside_cost, outside_cost, 1178 NULL, body_cost_vec, false); 1179 else 1180 vect_get_store_cost (dr, ncopies, inside_cost, body_cost_vec); 1181 1182 if (dump_enabled_p ()) 1183 dump_printf_loc (MSG_NOTE, vect_location, 1184 "vect_get_data_access_cost: inside_cost = %d, " 1185 "outside_cost = %d.\n", *inside_cost, *outside_cost); 1186 } 1187 1188 1189 typedef struct _vect_peel_info 1190 { 1191 struct data_reference *dr; 1192 int npeel; 1193 unsigned int count; 1194 } *vect_peel_info; 1195 1196 typedef struct _vect_peel_extended_info 1197 { 1198 struct _vect_peel_info peel_info; 1199 unsigned int inside_cost; 1200 unsigned int outside_cost; 1201 stmt_vector_for_cost body_cost_vec; 1202 } *vect_peel_extended_info; 1203 1204 1205 /* Peeling hashtable helpers. */ 1206 1207 struct peel_info_hasher : free_ptr_hash <_vect_peel_info> 1208 { 1209 static inline hashval_t hash (const _vect_peel_info *); 1210 static inline bool equal (const _vect_peel_info *, const _vect_peel_info *); 1211 }; 1212 1213 inline hashval_t 1214 peel_info_hasher::hash (const _vect_peel_info *peel_info) 1215 { 1216 return (hashval_t) peel_info->npeel; 1217 } 1218 1219 inline bool 1220 peel_info_hasher::equal (const _vect_peel_info *a, const _vect_peel_info *b) 1221 { 1222 return (a->npeel == b->npeel); 1223 } 1224 1225 1226 /* Insert DR into peeling hash table with NPEEL as key. */ 1227 1228 static void 1229 vect_peeling_hash_insert (hash_table<peel_info_hasher> *peeling_htab, 1230 loop_vec_info loop_vinfo, struct data_reference *dr, 1231 int npeel) 1232 { 1233 struct _vect_peel_info elem, *slot; 1234 _vect_peel_info **new_slot; 1235 bool supportable_dr_alignment = vect_supportable_dr_alignment (dr, true); 1236 1237 elem.npeel = npeel; 1238 slot = peeling_htab->find (&elem); 1239 if (slot) 1240 slot->count++; 1241 else 1242 { 1243 slot = XNEW (struct _vect_peel_info); 1244 slot->npeel = npeel; 1245 slot->dr = dr; 1246 slot->count = 1; 1247 new_slot = peeling_htab->find_slot (slot, INSERT); 1248 *new_slot = slot; 1249 } 1250 1251 if (!supportable_dr_alignment 1252 && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo))) 1253 slot->count += VECT_MAX_COST; 1254 } 1255 1256 1257 /* Traverse peeling hash table to find peeling option that aligns maximum 1258 number of data accesses. 
*/ 1259 1260 int 1261 vect_peeling_hash_get_most_frequent (_vect_peel_info **slot, 1262 _vect_peel_extended_info *max) 1263 { 1264 vect_peel_info elem = *slot; 1265 1266 if (elem->count > max->peel_info.count 1267 || (elem->count == max->peel_info.count 1268 && max->peel_info.npeel > elem->npeel)) 1269 { 1270 max->peel_info.npeel = elem->npeel; 1271 max->peel_info.count = elem->count; 1272 max->peel_info.dr = elem->dr; 1273 } 1274 1275 return 1; 1276 } 1277 1278 1279 /* Traverse peeling hash table and calculate cost for each peeling option. 1280 Find the one with the lowest cost. */ 1281 1282 int 1283 vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot, 1284 _vect_peel_extended_info *min) 1285 { 1286 vect_peel_info elem = *slot; 1287 int save_misalignment, dummy; 1288 unsigned int inside_cost = 0, outside_cost = 0, i; 1289 gimple *stmt = DR_STMT (elem->dr); 1290 stmt_vec_info stmt_info = vinfo_for_stmt (stmt); 1291 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); 1292 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo); 1293 struct data_reference *dr; 1294 stmt_vector_for_cost prologue_cost_vec, body_cost_vec, epilogue_cost_vec; 1295 1296 prologue_cost_vec.create (2); 1297 body_cost_vec.create (2); 1298 epilogue_cost_vec.create (2); 1299 1300 FOR_EACH_VEC_ELT (datarefs, i, dr) 1301 { 1302 stmt = DR_STMT (dr); 1303 stmt_info = vinfo_for_stmt (stmt); 1304 /* For interleaving, only the alignment of the first access 1305 matters. */ 1306 if (STMT_VINFO_GROUPED_ACCESS (stmt_info) 1307 && GROUP_FIRST_ELEMENT (stmt_info) != stmt) 1308 continue; 1309 1310 /* Strided accesses perform only component accesses, alignment is 1311 irrelevant for them. */ 1312 if (STMT_VINFO_STRIDED_P (stmt_info) 1313 && !STMT_VINFO_GROUPED_ACCESS (stmt_info)) 1314 continue; 1315 1316 save_misalignment = DR_MISALIGNMENT (dr); 1317 vect_update_misalignment_for_peel (dr, elem->dr, elem->npeel); 1318 vect_get_data_access_cost (dr, &inside_cost, &outside_cost, 1319 &body_cost_vec); 1320 SET_DR_MISALIGNMENT (dr, save_misalignment); 1321 } 1322 1323 outside_cost += vect_get_known_peeling_cost 1324 (loop_vinfo, elem->npeel, &dummy, 1325 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), 1326 &prologue_cost_vec, &epilogue_cost_vec); 1327 1328 /* Prologue and epilogue costs are added to the target model later. 1329 These costs depend only on the scalar iteration cost, the 1330 number of peeling iterations finally chosen, and the number of 1331 misaligned statements. So discard the information found here. */ 1332 prologue_cost_vec.release (); 1333 epilogue_cost_vec.release (); 1334 1335 if (inside_cost < min->inside_cost 1336 || (inside_cost == min->inside_cost && outside_cost < min->outside_cost)) 1337 { 1338 min->inside_cost = inside_cost; 1339 min->outside_cost = outside_cost; 1340 min->body_cost_vec.release (); 1341 min->body_cost_vec = body_cost_vec; 1342 min->peel_info.dr = elem->dr; 1343 min->peel_info.npeel = elem->npeel; 1344 } 1345 else 1346 body_cost_vec.release (); 1347 1348 return 1; 1349 } 1350 1351 1352 /* Choose best peeling option by traversing peeling hash table and either 1353 choosing an option with the lowest cost (if cost model is enabled) or the 1354 option that aligns as many accesses as possible. 
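   For instance (an illustrative scenario): if peeling 2 iterations would
   align three data references while peeling 5 would align only one, the
   most-frequent heuristic picks npeel == 2; with the cost model enabled the
   choice is instead driven by the inside/outside costs accumulated by
   vect_peeling_hash_get_lowest_cost.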
*/ 1355 1356 static struct data_reference * 1357 vect_peeling_hash_choose_best_peeling (hash_table<peel_info_hasher> *peeling_htab, 1358 loop_vec_info loop_vinfo, 1359 unsigned int *npeel, 1360 stmt_vector_for_cost *body_cost_vec) 1361 { 1362 struct _vect_peel_extended_info res; 1363 1364 res.peel_info.dr = NULL; 1365 res.body_cost_vec = stmt_vector_for_cost (); 1366 1367 if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo))) 1368 { 1369 res.inside_cost = INT_MAX; 1370 res.outside_cost = INT_MAX; 1371 peeling_htab->traverse <_vect_peel_extended_info *, 1372 vect_peeling_hash_get_lowest_cost> (&res); 1373 } 1374 else 1375 { 1376 res.peel_info.count = 0; 1377 peeling_htab->traverse <_vect_peel_extended_info *, 1378 vect_peeling_hash_get_most_frequent> (&res); 1379 } 1380 1381 *npeel = res.peel_info.npeel; 1382 *body_cost_vec = res.body_cost_vec; 1383 return res.peel_info.dr; 1384 } 1385 1386 1387 /* Function vect_enhance_data_refs_alignment 1388 1389 This pass will use loop versioning and loop peeling in order to enhance 1390 the alignment of data references in the loop. 1391 1392 FOR NOW: we assume that whatever versioning/peeling takes place, only the 1393 original loop is to be vectorized. Any other loops that are created by 1394 the transformations performed in this pass - are not supposed to be 1395 vectorized. This restriction will be relaxed. 1396 1397 This pass will require a cost model to guide it whether to apply peeling 1398 or versioning or a combination of the two. For example, the scheme that 1399 intel uses when given a loop with several memory accesses, is as follows: 1400 choose one memory access ('p') which alignment you want to force by doing 1401 peeling. Then, either (1) generate a loop in which 'p' is aligned and all 1402 other accesses are not necessarily aligned, or (2) use loop versioning to 1403 generate one loop in which all accesses are aligned, and another loop in 1404 which only 'p' is necessarily aligned. 1405 1406 ("Automatic Intra-Register Vectorization for the Intel Architecture", 1407 Aart J.C. Bik, Milind Girkar, Paul M. Grey and Ximmin Tian, International 1408 Journal of Parallel Programming, Vol. 30, No. 2, April 2002.) 1409 1410 Devising a cost model is the most critical aspect of this work. It will 1411 guide us on which access to peel for, whether to use loop versioning, how 1412 many versions to create, etc. The cost model will probably consist of 1413 generic considerations as well as target specific considerations (on 1414 powerpc for example, misaligned stores are more painful than misaligned 1415 loads). 1416 1417 Here are the general steps involved in alignment enhancements: 1418 1419 -- original loop, before alignment analysis: 1420 for (i=0; i<N; i++){ 1421 x = q[i]; # DR_MISALIGNMENT(q) = unknown 1422 p[i] = y; # DR_MISALIGNMENT(p) = unknown 1423 } 1424 1425 -- After vect_compute_data_refs_alignment: 1426 for (i=0; i<N; i++){ 1427 x = q[i]; # DR_MISALIGNMENT(q) = 3 1428 p[i] = y; # DR_MISALIGNMENT(p) = unknown 1429 } 1430 1431 -- Possibility 1: we do loop versioning: 1432 if (p is aligned) { 1433 for (i=0; i<N; i++){ # loop 1A 1434 x = q[i]; # DR_MISALIGNMENT(q) = 3 1435 p[i] = y; # DR_MISALIGNMENT(p) = 0 1436 } 1437 } 1438 else { 1439 for (i=0; i<N; i++){ # loop 1B 1440 x = q[i]; # DR_MISALIGNMENT(q) = 3 1441 p[i] = y; # DR_MISALIGNMENT(p) = unaligned 1442 } 1443 } 1444 1445 -- Possibility 2: we do loop peeling: 1446 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized). 
1447 x = q[i]; 1448 p[i] = y; 1449 } 1450 for (i = 3; i < N; i++){ # loop 2A 1451 x = q[i]; # DR_MISALIGNMENT(q) = 0 1452 p[i] = y; # DR_MISALIGNMENT(p) = unknown 1453 } 1454 1455 -- Possibility 3: combination of loop peeling and versioning: 1456 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized). 1457 x = q[i]; 1458 p[i] = y; 1459 } 1460 if (p is aligned) { 1461 for (i = 3; i<N; i++){ # loop 3A 1462 x = q[i]; # DR_MISALIGNMENT(q) = 0 1463 p[i] = y; # DR_MISALIGNMENT(p) = 0 1464 } 1465 } 1466 else { 1467 for (i = 3; i<N; i++){ # loop 3B 1468 x = q[i]; # DR_MISALIGNMENT(q) = 0 1469 p[i] = y; # DR_MISALIGNMENT(p) = unaligned 1470 } 1471 } 1472 1473 These loops are later passed to loop_transform to be vectorized. The 1474 vectorizer will use the alignment information to guide the transformation 1475 (whether to generate regular loads/stores, or with special handling for 1476 misalignment). */ 1477 1478 bool 1479 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo) 1480 { 1481 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo); 1482 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); 1483 enum dr_alignment_support supportable_dr_alignment; 1484 struct data_reference *dr0 = NULL, *first_store = NULL; 1485 struct data_reference *dr; 1486 unsigned int i, j; 1487 bool do_peeling = false; 1488 bool do_versioning = false; 1489 bool stat; 1490 gimple *stmt; 1491 stmt_vec_info stmt_info; 1492 unsigned int npeel = 0; 1493 bool all_misalignments_unknown = true; 1494 unsigned int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo); 1495 unsigned possible_npeel_number = 1; 1496 tree vectype; 1497 unsigned int nelements, mis, same_align_drs_max = 0; 1498 stmt_vector_for_cost body_cost_vec = stmt_vector_for_cost (); 1499 hash_table<peel_info_hasher> peeling_htab (1); 1500 1501 if (dump_enabled_p ()) 1502 dump_printf_loc (MSG_NOTE, vect_location, 1503 "=== vect_enhance_data_refs_alignment ===\n"); 1504 1505 /* Reset data so we can safely be called multiple times. */ 1506 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0); 1507 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = 0; 1508 1509 /* While cost model enhancements are expected in the future, the high level 1510 view of the code at this time is as follows: 1511 1512 A) If there is a misaligned access then see if peeling to align 1513 this access can make all data references satisfy 1514 vect_supportable_dr_alignment. If so, update data structures 1515 as needed and return true. 1516 1517 B) If peeling wasn't possible and there is a data reference with an 1518 unknown misalignment that does not satisfy vect_supportable_dr_alignment 1519 then see if loop versioning checks can be used to make all data 1520 references satisfy vect_supportable_dr_alignment. If so, update 1521 data structures as needed and return true. 1522 1523 C) If neither peeling nor versioning were successful then return false if 1524 any data reference does not satisfy vect_supportable_dr_alignment. 1525 1526 D) Return true (all data references satisfy vect_supportable_dr_alignment). 1527 1528 Note, Possibility 3 above (which is peeling and versioning together) is not 1529 being done at this time. */ 1530 1531 /* (1) Peeling to force alignment. */ 1532 1533 /* (1.1) Decide whether to perform peeling, and how many iterations to peel: 1534 Considerations: 1535 + How many accesses will become aligned due to the peeling 1536 - How many accesses will become unaligned due to the peeling, 1537 and the cost of misaligned accesses. 
     - The cost of peeling (the extra runtime checks, the increase
       in code size).  */

  FOR_EACH_VEC_ELT (datarefs, i, dr)
    {
      stmt = DR_STMT (dr);
      stmt_info = vinfo_for_stmt (stmt);

      if (!STMT_VINFO_RELEVANT_P (stmt_info))
	continue;

      /* For interleaving, only the alignment of the first access
         matters.  */
      if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
	  && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
	continue;

      /* For invariant accesses there is nothing to enhance.  */
      if (integer_zerop (DR_STEP (dr)))
	continue;

      /* Strided accesses perform only component accesses, alignment is
	 irrelevant for them.  */
      if (STMT_VINFO_STRIDED_P (stmt_info)
	  && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
	continue;

      supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
      do_peeling = vector_alignment_reachable_p (dr);
      if (do_peeling)
        {
          if (known_alignment_for_access_p (dr))
            {
	      unsigned int npeel_tmp;
	      bool negative = tree_int_cst_compare (DR_STEP (dr),
						    size_zero_node) < 0;

              /* Save info about DR in the hash table.  */
              vectype = STMT_VINFO_VECTYPE (stmt_info);
              nelements = TYPE_VECTOR_SUBPARTS (vectype);
              mis = DR_MISALIGNMENT (dr) / GET_MODE_SIZE (TYPE_MODE (
                                                TREE_TYPE (DR_REF (dr))));
              npeel_tmp = (negative
			   ? (mis - nelements) : (nelements - mis))
		  & (nelements - 1);

              /* For multiple types, it is possible that the bigger type access
                 will have more than one peeling option.  E.g., a loop with two
                 types: one of size (vector size / 4), and the other one of
                 size (vector size / 8).  The vectorization factor will be 8.
                 If both accesses are misaligned by 3, the first one needs one
                 scalar iteration to be aligned, and the second one needs 5.
                 But the first one will also be aligned by peeling 5 scalar
                 iterations, and in that case both accesses will be aligned.
                 Hence, except for the immediate peeling amount, we also want
                 to try to add a full vector size, while we don't exceed the
                 vectorization factor.
                 We do this automatically for the cost model, since we
                 calculate the cost for every peeling option.  */
              if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
                {
                  if (STMT_SLP_TYPE (stmt_info))
                    possible_npeel_number
                      = (vf * GROUP_SIZE (stmt_info)) / nelements;
                  else
                    possible_npeel_number = vf / nelements;
                }

              /* Handle the aligned case.  We may decide to align some other
                 access, making DR unaligned.  */
              if (DR_MISALIGNMENT (dr) == 0)
                {
                  npeel_tmp = 0;
                  if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
                    possible_npeel_number++;
                }

	      for (j = 0; j < possible_npeel_number; j++)
		{
		  vect_peeling_hash_insert (&peeling_htab, loop_vinfo,
					    dr, npeel_tmp);
                  npeel_tmp += nelements;
		}

              all_misalignments_unknown = false;
              /* Data-ref that was chosen for the case that all the
                 misalignments are unknown is not relevant anymore, since we
                 have a data-ref with known alignment.  */
              dr0 = NULL;
            }
          else
            {
              /* If we don't know any misalignment values, we prefer
                 peeling for the data-ref that has the maximum number of
                 data-refs with the same alignment, unless the target prefers
                 to align stores over loads.
*/ 1634 if (all_misalignments_unknown) 1635 { 1636 unsigned same_align_drs 1637 = STMT_VINFO_SAME_ALIGN_REFS (stmt_info).length (); 1638 if (!dr0 1639 || same_align_drs_max < same_align_drs) 1640 { 1641 same_align_drs_max = same_align_drs; 1642 dr0 = dr; 1643 } 1644 /* For data-refs with the same number of related 1645 accesses prefer the one where the misalign 1646 computation will be invariant in the outermost loop. */ 1647 else if (same_align_drs_max == same_align_drs) 1648 { 1649 struct loop *ivloop0, *ivloop; 1650 ivloop0 = outermost_invariant_loop_for_expr 1651 (loop, DR_BASE_ADDRESS (dr0)); 1652 ivloop = outermost_invariant_loop_for_expr 1653 (loop, DR_BASE_ADDRESS (dr)); 1654 if ((ivloop && !ivloop0) 1655 || (ivloop && ivloop0 1656 && flow_loop_nested_p (ivloop, ivloop0))) 1657 dr0 = dr; 1658 } 1659 1660 if (!first_store && DR_IS_WRITE (dr)) 1661 first_store = dr; 1662 } 1663 1664 /* If there are both known and unknown misaligned accesses in the 1665 loop, we choose peeling amount according to the known 1666 accesses. */ 1667 if (!supportable_dr_alignment) 1668 { 1669 dr0 = dr; 1670 if (!first_store && DR_IS_WRITE (dr)) 1671 first_store = dr; 1672 } 1673 } 1674 } 1675 else 1676 { 1677 if (!aligned_access_p (dr)) 1678 { 1679 if (dump_enabled_p ()) 1680 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 1681 "vector alignment may not be reachable\n"); 1682 break; 1683 } 1684 } 1685 } 1686 1687 /* Check if we can possibly peel the loop. */ 1688 if (!vect_can_advance_ivs_p (loop_vinfo) 1689 || !slpeel_can_duplicate_loop_p (loop, single_exit (loop)) 1690 || loop->inner) 1691 do_peeling = false; 1692 1693 if (do_peeling 1694 && all_misalignments_unknown 1695 && vect_supportable_dr_alignment (dr0, false)) 1696 { 1697 /* Check if the target requires to prefer stores over loads, i.e., if 1698 misaligned stores are more expensive than misaligned loads (taking 1699 drs with same alignment into account). */ 1700 if (first_store && DR_IS_READ (dr0)) 1701 { 1702 unsigned int load_inside_cost = 0, load_outside_cost = 0; 1703 unsigned int store_inside_cost = 0, store_outside_cost = 0; 1704 unsigned int load_inside_penalty = 0, load_outside_penalty = 0; 1705 unsigned int store_inside_penalty = 0, store_outside_penalty = 0; 1706 stmt_vector_for_cost dummy; 1707 dummy.create (2); 1708 1709 vect_get_data_access_cost (dr0, &load_inside_cost, &load_outside_cost, 1710 &dummy); 1711 vect_get_data_access_cost (first_store, &store_inside_cost, 1712 &store_outside_cost, &dummy); 1713 1714 dummy.release (); 1715 1716 /* Calculate the penalty for leaving FIRST_STORE unaligned (by 1717 aligning the load DR0). */ 1718 load_inside_penalty = store_inside_cost; 1719 load_outside_penalty = store_outside_cost; 1720 for (i = 0; 1721 STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt ( 1722 DR_STMT (first_store))).iterate (i, &dr); 1723 i++) 1724 if (DR_IS_READ (dr)) 1725 { 1726 load_inside_penalty += load_inside_cost; 1727 load_outside_penalty += load_outside_cost; 1728 } 1729 else 1730 { 1731 load_inside_penalty += store_inside_cost; 1732 load_outside_penalty += store_outside_cost; 1733 } 1734 1735 /* Calculate the penalty for leaving DR0 unaligned (by 1736 aligning the FIRST_STORE). 
*/ 1737 store_inside_penalty = load_inside_cost; 1738 store_outside_penalty = load_outside_cost; 1739 for (i = 0; 1740 STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt ( 1741 DR_STMT (dr0))).iterate (i, &dr); 1742 i++) 1743 if (DR_IS_READ (dr)) 1744 { 1745 store_inside_penalty += load_inside_cost; 1746 store_outside_penalty += load_outside_cost; 1747 } 1748 else 1749 { 1750 store_inside_penalty += store_inside_cost; 1751 store_outside_penalty += store_outside_cost; 1752 } 1753 1754 if (load_inside_penalty > store_inside_penalty 1755 || (load_inside_penalty == store_inside_penalty 1756 && load_outside_penalty > store_outside_penalty)) 1757 dr0 = first_store; 1758 } 1759 1760 /* In case there are only loads with different unknown misalignments, use 1761 peeling only if it may help to align other accesses in the loop or 1762 if it may help improving load bandwith when we'd end up using 1763 unaligned loads. */ 1764 tree dr0_vt = STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr0))); 1765 if (!first_store 1766 && !STMT_VINFO_SAME_ALIGN_REFS ( 1767 vinfo_for_stmt (DR_STMT (dr0))).length () 1768 && (vect_supportable_dr_alignment (dr0, false) 1769 != dr_unaligned_supported 1770 || (builtin_vectorization_cost (vector_load, dr0_vt, 0) 1771 == builtin_vectorization_cost (unaligned_load, dr0_vt, -1)))) 1772 do_peeling = false; 1773 } 1774 1775 if (do_peeling && !dr0) 1776 { 1777 /* Peeling is possible, but there is no data access that is not supported 1778 unless aligned. So we try to choose the best possible peeling. */ 1779 1780 /* We should get here only if there are drs with known misalignment. */ 1781 gcc_assert (!all_misalignments_unknown); 1782 1783 /* Choose the best peeling from the hash table. */ 1784 dr0 = vect_peeling_hash_choose_best_peeling (&peeling_htab, 1785 loop_vinfo, &npeel, 1786 &body_cost_vec); 1787 if (!dr0 || !npeel) 1788 do_peeling = false; 1789 } 1790 1791 if (do_peeling) 1792 { 1793 stmt = DR_STMT (dr0); 1794 stmt_info = vinfo_for_stmt (stmt); 1795 vectype = STMT_VINFO_VECTYPE (stmt_info); 1796 nelements = TYPE_VECTOR_SUBPARTS (vectype); 1797 1798 if (known_alignment_for_access_p (dr0)) 1799 { 1800 bool negative = tree_int_cst_compare (DR_STEP (dr0), 1801 size_zero_node) < 0; 1802 if (!npeel) 1803 { 1804 /* Since it's known at compile time, compute the number of 1805 iterations in the peeled loop (the peeling factor) for use in 1806 updating DR_MISALIGNMENT values. The peeling factor is the 1807 vectorization factor minus the misalignment as an element 1808 count. */ 1809 mis = DR_MISALIGNMENT (dr0); 1810 mis /= GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr0)))); 1811 npeel = ((negative ? mis - nelements : nelements - mis) 1812 & (nelements - 1)); 1813 } 1814 1815 /* For interleaved data access every iteration accesses all the 1816 members of the group, therefore we divide the number of iterations 1817 by the group size. */ 1818 stmt_info = vinfo_for_stmt (DR_STMT (dr0)); 1819 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)) 1820 npeel /= GROUP_SIZE (stmt_info); 1821 1822 if (dump_enabled_p ()) 1823 dump_printf_loc (MSG_NOTE, vect_location, 1824 "Try peeling by %d\n", npeel); 1825 } 1826 1827 /* Ensure that all data refs can be vectorized after the peel. */ 1828 FOR_EACH_VEC_ELT (datarefs, i, dr) 1829 { 1830 int save_misalignment; 1831 1832 if (dr == dr0) 1833 continue; 1834 1835 stmt = DR_STMT (dr); 1836 stmt_info = vinfo_for_stmt (stmt); 1837 /* For interleaving, only the alignment of the first access 1838 matters. 
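E.g. (illustrative), for a group that loads a[2*i] and a[2*i+1], the
peeling amount is computed from a[2*i] alone; once the first element is
aligned, the positions of the remaining group members are fixed
relative to it.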
*/ 1839 if (STMT_VINFO_GROUPED_ACCESS (stmt_info) 1840 && GROUP_FIRST_ELEMENT (stmt_info) != stmt) 1841 continue; 1842 1843 /* Strided accesses perform only component accesses, alignment is 1844 irrelevant for them. */ 1845 if (STMT_VINFO_STRIDED_P (stmt_info) 1846 && !STMT_VINFO_GROUPED_ACCESS (stmt_info)) 1847 continue; 1848 1849 save_misalignment = DR_MISALIGNMENT (dr); 1850 vect_update_misalignment_for_peel (dr, dr0, npeel); 1851 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false); 1852 SET_DR_MISALIGNMENT (dr, save_misalignment); 1853 1854 if (!supportable_dr_alignment) 1855 { 1856 do_peeling = false; 1857 break; 1858 } 1859 } 1860 1861 if (do_peeling && known_alignment_for_access_p (dr0) && npeel == 0) 1862 { 1863 stat = vect_verify_datarefs_alignment (loop_vinfo); 1864 if (!stat) 1865 do_peeling = false; 1866 else 1867 { 1868 body_cost_vec.release (); 1869 return stat; 1870 } 1871 } 1872 1873 /* Cost model #1 - honor --param vect-max-peeling-for-alignment. */ 1874 if (do_peeling) 1875 { 1876 unsigned max_allowed_peel 1877 = PARAM_VALUE (PARAM_VECT_MAX_PEELING_FOR_ALIGNMENT); 1878 if (max_allowed_peel != (unsigned)-1) 1879 { 1880 unsigned max_peel = npeel; 1881 if (max_peel == 0) 1882 { 1883 gimple *dr_stmt = DR_STMT (dr0); 1884 stmt_vec_info vinfo = vinfo_for_stmt (dr_stmt); 1885 tree vtype = STMT_VINFO_VECTYPE (vinfo); 1886 max_peel = TYPE_VECTOR_SUBPARTS (vtype) - 1; 1887 } 1888 if (max_peel > max_allowed_peel) 1889 { 1890 do_peeling = false; 1891 if (dump_enabled_p ()) 1892 dump_printf_loc (MSG_NOTE, vect_location, 1893 "Disable peeling, max peels reached: %d\n", max_peel); 1894 } 1895 } 1896 } 1897 1898 /* Cost model #2 - if peeling may result in a remaining loop not 1899 iterating enough to be vectorized then do not peel. */ 1900 if (do_peeling 1901 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)) 1902 { 1903 unsigned max_peel 1904 = npeel == 0 ? LOOP_VINFO_VECT_FACTOR (loop_vinfo) - 1 : npeel; 1905 if (LOOP_VINFO_INT_NITERS (loop_vinfo) 1906 < LOOP_VINFO_VECT_FACTOR (loop_vinfo) + max_peel) 1907 do_peeling = false; 1908 } 1909 1910 if (do_peeling) 1911 { 1912 /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i. 1913 If the misalignment of DR_i is identical to that of dr0 then set 1914 DR_MISALIGNMENT (DR_i) to zero. If the misalignment of DR_i and 1915 dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i) 1916 by the peeling factor times the element size of DR_i (MOD the 1917 vectorization factor times the size). Otherwise, the 1918 misalignment of DR_i must be set to unknown. */ 1919 FOR_EACH_VEC_ELT (datarefs, i, dr) 1920 if (dr != dr0) 1921 { 1922 /* Strided accesses perform only component accesses, alignment 1923 is irrelevant for them. 
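(Roughly speaking, such accesses are implemented as separate element
accesses at runtime-computed offsets, so there is no full-width vector
memory access whose alignment peeling could improve.)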
*/ 1924 stmt_info = vinfo_for_stmt (DR_STMT (dr)); 1925 if (STMT_VINFO_STRIDED_P (stmt_info) 1926 && !STMT_VINFO_GROUPED_ACCESS (stmt_info)) 1927 continue; 1928 1929 vect_update_misalignment_for_peel (dr, dr0, npeel); 1930 } 1931 1932 LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0; 1933 if (npeel) 1934 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel; 1935 else 1936 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) 1937 = DR_MISALIGNMENT (dr0); 1938 SET_DR_MISALIGNMENT (dr0, 0); 1939 if (dump_enabled_p ()) 1940 { 1941 dump_printf_loc (MSG_NOTE, vect_location, 1942 "Alignment of access forced using peeling.\n"); 1943 dump_printf_loc (MSG_NOTE, vect_location, 1944 "Peeling for alignment will be applied.\n"); 1945 } 1946 /* The inside-loop cost will be accounted for in vectorizable_load 1947 and vectorizable_store correctly with adjusted alignments. 1948 Drop the body_cst_vec on the floor here. */ 1949 body_cost_vec.release (); 1950 1951 stat = vect_verify_datarefs_alignment (loop_vinfo); 1952 gcc_assert (stat); 1953 return stat; 1954 } 1955 } 1956 1957 body_cost_vec.release (); 1958 1959 /* (2) Versioning to force alignment. */ 1960 1961 /* Try versioning if: 1962 1) optimize loop for speed 1963 2) there is at least one unsupported misaligned data ref with an unknown 1964 misalignment, and 1965 3) all misaligned data refs with a known misalignment are supported, and 1966 4) the number of runtime alignment checks is within reason. */ 1967 1968 do_versioning = 1969 optimize_loop_nest_for_speed_p (loop) 1970 && (!loop->inner); /* FORNOW */ 1971 1972 if (do_versioning) 1973 { 1974 FOR_EACH_VEC_ELT (datarefs, i, dr) 1975 { 1976 stmt = DR_STMT (dr); 1977 stmt_info = vinfo_for_stmt (stmt); 1978 1979 /* For interleaving, only the alignment of the first access 1980 matters. */ 1981 if (aligned_access_p (dr) 1982 || (STMT_VINFO_GROUPED_ACCESS (stmt_info) 1983 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)) 1984 continue; 1985 1986 if (STMT_VINFO_STRIDED_P (stmt_info)) 1987 { 1988 /* Strided loads perform only component accesses, alignment is 1989 irrelevant for them. */ 1990 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info)) 1991 continue; 1992 do_versioning = false; 1993 break; 1994 } 1995 1996 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false); 1997 1998 if (!supportable_dr_alignment) 1999 { 2000 gimple *stmt; 2001 int mask; 2002 tree vectype; 2003 2004 if (known_alignment_for_access_p (dr) 2005 || LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length () 2006 >= (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS)) 2007 { 2008 do_versioning = false; 2009 break; 2010 } 2011 2012 stmt = DR_STMT (dr); 2013 vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt)); 2014 gcc_assert (vectype); 2015 2016 /* The rightmost bits of an aligned address must be zeros. 2017 Construct the mask needed for this test. For example, 2018 GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the 2019 mask must be 15 = 0xf. */ 2020 mask = GET_MODE_SIZE (TYPE_MODE (vectype)) - 1; 2021 2022 /* FORNOW: use the same mask to test all potentially unaligned 2023 references in the loop. The vectorizer currently supports 2024 a single vector size, see the reference to 2025 GET_MODE_NUNITS (TYPE_MODE (vectype)) where the 2026 vectorization factor is computed. 
*/ 2027 gcc_assert (!LOOP_VINFO_PTR_MASK (loop_vinfo) 2028 || LOOP_VINFO_PTR_MASK (loop_vinfo) == mask); 2029 LOOP_VINFO_PTR_MASK (loop_vinfo) = mask; 2030 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push ( 2031 DR_STMT (dr)); 2032 } 2033 } 2034 2035 /* Versioning requires at least one misaligned data reference. */ 2036 if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)) 2037 do_versioning = false; 2038 else if (!do_versioning) 2039 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0); 2040 } 2041 2042 if (do_versioning) 2043 { 2044 vec<gimple *> may_misalign_stmts 2045 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo); 2046 gimple *stmt; 2047 2048 /* It can now be assumed that the data references in the statements 2049 in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version 2050 of the loop being vectorized. */ 2051 FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt) 2052 { 2053 stmt_vec_info stmt_info = vinfo_for_stmt (stmt); 2054 dr = STMT_VINFO_DATA_REF (stmt_info); 2055 SET_DR_MISALIGNMENT (dr, 0); 2056 if (dump_enabled_p ()) 2057 dump_printf_loc (MSG_NOTE, vect_location, 2058 "Alignment of access forced using versioning.\n"); 2059 } 2060 2061 if (dump_enabled_p ()) 2062 dump_printf_loc (MSG_NOTE, vect_location, 2063 "Versioning for alignment will be applied.\n"); 2064 2065 /* Peeling and versioning can't be done together at this time. */ 2066 gcc_assert (! (do_peeling && do_versioning)); 2067 2068 stat = vect_verify_datarefs_alignment (loop_vinfo); 2069 gcc_assert (stat); 2070 return stat; 2071 } 2072 2073 /* This point is reached if neither peeling nor versioning is being done. */ 2074 gcc_assert (! (do_peeling || do_versioning)); 2075 2076 stat = vect_verify_datarefs_alignment (loop_vinfo); 2077 return stat; 2078 } 2079 2080 2081 /* Function vect_find_same_alignment_drs. 2082 2083 Update group and alignment relations according to the chosen 2084 vectorization factor. */ 2085 2086 static void 2087 vect_find_same_alignment_drs (struct data_dependence_relation *ddr, 2088 loop_vec_info loop_vinfo) 2089 { 2090 unsigned int i; 2091 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); 2092 int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo); 2093 struct data_reference *dra = DDR_A (ddr); 2094 struct data_reference *drb = DDR_B (ddr); 2095 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra)); 2096 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb)); 2097 int dra_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dra)))); 2098 int drb_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (drb)))); 2099 lambda_vector dist_v; 2100 unsigned int loop_depth; 2101 2102 if (DDR_ARE_DEPENDENT (ddr) == chrec_known) 2103 return; 2104 2105 if (dra == drb) 2106 return; 2107 2108 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know) 2109 return; 2110 2111 /* Loop-based vectorization and known data dependence. */ 2112 if (DDR_NUM_DIST_VECTS (ddr) == 0) 2113 return; 2114 2115 /* Data-dependence analysis reports a distance vector of zero 2116 for data-references that overlap only in the first iteration 2117 but have different sign step (see PR45764). 2118 So as a sanity check require equal DR_STEP. 
*/ 2119 if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0)) 2120 return; 2121 2122 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr)); 2123 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v) 2124 { 2125 int dist = dist_v[loop_depth]; 2126 2127 if (dump_enabled_p ()) 2128 dump_printf_loc (MSG_NOTE, vect_location, 2129 "dependence distance = %d.\n", dist); 2130 2131 /* Same loop iteration. */ 2132 if (dist == 0 2133 || (dist % vectorization_factor == 0 && dra_size == drb_size)) 2134 { 2135 /* Two references with distance zero have the same alignment. */ 2136 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_a).safe_push (drb); 2137 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_b).safe_push (dra); 2138 if (dump_enabled_p ()) 2139 { 2140 dump_printf_loc (MSG_NOTE, vect_location, 2141 "accesses have the same alignment.\n"); 2142 dump_printf (MSG_NOTE, 2143 "dependence distance modulo vf == 0 between "); 2144 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra)); 2145 dump_printf (MSG_NOTE, " and "); 2146 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb)); 2147 dump_printf (MSG_NOTE, "\n"); 2148 } 2149 } 2150 } 2151 } 2152 2153 2154 /* Function vect_analyze_data_refs_alignment 2155 2156 Analyze the alignment of the data-references in the loop. 2157 Return FALSE if a data reference is found that cannot be vectorized. */ 2158 2159 bool 2160 vect_analyze_data_refs_alignment (loop_vec_info vinfo) 2161 { 2162 if (dump_enabled_p ()) 2163 dump_printf_loc (MSG_NOTE, vect_location, 2164 "=== vect_analyze_data_refs_alignment ===\n"); 2165 2166 /* Mark groups of data references with same alignment using 2167 data dependence information. */ 2168 vec<ddr_p> ddrs = vinfo->ddrs; 2169 struct data_dependence_relation *ddr; 2170 unsigned int i; 2171 2172 FOR_EACH_VEC_ELT (ddrs, i, ddr) 2173 vect_find_same_alignment_drs (ddr, vinfo); 2174 2175 vec<data_reference_p> datarefs = vinfo->datarefs; 2176 struct data_reference *dr; 2177 2178 FOR_EACH_VEC_ELT (datarefs, i, dr) 2179 { 2180 stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr)); 2181 if (STMT_VINFO_VECTORIZABLE (stmt_info) 2182 && !vect_compute_data_ref_alignment (dr)) 2183 { 2184 /* Strided accesses perform only component accesses, misalignment 2185 information is irrelevant for them. */ 2186 if (STMT_VINFO_STRIDED_P (stmt_info) 2187 && !STMT_VINFO_GROUPED_ACCESS (stmt_info)) 2188 continue; 2189 2190 if (dump_enabled_p ()) 2191 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 2192 "not vectorized: can't calculate alignment " 2193 "for data ref.\n"); 2194 2195 return false; 2196 } 2197 } 2198 2199 return true; 2200 } 2201 2202 2203 /* Analyze alignment of DRs of stmts in NODE. */ 2204 2205 static bool 2206 vect_slp_analyze_and_verify_node_alignment (slp_tree node) 2207 { 2208 /* We vectorize from the first scalar stmt in the node unless 2209 the node is permuted in which case we start from the first 2210 element in the group. */ 2211 gimple *first_stmt = SLP_TREE_SCALAR_STMTS (node)[0]; 2212 data_reference_p first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt)); 2213 if (SLP_TREE_LOAD_PERMUTATION (node).exists ()) 2214 first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (first_stmt)); 2215 2216 data_reference_p dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt)); 2217 if (! vect_compute_data_ref_alignment (dr) 2218 /* For creating the data-ref pointer we need alignment of the 2219 first element anyway. */ 2220 || (dr != first_dr 2221 && ! vect_compute_data_ref_alignment (first_dr)) 2222 || ! 
verify_data_ref_alignment (dr)) 2223 { 2224 if (dump_enabled_p ()) 2225 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 2226 "not vectorized: bad data alignment in basic " 2227 "block.\n"); 2228 return false; 2229 } 2230 2231 return true; 2232 } 2233 2234 /* Function vect_slp_analyze_instance_alignment 2235 2236 Analyze the alignment of the data-references in the SLP instance. 2237 Return FALSE if a data reference is found that cannot be vectorized. */ 2238 2239 bool 2240 vect_slp_analyze_and_verify_instance_alignment (slp_instance instance) 2241 { 2242 if (dump_enabled_p ()) 2243 dump_printf_loc (MSG_NOTE, vect_location, 2244 "=== vect_slp_analyze_and_verify_instance_alignment ===\n"); 2245 2246 slp_tree node; 2247 unsigned i; 2248 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, node) 2249 if (! vect_slp_analyze_and_verify_node_alignment (node)) 2250 return false; 2251 2252 node = SLP_INSTANCE_TREE (instance); 2253 if (STMT_VINFO_DATA_REF (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (node)[0])) 2254 && ! vect_slp_analyze_and_verify_node_alignment 2255 (SLP_INSTANCE_TREE (instance))) 2256 return false; 2257 2258 return true; 2259 } 2260 2261 2262 /* Analyze groups of accesses: check that DR belongs to a group of 2263 accesses of legal size, step, etc. Detect gaps, single element 2264 interleaving, and other special cases. Set grouped access info. 2265 Collect groups of strided stores for further use in SLP analysis. 2266 Worker for vect_analyze_group_access. */ 2267 2268 static bool 2269 vect_analyze_group_access_1 (struct data_reference *dr) 2270 { 2271 tree step = DR_STEP (dr); 2272 tree scalar_type = TREE_TYPE (DR_REF (dr)); 2273 HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type)); 2274 gimple *stmt = DR_STMT (dr); 2275 stmt_vec_info stmt_info = vinfo_for_stmt (stmt); 2276 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); 2277 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info); 2278 HOST_WIDE_INT dr_step = -1; 2279 HOST_WIDE_INT groupsize, last_accessed_element = 1; 2280 bool slp_impossible = false; 2281 2282 /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the 2283 size of the interleaving group (including gaps). */ 2284 if (tree_fits_shwi_p (step)) 2285 { 2286 dr_step = tree_to_shwi (step); 2287 /* Check that STEP is a multiple of type size. Otherwise there is 2288 a non-element-sized gap at the end of the group which we 2289 cannot represent in GROUP_GAP or GROUP_SIZE. 2290 ??? As we can handle non-constant step fine here we should 2291 simply remove uses of GROUP_GAP between the last and first 2292 element and instead rely on DR_STEP. GROUP_SIZE then would 2293 simply not include that gap. */ 2294 if ((dr_step % type_size) != 0) 2295 { 2296 if (dump_enabled_p ()) 2297 { 2298 dump_printf_loc (MSG_NOTE, vect_location, 2299 "Step "); 2300 dump_generic_expr (MSG_NOTE, TDF_SLIM, step); 2301 dump_printf (MSG_NOTE, 2302 " is not a multiple of the element size for "); 2303 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr)); 2304 dump_printf (MSG_NOTE, "\n"); 2305 } 2306 return false; 2307 } 2308 groupsize = absu_hwi (dr_step) / type_size; 2309 } 2310 else 2311 groupsize = 0; 2312 2313 /* Not consecutive access is possible only if it is a part of interleaving. */ 2314 if (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt))) 2315 { 2316 /* Check if it this DR is a part of interleaving, and is a single 2317 element of the group that is accessed in the loop. */ 2318 2319 /* Gaps are supported only for loads. 
STEP must be a multiple of the type 2320 size. The size of the group must be a power of 2. */ 2321 if (DR_IS_READ (dr) 2322 && (dr_step % type_size) == 0 2323 && groupsize > 0 2324 && pow2p_hwi (groupsize)) 2325 { 2326 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = stmt; 2327 GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize; 2328 GROUP_GAP (stmt_info) = groupsize - 1; 2329 if (dump_enabled_p ()) 2330 { 2331 dump_printf_loc (MSG_NOTE, vect_location, 2332 "Detected single element interleaving "); 2333 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr)); 2334 dump_printf (MSG_NOTE, " step "); 2335 dump_generic_expr (MSG_NOTE, TDF_SLIM, step); 2336 dump_printf (MSG_NOTE, "\n"); 2337 } 2338 2339 return true; 2340 } 2341 2342 if (dump_enabled_p ()) 2343 { 2344 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 2345 "not consecutive access "); 2346 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0); 2347 } 2348 2349 if (bb_vinfo) 2350 { 2351 /* Mark the statement as unvectorizable. */ 2352 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false; 2353 return true; 2354 } 2355 2356 dump_printf_loc (MSG_NOTE, vect_location, "using strided accesses\n"); 2357 STMT_VINFO_STRIDED_P (stmt_info) = true; 2358 return true; 2359 } 2360 2361 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) == stmt) 2362 { 2363 /* First stmt in the interleaving chain. Check the chain. */ 2364 gimple *next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt)); 2365 struct data_reference *data_ref = dr; 2366 unsigned int count = 1; 2367 tree prev_init = DR_INIT (data_ref); 2368 gimple *prev = stmt; 2369 HOST_WIDE_INT diff, gaps = 0; 2370 2371 while (next) 2372 { 2373 /* Skip same data-refs. In case that two or more stmts share 2374 data-ref (supported only for loads), we vectorize only the first 2375 stmt, and the rest get their vectorized loads from the first 2376 one. */ 2377 if (!tree_int_cst_compare (DR_INIT (data_ref), 2378 DR_INIT (STMT_VINFO_DATA_REF ( 2379 vinfo_for_stmt (next))))) 2380 { 2381 if (DR_IS_WRITE (data_ref)) 2382 { 2383 if (dump_enabled_p ()) 2384 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 2385 "Two store stmts share the same dr.\n"); 2386 return false; 2387 } 2388 2389 if (dump_enabled_p ()) 2390 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 2391 "Two or more load stmts share the same dr.\n"); 2392 2393 /* For load use the same data-ref load. */ 2394 GROUP_SAME_DR_STMT (vinfo_for_stmt (next)) = prev; 2395 2396 prev = next; 2397 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next)); 2398 continue; 2399 } 2400 2401 prev = next; 2402 data_ref = STMT_VINFO_DATA_REF (vinfo_for_stmt (next)); 2403 2404 /* All group members have the same STEP by construction. */ 2405 gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0)); 2406 2407 /* Check that the distance between two accesses is equal to the type 2408 size. Otherwise, we have gaps. */ 2409 diff = (TREE_INT_CST_LOW (DR_INIT (data_ref)) 2410 - TREE_INT_CST_LOW (prev_init)) / type_size; 2411 if (diff != 1) 2412 { 2413 /* FORNOW: SLP of accesses with gaps is not supported. */ 2414 slp_impossible = true; 2415 if (DR_IS_WRITE (data_ref)) 2416 { 2417 if (dump_enabled_p ()) 2418 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 2419 "interleaved store with gaps\n"); 2420 return false; 2421 } 2422 2423 gaps += diff - 1; 2424 } 2425 2426 last_accessed_element += diff; 2427 2428 /* Store the gap from the previous member of the group. If there is no 2429 gap in the access, GROUP_GAP is always 1. 
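E.g. (illustrative), in a load group accessing a[4*i] and a[4*i+2] the
second member gets GROUP_GAP 2, because element a[4*i+1] is skipped
between the two accesses.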
*/ 2430 GROUP_GAP (vinfo_for_stmt (next)) = diff; 2431 2432 prev_init = DR_INIT (data_ref); 2433 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next)); 2434 /* Count the number of data-refs in the chain. */ 2435 count++; 2436 } 2437 2438 if (groupsize == 0) 2439 groupsize = count + gaps; 2440 2441 /* This could be UINT_MAX but as we are generating code in a very 2442 inefficient way we have to cap earlier. See PR78699 for example. */ 2443 if (groupsize > 4096) 2444 { 2445 if (dump_enabled_p ()) 2446 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 2447 "group is too large\n"); 2448 return false; 2449 } 2450 2451 /* Check that the size of the interleaving is equal to count for stores, 2452 i.e., that there are no gaps. */ 2453 if (groupsize != count 2454 && !DR_IS_READ (dr)) 2455 { 2456 if (dump_enabled_p ()) 2457 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 2458 "interleaved store with gaps\n"); 2459 return false; 2460 } 2461 2462 /* If there is a gap after the last load in the group it is the 2463 difference between the groupsize and the last accessed 2464 element. 2465 When there is no gap, this difference should be 0. */ 2466 GROUP_GAP (vinfo_for_stmt (stmt)) = groupsize - last_accessed_element; 2467 2468 GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize; 2469 if (dump_enabled_p ()) 2470 { 2471 dump_printf_loc (MSG_NOTE, vect_location, 2472 "Detected interleaving "); 2473 if (DR_IS_READ (dr)) 2474 dump_printf (MSG_NOTE, "load "); 2475 else 2476 dump_printf (MSG_NOTE, "store "); 2477 dump_printf (MSG_NOTE, "of size %u starting with ", 2478 (unsigned)groupsize); 2479 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0); 2480 if (GROUP_GAP (vinfo_for_stmt (stmt)) != 0) 2481 dump_printf_loc (MSG_NOTE, vect_location, 2482 "There is a gap of %u elements after the group\n", 2483 GROUP_GAP (vinfo_for_stmt (stmt))); 2484 } 2485 2486 /* SLP: create an SLP data structure for every interleaving group of 2487 stores for further analysis in vect_analyse_slp. */ 2488 if (DR_IS_WRITE (dr) && !slp_impossible) 2489 { 2490 if (loop_vinfo) 2491 LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt); 2492 if (bb_vinfo) 2493 BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt); 2494 } 2495 } 2496 2497 return true; 2498 } 2499 2500 /* Analyze groups of accesses: check that DR belongs to a group of 2501 accesses of legal size, step, etc. Detect gaps, single element 2502 interleaving, and other special cases. Set grouped access info. 2503 Collect groups of strided stores for further use in SLP analysis. */ 2504 2505 static bool 2506 vect_analyze_group_access (struct data_reference *dr) 2507 { 2508 if (!vect_analyze_group_access_1 (dr)) 2509 { 2510 /* Dissolve the group if present. */ 2511 gimple *next; 2512 gimple *stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dr))); 2513 while (stmt) 2514 { 2515 stmt_vec_info vinfo = vinfo_for_stmt (stmt); 2516 next = GROUP_NEXT_ELEMENT (vinfo); 2517 GROUP_FIRST_ELEMENT (vinfo) = NULL; 2518 GROUP_NEXT_ELEMENT (vinfo) = NULL; 2519 stmt = next; 2520 } 2521 return false; 2522 } 2523 return true; 2524 } 2525 2526 /* Analyze the access pattern of the data-reference DR. 2527 In case of non-consecutive accesses call vect_analyze_group_access() to 2528 analyze groups of accesses. 
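For example (illustrative): a[i] with a step equal to the element size
is a simple consecutive access; a[2*i] together with a[2*i+1] forms an
interleaving group whose step is twice the element size; an access
whose step is not a compile-time constant is treated as strided.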
*/ 2529 2530 static bool 2531 vect_analyze_data_ref_access (struct data_reference *dr) 2532 { 2533 tree step = DR_STEP (dr); 2534 tree scalar_type = TREE_TYPE (DR_REF (dr)); 2535 gimple *stmt = DR_STMT (dr); 2536 stmt_vec_info stmt_info = vinfo_for_stmt (stmt); 2537 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); 2538 struct loop *loop = NULL; 2539 2540 if (loop_vinfo) 2541 loop = LOOP_VINFO_LOOP (loop_vinfo); 2542 2543 if (loop_vinfo && !step) 2544 { 2545 if (dump_enabled_p ()) 2546 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 2547 "bad data-ref access in loop\n"); 2548 return false; 2549 } 2550 2551 /* Allow loads with zero step in inner-loop vectorization. */ 2552 if (loop_vinfo && integer_zerop (step)) 2553 { 2554 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL; 2555 if (!nested_in_vect_loop_p (loop, stmt)) 2556 return DR_IS_READ (dr); 2557 /* Allow references with zero step for outer loops marked 2558 with pragma omp simd only - it guarantees absence of 2559 loop-carried dependencies between inner loop iterations. */ 2560 if (!loop->force_vectorize || loop->safelen < 2) 2561 { 2562 if (dump_enabled_p ()) 2563 dump_printf_loc (MSG_NOTE, vect_location, 2564 "zero step in inner loop of nest\n"); 2565 return false; 2566 } 2567 } 2568 2569 if (loop && nested_in_vect_loop_p (loop, stmt)) 2570 { 2571 /* Interleaved accesses are not yet supported within outer-loop 2572 vectorization for references in the inner-loop. */ 2573 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL; 2574 2575 /* For the rest of the analysis we use the outer-loop step. */ 2576 step = STMT_VINFO_DR_STEP (stmt_info); 2577 if (integer_zerop (step)) 2578 { 2579 if (dump_enabled_p ()) 2580 dump_printf_loc (MSG_NOTE, vect_location, 2581 "zero step in outer loop.\n"); 2582 return DR_IS_READ (dr); 2583 } 2584 } 2585 2586 /* Consecutive? */ 2587 if (TREE_CODE (step) == INTEGER_CST) 2588 { 2589 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step); 2590 if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type)) 2591 || (dr_step < 0 2592 && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step))) 2593 { 2594 /* Mark that it is not interleaving. */ 2595 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL; 2596 return true; 2597 } 2598 } 2599 2600 if (loop && nested_in_vect_loop_p (loop, stmt)) 2601 { 2602 if (dump_enabled_p ()) 2603 dump_printf_loc (MSG_NOTE, vect_location, 2604 "grouped access in outer loop.\n"); 2605 return false; 2606 } 2607 2608 2609 /* Assume this is a DR handled by non-constant strided load case. */ 2610 if (TREE_CODE (step) != INTEGER_CST) 2611 return (STMT_VINFO_STRIDED_P (stmt_info) 2612 && (!STMT_VINFO_GROUPED_ACCESS (stmt_info) 2613 || vect_analyze_group_access (dr))); 2614 2615 /* Not consecutive access - check if it's a part of interleaving group. */ 2616 return vect_analyze_group_access (dr); 2617 } 2618 2619 2620 2621 /* A helper function used in the comparator function to sort data 2622 references. T1 and T2 are two data references to be compared. 2623 The function returns -1, 0, or 1. */ 2624 2625 static int 2626 compare_tree (tree t1, tree t2) 2627 { 2628 int i, cmp; 2629 enum tree_code code; 2630 char tclass; 2631 2632 if (t1 == t2) 2633 return 0; 2634 if (t1 == NULL) 2635 return -1; 2636 if (t2 == NULL) 2637 return 1; 2638 2639 STRIP_NOPS (t1); 2640 STRIP_NOPS (t2); 2641 2642 if (TREE_CODE (t1) != TREE_CODE (t2)) 2643 return TREE_CODE (t1) < TREE_CODE (t2) ? 
-1 : 1; 2644 2645 code = TREE_CODE (t1); 2646 switch (code) 2647 { 2648 /* For const values, we can just use hash values for comparisons. */ 2649 case INTEGER_CST: 2650 case REAL_CST: 2651 case FIXED_CST: 2652 case STRING_CST: 2653 case COMPLEX_CST: 2654 case VECTOR_CST: 2655 { 2656 hashval_t h1 = iterative_hash_expr (t1, 0); 2657 hashval_t h2 = iterative_hash_expr (t2, 0); 2658 if (h1 != h2) 2659 return h1 < h2 ? -1 : 1; 2660 break; 2661 } 2662 2663 case SSA_NAME: 2664 cmp = compare_tree (SSA_NAME_VAR (t1), SSA_NAME_VAR (t2)); 2665 if (cmp != 0) 2666 return cmp; 2667 2668 if (SSA_NAME_VERSION (t1) != SSA_NAME_VERSION (t2)) 2669 return SSA_NAME_VERSION (t1) < SSA_NAME_VERSION (t2) ? -1 : 1; 2670 break; 2671 2672 default: 2673 tclass = TREE_CODE_CLASS (code); 2674 2675 /* For var-decl, we could compare their UIDs. */ 2676 if (tclass == tcc_declaration) 2677 { 2678 if (DECL_UID (t1) != DECL_UID (t2)) 2679 return DECL_UID (t1) < DECL_UID (t2) ? -1 : 1; 2680 break; 2681 } 2682 2683 /* For expressions with operands, compare their operands recursively. */ 2684 for (i = TREE_OPERAND_LENGTH (t1) - 1; i >= 0; --i) 2685 { 2686 cmp = compare_tree (TREE_OPERAND (t1, i), TREE_OPERAND (t2, i)); 2687 if (cmp != 0) 2688 return cmp; 2689 } 2690 } 2691 2692 return 0; 2693 } 2694 2695 2696 /* Compare two data-references DRA and DRB to group them into chunks 2697 suitable for grouping. */ 2698 2699 static int 2700 dr_group_sort_cmp (const void *dra_, const void *drb_) 2701 { 2702 data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_); 2703 data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_); 2704 int cmp; 2705 2706 /* Stabilize sort. */ 2707 if (dra == drb) 2708 return 0; 2709 2710 /* DRs in different loops never belong to the same group. */ 2711 loop_p loopa = gimple_bb (DR_STMT (dra))->loop_father; 2712 loop_p loopb = gimple_bb (DR_STMT (drb))->loop_father; 2713 if (loopa != loopb) 2714 return loopa->num < loopb->num ? -1 : 1; 2715 2716 /* Ordering of DRs according to base. */ 2717 if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0)) 2718 { 2719 cmp = compare_tree (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb)); 2720 if (cmp != 0) 2721 return cmp; 2722 } 2723 2724 /* And according to DR_OFFSET. */ 2725 if (!dr_equal_offsets_p (dra, drb)) 2726 { 2727 cmp = compare_tree (DR_OFFSET (dra), DR_OFFSET (drb)); 2728 if (cmp != 0) 2729 return cmp; 2730 } 2731 2732 /* Put reads before writes. */ 2733 if (DR_IS_READ (dra) != DR_IS_READ (drb)) 2734 return DR_IS_READ (dra) ? -1 : 1; 2735 2736 /* Then sort after access size. */ 2737 if (!operand_equal_p (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))), 2738 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))), 0)) 2739 { 2740 cmp = compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))), 2741 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)))); 2742 if (cmp != 0) 2743 return cmp; 2744 } 2745 2746 /* And after step. */ 2747 if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0)) 2748 { 2749 cmp = compare_tree (DR_STEP (dra), DR_STEP (drb)); 2750 if (cmp != 0) 2751 return cmp; 2752 } 2753 2754 /* Then sort after DR_INIT. In case of identical DRs sort after stmt UID. */ 2755 cmp = tree_int_cst_compare (DR_INIT (dra), DR_INIT (drb)); 2756 if (cmp == 0) 2757 return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1; 2758 return cmp; 2759 } 2760 2761 /* Function vect_analyze_data_ref_accesses. 2762 2763 Analyze the access pattern of all the data references in the loop. 
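For instance (illustrative), the accesses a[2*i] and a[2*i+1] become
adjacent after the sort below and are linked into a single interleaving
chain of size two, which vect_analyze_data_ref_access then validates.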
2764 2765 FORNOW: the only access pattern that is considered vectorizable is a 2766 simple step 1 (consecutive) access. 2767 2768 FORNOW: handle only arrays and pointer accesses. */ 2769 2770 bool 2771 vect_analyze_data_ref_accesses (vec_info *vinfo) 2772 { 2773 unsigned int i; 2774 vec<data_reference_p> datarefs = vinfo->datarefs; 2775 struct data_reference *dr; 2776 2777 if (dump_enabled_p ()) 2778 dump_printf_loc (MSG_NOTE, vect_location, 2779 "=== vect_analyze_data_ref_accesses ===\n"); 2780 2781 if (datarefs.is_empty ()) 2782 return true; 2783 2784 /* Sort the array of datarefs to make building the interleaving chains 2785 linear. Don't modify the original vector's order, it is needed for 2786 determining what dependencies are reversed. */ 2787 vec<data_reference_p> datarefs_copy = datarefs.copy (); 2788 datarefs_copy.qsort (dr_group_sort_cmp); 2789 2790 /* Build the interleaving chains. */ 2791 for (i = 0; i < datarefs_copy.length () - 1;) 2792 { 2793 data_reference_p dra = datarefs_copy[i]; 2794 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra)); 2795 stmt_vec_info lastinfo = NULL; 2796 if (! STMT_VINFO_VECTORIZABLE (stmtinfo_a)) 2797 { 2798 ++i; 2799 continue; 2800 } 2801 for (i = i + 1; i < datarefs_copy.length (); ++i) 2802 { 2803 data_reference_p drb = datarefs_copy[i]; 2804 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb)); 2805 if (! STMT_VINFO_VECTORIZABLE (stmtinfo_b)) 2806 break; 2807 2808 /* ??? Imperfect sorting (non-compatible types, non-modulo 2809 accesses, same accesses) can lead to a group to be artificially 2810 split here as we don't just skip over those. If it really 2811 matters we can push those to a worklist and re-iterate 2812 over them. The we can just skip ahead to the next DR here. */ 2813 2814 /* DRs in a different loop should not be put into the same 2815 interleaving group. */ 2816 if (gimple_bb (DR_STMT (dra))->loop_father 2817 != gimple_bb (DR_STMT (drb))->loop_father) 2818 break; 2819 2820 /* Check that the data-refs have same first location (except init) 2821 and they are both either store or load (not load and store, 2822 not masked loads or stores). */ 2823 if (DR_IS_READ (dra) != DR_IS_READ (drb) 2824 || !operand_equal_p (DR_BASE_ADDRESS (dra), 2825 DR_BASE_ADDRESS (drb), 0) 2826 || !dr_equal_offsets_p (dra, drb) 2827 || !gimple_assign_single_p (DR_STMT (dra)) 2828 || !gimple_assign_single_p (DR_STMT (drb))) 2829 break; 2830 2831 /* Check that the data-refs have the same constant size. */ 2832 tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))); 2833 tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))); 2834 if (!tree_fits_uhwi_p (sza) 2835 || !tree_fits_uhwi_p (szb) 2836 || !tree_int_cst_equal (sza, szb)) 2837 break; 2838 2839 /* Check that the data-refs have the same step. */ 2840 if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0)) 2841 break; 2842 2843 /* Do not place the same access in the interleaving chain twice. */ 2844 if (tree_int_cst_compare (DR_INIT (dra), DR_INIT (drb)) == 0) 2845 break; 2846 2847 /* Check the types are compatible. 2848 ??? We don't distinguish this during sorting. */ 2849 if (!types_compatible_p (TREE_TYPE (DR_REF (dra)), 2850 TREE_TYPE (DR_REF (drb)))) 2851 break; 2852 2853 /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb). 
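Hence init_b - init_a below is never negative. E.g. (illustrative,
4-byte elements), DR_INITs 0 and 8 pass the divisibility check that
follows, whereas DR_INITs 0 and 6 do not and the chain is broken at
that point.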
*/ 2854 HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra)); 2855 HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb)); 2856 gcc_assert (init_a <= init_b); 2857 2858 /* If init_b == init_a + the size of the type * k, we have an 2859 interleaving, and DRA is accessed before DRB. */ 2860 HOST_WIDE_INT type_size_a = tree_to_uhwi (sza); 2861 if (type_size_a == 0 2862 || (init_b - init_a) % type_size_a != 0) 2863 break; 2864 2865 /* If we have a store, the accesses are adjacent. This splits 2866 groups into chunks we support (we don't support vectorization 2867 of stores with gaps). */ 2868 if (!DR_IS_READ (dra) 2869 && (init_b - (HOST_WIDE_INT) TREE_INT_CST_LOW 2870 (DR_INIT (datarefs_copy[i-1])) 2871 != type_size_a)) 2872 break; 2873 2874 /* If the step (if not zero or non-constant) is greater than the 2875 difference between data-refs' inits this splits groups into 2876 suitable sizes. */ 2877 if (tree_fits_shwi_p (DR_STEP (dra))) 2878 { 2879 HOST_WIDE_INT step = tree_to_shwi (DR_STEP (dra)); 2880 if (step != 0 && step <= (init_b - init_a)) 2881 break; 2882 } 2883 2884 if (dump_enabled_p ()) 2885 { 2886 dump_printf_loc (MSG_NOTE, vect_location, 2887 "Detected interleaving "); 2888 if (DR_IS_READ (dra)) 2889 dump_printf (MSG_NOTE, "load "); 2890 else 2891 dump_printf (MSG_NOTE, "store "); 2892 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra)); 2893 dump_printf (MSG_NOTE, " and "); 2894 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb)); 2895 dump_printf (MSG_NOTE, "\n"); 2896 } 2897 2898 /* Link the found element into the group list. */ 2899 if (!GROUP_FIRST_ELEMENT (stmtinfo_a)) 2900 { 2901 GROUP_FIRST_ELEMENT (stmtinfo_a) = DR_STMT (dra); 2902 lastinfo = stmtinfo_a; 2903 } 2904 GROUP_FIRST_ELEMENT (stmtinfo_b) = DR_STMT (dra); 2905 GROUP_NEXT_ELEMENT (lastinfo) = DR_STMT (drb); 2906 lastinfo = stmtinfo_b; 2907 } 2908 } 2909 2910 FOR_EACH_VEC_ELT (datarefs_copy, i, dr) 2911 if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) 2912 && !vect_analyze_data_ref_access (dr)) 2913 { 2914 if (dump_enabled_p ()) 2915 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 2916 "not vectorized: complicated access pattern.\n"); 2917 2918 if (is_a <bb_vec_info> (vinfo)) 2919 { 2920 /* Mark the statement as not vectorizable. */ 2921 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false; 2922 continue; 2923 } 2924 else 2925 { 2926 datarefs_copy.release (); 2927 return false; 2928 } 2929 } 2930 2931 datarefs_copy.release (); 2932 return true; 2933 } 2934 2935 2936 /* Operator == between two dr_with_seg_len objects. 2937 2938 This equality operator is used to make sure two data refs 2939 are the same one so that we will consider to combine the 2940 aliasing checks of those two pairs of data dependent data 2941 refs. */ 2942 2943 static bool 2944 operator == (const dr_with_seg_len& d1, 2945 const dr_with_seg_len& d2) 2946 { 2947 return operand_equal_p (DR_BASE_ADDRESS (d1.dr), 2948 DR_BASE_ADDRESS (d2.dr), 0) 2949 && compare_tree (DR_OFFSET (d1.dr), DR_OFFSET (d2.dr)) == 0 2950 && compare_tree (DR_INIT (d1.dr), DR_INIT (d2.dr)) == 0 2951 && compare_tree (d1.seg_len, d2.seg_len) == 0; 2952 } 2953 2954 /* Function comp_dr_with_seg_len_pair. 2955 2956 Comparison function for sorting objects of dr_with_seg_len_pair_t 2957 so that we can combine aliasing checks in one scan. 
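For instance (illustrative), the pairs (a[i], b[i]) and (a[i], b[i+1])
end up next to each other in the sorted order, so the merging loop in
vect_prune_runtime_alias_test_list only needs to look at neighbouring
entries when combining checks.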
*/
2958
2959 static int
2960 comp_dr_with_seg_len_pair (const void *pa_, const void *pb_)
2961 {
2962 const dr_with_seg_len_pair_t* pa = (const dr_with_seg_len_pair_t *) pa_;
2963 const dr_with_seg_len_pair_t* pb = (const dr_with_seg_len_pair_t *) pb_;
2964 const dr_with_seg_len &a1 = pa->first, &a2 = pa->second;
2965 const dr_with_seg_len &b1 = pb->first, &b2 = pb->second;
2966
2967 /* For DR pairs (a, b) and (c, d), we only consider merging the alias checks
2968 if a and c have the same base address and step, and b and d have the same
2969 base address and step. Therefore, if either a&c or b&d do not have the
2970 same address and step, we do not care about the order of those two pairs
after sorting. */
2971 int comp_res;
2972
2973 if ((comp_res = compare_tree (DR_BASE_ADDRESS (a1.dr),
2974 DR_BASE_ADDRESS (b1.dr))) != 0)
2975 return comp_res;
2976 if ((comp_res = compare_tree (DR_BASE_ADDRESS (a2.dr),
2977 DR_BASE_ADDRESS (b2.dr))) != 0)
2978 return comp_res;
2979 if ((comp_res = compare_tree (DR_STEP (a1.dr), DR_STEP (b1.dr))) != 0)
2980 return comp_res;
2981 if ((comp_res = compare_tree (DR_STEP (a2.dr), DR_STEP (b2.dr))) != 0)
2982 return comp_res;
2983 if ((comp_res = compare_tree (DR_OFFSET (a1.dr), DR_OFFSET (b1.dr))) != 0)
2984 return comp_res;
2985 if ((comp_res = compare_tree (DR_INIT (a1.dr), DR_INIT (b1.dr))) != 0)
2986 return comp_res;
2987 if ((comp_res = compare_tree (DR_OFFSET (a2.dr), DR_OFFSET (b2.dr))) != 0)
2988 return comp_res;
2989 if ((comp_res = compare_tree (DR_INIT (a2.dr), DR_INIT (b2.dr))) != 0)
2990 return comp_res;
2991
2992 return 0;
2993 }
2994
2995 /* Function vect_vfa_segment_size.
2996
2997 Create an expression that computes the size of the segment
2998 that will be accessed by a data reference. The function takes into
2999 account that realignment loads may access one more vector.
3000
3001 Input:
3002 DR: The data reference.
3003 LENGTH_FACTOR: segment length to consider.
3004
3005 Return an expression whose value is the size of the segment which will be
3006 accessed by DR. */
3007
3008 static tree
3009 vect_vfa_segment_size (struct data_reference *dr, tree length_factor)
3010 {
3011 tree segment_length;
3012
3013 if (integer_zerop (DR_STEP (dr)))
3014 segment_length = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
3015 else
3016 segment_length = size_binop (MULT_EXPR,
3017 fold_convert (sizetype, DR_STEP (dr)),
3018 fold_convert (sizetype, length_factor));
3019
3020 if (vect_supportable_dr_alignment (dr, false)
3021 == dr_explicit_realign_optimized)
3022 {
3023 tree vector_size = TYPE_SIZE_UNIT
3024 (STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr))));
3025
3026 segment_length = size_binop (PLUS_EXPR, segment_length, vector_size);
3027 }
3028 return segment_length;
3029 }
3030
3031 /* Function vect_no_alias_p.
3032
3033 Given data references A and B with equal base and offset, whose alias
3034 relation can therefore be decided at compilation time, return TRUE if they
3035 do not alias each other; return FALSE otherwise. SEGMENT_LENGTH_A
3036 and SEGMENT_LENGTH_B are the memory lengths accessed by A and B
3037 respectively.
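For example (illustrative, positive steps): with DR_INITs 0 and 16 and
segment lengths of 16 bytes each, the accessed ranges are [0, 16) and
[16, 32), which do not overlap, so no runtime check is needed; with
DR_INITs 0 and 8 the ranges overlap and FALSE is returned.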
*/ 3038 3039 static bool 3040 vect_no_alias_p (struct data_reference *a, struct data_reference *b, 3041 tree segment_length_a, tree segment_length_b) 3042 { 3043 gcc_assert (TREE_CODE (DR_INIT (a)) == INTEGER_CST 3044 && TREE_CODE (DR_INIT (b)) == INTEGER_CST); 3045 if (tree_int_cst_equal (DR_INIT (a), DR_INIT (b))) 3046 return false; 3047 3048 tree seg_a_min = DR_INIT (a); 3049 tree seg_a_max = fold_build2 (PLUS_EXPR, TREE_TYPE (seg_a_min), 3050 seg_a_min, segment_length_a); 3051 /* For negative step, we need to adjust address range by TYPE_SIZE_UNIT 3052 bytes, e.g., int a[3] -> a[1] range is [a+4, a+16) instead of 3053 [a, a+12) */ 3054 if (tree_int_cst_compare (DR_STEP (a), size_zero_node) < 0) 3055 { 3056 tree unit_size = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (a))); 3057 seg_a_min = fold_build2 (PLUS_EXPR, TREE_TYPE (seg_a_max), 3058 seg_a_max, unit_size); 3059 seg_a_max = fold_build2 (PLUS_EXPR, TREE_TYPE (DR_INIT (a)), 3060 DR_INIT (a), unit_size); 3061 } 3062 tree seg_b_min = DR_INIT (b); 3063 tree seg_b_max = fold_build2 (PLUS_EXPR, TREE_TYPE (seg_b_min), 3064 seg_b_min, segment_length_b); 3065 if (tree_int_cst_compare (DR_STEP (b), size_zero_node) < 0) 3066 { 3067 tree unit_size = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (b))); 3068 seg_b_min = fold_build2 (PLUS_EXPR, TREE_TYPE (seg_b_max), 3069 seg_b_max, unit_size); 3070 seg_b_max = fold_build2 (PLUS_EXPR, TREE_TYPE (DR_INIT (b)), 3071 DR_INIT (b), unit_size); 3072 } 3073 3074 if (tree_int_cst_le (seg_a_max, seg_b_min) 3075 || tree_int_cst_le (seg_b_max, seg_a_min)) 3076 return true; 3077 3078 return false; 3079 } 3080 3081 /* Function vect_prune_runtime_alias_test_list. 3082 3083 Prune a list of ddrs to be tested at run-time by versioning for alias. 3084 Merge several alias checks into one if possible. 3085 Return FALSE if resulting list of ddrs is longer then allowed by 3086 PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE. */ 3087 3088 bool 3089 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo) 3090 { 3091 vec<ddr_p> may_alias_ddrs = 3092 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo); 3093 vec<dr_with_seg_len_pair_t>& comp_alias_ddrs = 3094 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo); 3095 int vect_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo); 3096 tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo); 3097 3098 ddr_p ddr; 3099 unsigned int i; 3100 tree length_factor; 3101 3102 if (dump_enabled_p ()) 3103 dump_printf_loc (MSG_NOTE, vect_location, 3104 "=== vect_prune_runtime_alias_test_list ===\n"); 3105 3106 if (may_alias_ddrs.is_empty ()) 3107 return true; 3108 3109 /* Basically, for each pair of dependent data refs store_ptr_0 3110 and load_ptr_0, we create an expression: 3111 3112 ((store_ptr_0 + store_segment_length_0) <= load_ptr_0) 3113 || (load_ptr_0 + load_segment_length_0) <= store_ptr_0)) 3114 3115 for aliasing checks. However, in some cases we can decrease 3116 the number of checks by combining two checks into one. For 3117 example, suppose we have another pair of data refs store_ptr_0 3118 and load_ptr_1, and if the following condition is satisfied: 3119 3120 load_ptr_0 < load_ptr_1 && 3121 load_ptr_1 - load_ptr_0 - load_segment_length_0 < store_segment_length_0 3122 3123 (this condition means, in each iteration of vectorized loop, 3124 the accessed memory of store_ptr_0 cannot be between the memory 3125 of load_ptr_0 and load_ptr_1.) 
3126 3127 we then can use only the following expression to finish the 3128 alising checks between store_ptr_0 & load_ptr_0 and 3129 store_ptr_0 & load_ptr_1: 3130 3131 ((store_ptr_0 + store_segment_length_0) <= load_ptr_0) 3132 || (load_ptr_1 + load_segment_length_1 <= store_ptr_0)) 3133 3134 Note that we only consider that load_ptr_0 and load_ptr_1 have the 3135 same basic address. */ 3136 3137 comp_alias_ddrs.create (may_alias_ddrs.length ()); 3138 3139 /* First, we collect all data ref pairs for aliasing checks. */ 3140 FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr) 3141 { 3142 int comp_res; 3143 struct data_reference *dr_a, *dr_b; 3144 gimple *dr_group_first_a, *dr_group_first_b; 3145 tree segment_length_a, segment_length_b; 3146 gimple *stmt_a, *stmt_b; 3147 3148 dr_a = DDR_A (ddr); 3149 stmt_a = DR_STMT (DDR_A (ddr)); 3150 dr_group_first_a = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_a)); 3151 if (dr_group_first_a) 3152 { 3153 stmt_a = dr_group_first_a; 3154 dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_a)); 3155 } 3156 3157 dr_b = DDR_B (ddr); 3158 stmt_b = DR_STMT (DDR_B (ddr)); 3159 dr_group_first_b = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_b)); 3160 if (dr_group_first_b) 3161 { 3162 stmt_b = dr_group_first_b; 3163 dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_b)); 3164 } 3165 3166 if (!operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0)) 3167 length_factor = scalar_loop_iters; 3168 else 3169 length_factor = size_int (vect_factor); 3170 segment_length_a = vect_vfa_segment_size (dr_a, length_factor); 3171 segment_length_b = vect_vfa_segment_size (dr_b, length_factor); 3172 3173 comp_res = compare_tree (DR_BASE_ADDRESS (dr_a), DR_BASE_ADDRESS (dr_b)); 3174 if (comp_res == 0) 3175 comp_res = compare_tree (DR_OFFSET (dr_a), DR_OFFSET (dr_b)); 3176 3177 /* Alias is known at compilation time. */ 3178 if (comp_res == 0 3179 && TREE_CODE (DR_STEP (dr_a)) == INTEGER_CST 3180 && TREE_CODE (DR_STEP (dr_b)) == INTEGER_CST 3181 && TREE_CODE (segment_length_a) == INTEGER_CST 3182 && TREE_CODE (segment_length_b) == INTEGER_CST) 3183 { 3184 if (vect_no_alias_p (dr_a, dr_b, segment_length_a, segment_length_b)) 3185 continue; 3186 3187 if (dump_enabled_p ()) 3188 dump_printf_loc (MSG_NOTE, vect_location, 3189 "not vectorized: compilation time alias.\n"); 3190 3191 return false; 3192 } 3193 3194 dr_with_seg_len_pair_t dr_with_seg_len_pair 3195 (dr_with_seg_len (dr_a, segment_length_a), 3196 dr_with_seg_len (dr_b, segment_length_b)); 3197 3198 /* Canonicalize pairs by sorting the two DR members. */ 3199 if (comp_res > 0) 3200 std::swap (dr_with_seg_len_pair.first, dr_with_seg_len_pair.second); 3201 3202 comp_alias_ddrs.safe_push (dr_with_seg_len_pair); 3203 } 3204 3205 /* Second, we sort the collected data ref pairs so that we can scan 3206 them once to combine all possible aliasing checks. */ 3207 comp_alias_ddrs.qsort (comp_dr_with_seg_len_pair); 3208 3209 /* Third, we scan the sorted dr pairs and check if we can combine 3210 alias checks of two neighboring dr pairs. */ 3211 for (size_t i = 1; i < comp_alias_ddrs.length (); ++i) 3212 { 3213 /* Deal with two ddrs (dr_a1, dr_b1) and (dr_a2, dr_b2). */ 3214 dr_with_seg_len *dr_a1 = &comp_alias_ddrs[i-1].first, 3215 *dr_b1 = &comp_alias_ddrs[i-1].second, 3216 *dr_a2 = &comp_alias_ddrs[i].first, 3217 *dr_b2 = &comp_alias_ddrs[i].second; 3218 3219 /* Remove duplicate data ref pairs. 
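Duplicates arise, for instance (illustrative), when several dependences
between members of the same two interleaving groups are all rewritten
above to refer to the two group leaders, yielding identical
(dr, seg_len) pairs.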
*/ 3220 if (*dr_a1 == *dr_a2 && *dr_b1 == *dr_b2) 3221 { 3222 if (dump_enabled_p ()) 3223 { 3224 dump_printf_loc (MSG_NOTE, vect_location, 3225 "found equal ranges "); 3226 dump_generic_expr (MSG_NOTE, TDF_SLIM, 3227 DR_REF (dr_a1->dr)); 3228 dump_printf (MSG_NOTE, ", "); 3229 dump_generic_expr (MSG_NOTE, TDF_SLIM, 3230 DR_REF (dr_b1->dr)); 3231 dump_printf (MSG_NOTE, " and "); 3232 dump_generic_expr (MSG_NOTE, TDF_SLIM, 3233 DR_REF (dr_a2->dr)); 3234 dump_printf (MSG_NOTE, ", "); 3235 dump_generic_expr (MSG_NOTE, TDF_SLIM, 3236 DR_REF (dr_b2->dr)); 3237 dump_printf (MSG_NOTE, "\n"); 3238 } 3239 3240 comp_alias_ddrs.ordered_remove (i--); 3241 continue; 3242 } 3243 3244 if (*dr_a1 == *dr_a2 || *dr_b1 == *dr_b2) 3245 { 3246 /* We consider the case that DR_B1 and DR_B2 are same memrefs, 3247 and DR_A1 and DR_A2 are two consecutive memrefs. */ 3248 if (*dr_a1 == *dr_a2) 3249 { 3250 std::swap (dr_a1, dr_b1); 3251 std::swap (dr_a2, dr_b2); 3252 } 3253 3254 if (!operand_equal_p (DR_BASE_ADDRESS (dr_a1->dr), 3255 DR_BASE_ADDRESS (dr_a2->dr), 0) 3256 || !operand_equal_p (DR_OFFSET (dr_a1->dr), 3257 DR_OFFSET (dr_a2->dr), 0) 3258 || !tree_fits_shwi_p (DR_INIT (dr_a1->dr)) 3259 || !tree_fits_shwi_p (DR_INIT (dr_a2->dr))) 3260 continue; 3261 3262 /* Make sure dr_a1 starts left of dr_a2. */ 3263 if (tree_int_cst_lt (DR_INIT (dr_a2->dr), DR_INIT (dr_a1->dr))) 3264 std::swap (*dr_a1, *dr_a2); 3265 3266 bool do_remove = false; 3267 unsigned HOST_WIDE_INT diff 3268 = (tree_to_shwi (DR_INIT (dr_a2->dr)) 3269 - tree_to_shwi (DR_INIT (dr_a1->dr))); 3270 3271 /* If the left segment does not extend beyond the start of the 3272 right segment the new segment length is that of the right 3273 plus the segment distance. */ 3274 if (tree_fits_uhwi_p (dr_a1->seg_len) 3275 && compare_tree_int (dr_a1->seg_len, diff) <= 0) 3276 { 3277 dr_a1->seg_len = size_binop (PLUS_EXPR, dr_a2->seg_len, 3278 size_int (diff)); 3279 do_remove = true; 3280 } 3281 /* Generally the new segment length is the maximum of the 3282 left segment size and the right segment size plus the distance. 3283 ??? We can also build tree MAX_EXPR here but it's not clear this 3284 is profitable. */ 3285 else if (tree_fits_uhwi_p (dr_a1->seg_len) 3286 && tree_fits_uhwi_p (dr_a2->seg_len)) 3287 { 3288 unsigned HOST_WIDE_INT seg_len_a1 = tree_to_uhwi (dr_a1->seg_len); 3289 unsigned HOST_WIDE_INT seg_len_a2 = tree_to_uhwi (dr_a2->seg_len); 3290 dr_a1->seg_len = size_int (MAX (seg_len_a1, diff + seg_len_a2)); 3291 do_remove = true; 3292 } 3293 /* Now we check if the following condition is satisfied: 3294 3295 DIFF - SEGMENT_LENGTH_A < SEGMENT_LENGTH_B 3296 3297 where DIFF = DR_A2_INIT - DR_A1_INIT. However, 3298 SEGMENT_LENGTH_A or SEGMENT_LENGTH_B may not be constant so we 3299 have to make a best estimation. We can get the minimum value 3300 of SEGMENT_LENGTH_B as a constant, represented by MIN_SEG_LEN_B, 3301 then either of the following two conditions can guarantee the 3302 one above: 3303 3304 1: DIFF <= MIN_SEG_LEN_B 3305 2: DIFF - SEGMENT_LENGTH_A < MIN_SEG_LEN_B */ 3306 else 3307 { 3308 unsigned HOST_WIDE_INT min_seg_len_b 3309 = (tree_fits_uhwi_p (dr_b1->seg_len) 3310 ? 
tree_to_uhwi (dr_b1->seg_len) 3311 : vect_factor); 3312 3313 if (diff <= min_seg_len_b 3314 || (tree_fits_uhwi_p (dr_a1->seg_len) 3315 && diff - tree_to_uhwi (dr_a1->seg_len) < min_seg_len_b)) 3316 { 3317 dr_a1->seg_len = size_binop (PLUS_EXPR, 3318 dr_a2->seg_len, size_int (diff)); 3319 do_remove = true; 3320 } 3321 } 3322 3323 if (do_remove) 3324 { 3325 if (dump_enabled_p ()) 3326 { 3327 dump_printf_loc (MSG_NOTE, vect_location, 3328 "merging ranges for "); 3329 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a1->dr)); 3330 dump_printf (MSG_NOTE, ", "); 3331 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b1->dr)); 3332 dump_printf (MSG_NOTE, " and "); 3333 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a2->dr)); 3334 dump_printf (MSG_NOTE, ", "); 3335 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b2->dr)); 3336 dump_printf (MSG_NOTE, "\n"); 3337 } 3338 comp_alias_ddrs.ordered_remove (i--); 3339 } 3340 } 3341 } 3342 3343 dump_printf_loc (MSG_NOTE, vect_location, 3344 "improved number of alias checks from %d to %d\n", 3345 may_alias_ddrs.length (), comp_alias_ddrs.length ()); 3346 if ((int) comp_alias_ddrs.length () > 3347 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS)) 3348 { 3349 if (dump_enabled_p ()) 3350 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 3351 "number of versioning for alias " 3352 "run-time tests exceeds %d " 3353 "(--param vect-max-version-for-alias-checks)\n", 3354 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS)); 3355 return false; 3356 } 3357 3358 /* All alias checks have been resolved at compilation time. */ 3359 if (!comp_alias_ddrs.length ()) 3360 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).truncate (0); 3361 3362 return true; 3363 } 3364 3365 /* Return true if a non-affine read or write in STMT is suitable for a 3366 gather load or scatter store. Describe the operation in *INFO if so. */ 3367 3368 bool 3369 vect_check_gather_scatter (gimple *stmt, loop_vec_info loop_vinfo, 3370 gather_scatter_info *info) 3371 { 3372 HOST_WIDE_INT scale = 1, pbitpos, pbitsize; 3373 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); 3374 stmt_vec_info stmt_info = vinfo_for_stmt (stmt); 3375 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info); 3376 tree offtype = NULL_TREE; 3377 tree decl, base, off; 3378 machine_mode pmode; 3379 int punsignedp, reversep, pvolatilep = 0; 3380 3381 base = DR_REF (dr); 3382 /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF, 3383 see if we can use the def stmt of the address. */ 3384 if (is_gimple_call (stmt) 3385 && gimple_call_internal_p (stmt) 3386 && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD 3387 || gimple_call_internal_fn (stmt) == IFN_MASK_STORE) 3388 && TREE_CODE (base) == MEM_REF 3389 && TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME 3390 && integer_zerop (TREE_OPERAND (base, 1)) 3391 && !expr_invariant_in_loop_p (loop, TREE_OPERAND (base, 0))) 3392 { 3393 gimple *def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (base, 0)); 3394 if (is_gimple_assign (def_stmt) 3395 && gimple_assign_rhs_code (def_stmt) == ADDR_EXPR) 3396 base = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0); 3397 } 3398 3399 /* The gather and scatter builtins need address of the form 3400 loop_invariant + vector * {1, 2, 4, 8} 3401 or 3402 loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }. 3403 Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture 3404 of loop invariants/SSA_NAMEs defined in the loop, with casts, 3405 multiplications and additions in it. 
To get a vector, we need
3406 a single SSA_NAME that will be defined in the loop and will
3407 contain everything that is not loop invariant and that can be
3408 vectorized. The following code attempts to find such a preexisting
3409 SSA_NAME OFF and put the loop invariants into a tree BASE
3410 that can be gimplified before the loop. */
3411 base = get_inner_reference (base, &pbitsize, &pbitpos, &off, &pmode,
3412 &punsignedp, &reversep, &pvolatilep);
3413 gcc_assert (base && (pbitpos % BITS_PER_UNIT) == 0 && !reversep);
3414
3415 if (TREE_CODE (base) == MEM_REF)
3416 {
3417 if (!integer_zerop (TREE_OPERAND (base, 1)))
3418 {
3419 if (off == NULL_TREE)
3420 {
3421 offset_int moff = mem_ref_offset (base);
3422 off = wide_int_to_tree (sizetype, moff);
3423 }
3424 else
3425 off = size_binop (PLUS_EXPR, off,
3426 fold_convert (sizetype, TREE_OPERAND (base, 1)));
3427 }
3428 base = TREE_OPERAND (base, 0);
3429 }
3430 else
3431 base = build_fold_addr_expr (base);
3432
3433 if (off == NULL_TREE)
3434 off = size_zero_node;
3435
3436 /* If BASE is not loop invariant, then either OFF is 0, in which case we
3437 start with just the constant offset in the loop invariant BASE and
3438 continue with the original BASE as OFF, or we give up.
3439 We could handle the latter case by gimplifying the addition of BASE + OFF
3440 into some SSA_NAME and using that as OFF, but for now punt. */
3441 if (!expr_invariant_in_loop_p (loop, base))
3442 {
3443 if (!integer_zerop (off))
3444 return false;
3445 off = base;
3446 base = size_int (pbitpos / BITS_PER_UNIT);
3447 }
3448 /* Otherwise put base + constant offset into the loop invariant BASE
3449 and continue with OFF. */
3450 else
3451 {
3452 base = fold_convert (sizetype, base);
3453 base = size_binop (PLUS_EXPR, base, size_int (pbitpos / BITS_PER_UNIT));
3454 }
3455
3456 /* OFF at this point may be either an SSA_NAME or some tree expression
3457 from get_inner_reference. Try to peel off loop invariants from it
3458 into BASE as long as possible.
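As an illustrative example, for an offset of the form (sizetype) n + i * 4,
where n is loop invariant and i is an SSA_NAME defined in the loop, the
PLUS_EXPR case below folds n into BASE, the MULT_EXPR case then sets
SCALE to 4, and the analysis is left with OFF = i.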
*/ 3459 STRIP_NOPS (off); 3460 while (offtype == NULL_TREE) 3461 { 3462 enum tree_code code; 3463 tree op0, op1, add = NULL_TREE; 3464 3465 if (TREE_CODE (off) == SSA_NAME) 3466 { 3467 gimple *def_stmt = SSA_NAME_DEF_STMT (off); 3468 3469 if (expr_invariant_in_loop_p (loop, off)) 3470 return false; 3471 3472 if (gimple_code (def_stmt) != GIMPLE_ASSIGN) 3473 break; 3474 3475 op0 = gimple_assign_rhs1 (def_stmt); 3476 code = gimple_assign_rhs_code (def_stmt); 3477 op1 = gimple_assign_rhs2 (def_stmt); 3478 } 3479 else 3480 { 3481 if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS) 3482 return false; 3483 code = TREE_CODE (off); 3484 extract_ops_from_tree (off, &code, &op0, &op1); 3485 } 3486 switch (code) 3487 { 3488 case POINTER_PLUS_EXPR: 3489 case PLUS_EXPR: 3490 if (expr_invariant_in_loop_p (loop, op0)) 3491 { 3492 add = op0; 3493 off = op1; 3494 do_add: 3495 add = fold_convert (sizetype, add); 3496 if (scale != 1) 3497 add = size_binop (MULT_EXPR, add, size_int (scale)); 3498 base = size_binop (PLUS_EXPR, base, add); 3499 continue; 3500 } 3501 if (expr_invariant_in_loop_p (loop, op1)) 3502 { 3503 add = op1; 3504 off = op0; 3505 goto do_add; 3506 } 3507 break; 3508 case MINUS_EXPR: 3509 if (expr_invariant_in_loop_p (loop, op1)) 3510 { 3511 add = fold_convert (sizetype, op1); 3512 add = size_binop (MINUS_EXPR, size_zero_node, add); 3513 off = op0; 3514 goto do_add; 3515 } 3516 break; 3517 case MULT_EXPR: 3518 if (scale == 1 && tree_fits_shwi_p (op1)) 3519 { 3520 scale = tree_to_shwi (op1); 3521 off = op0; 3522 continue; 3523 } 3524 break; 3525 case SSA_NAME: 3526 off = op0; 3527 continue; 3528 CASE_CONVERT: 3529 if (!POINTER_TYPE_P (TREE_TYPE (op0)) 3530 && !INTEGRAL_TYPE_P (TREE_TYPE (op0))) 3531 break; 3532 if (TYPE_PRECISION (TREE_TYPE (op0)) 3533 == TYPE_PRECISION (TREE_TYPE (off))) 3534 { 3535 off = op0; 3536 continue; 3537 } 3538 if (TYPE_PRECISION (TREE_TYPE (op0)) 3539 < TYPE_PRECISION (TREE_TYPE (off))) 3540 { 3541 off = op0; 3542 offtype = TREE_TYPE (off); 3543 STRIP_NOPS (off); 3544 continue; 3545 } 3546 break; 3547 default: 3548 break; 3549 } 3550 break; 3551 } 3552 3553 /* If at the end OFF still isn't a SSA_NAME or isn't 3554 defined in the loop, punt. */ 3555 if (TREE_CODE (off) != SSA_NAME 3556 || expr_invariant_in_loop_p (loop, off)) 3557 return false; 3558 3559 if (offtype == NULL_TREE) 3560 offtype = TREE_TYPE (off); 3561 3562 if (DR_IS_READ (dr)) 3563 decl = targetm.vectorize.builtin_gather (STMT_VINFO_VECTYPE (stmt_info), 3564 offtype, scale); 3565 else 3566 decl = targetm.vectorize.builtin_scatter (STMT_VINFO_VECTYPE (stmt_info), 3567 offtype, scale); 3568 3569 if (decl == NULL_TREE) 3570 return false; 3571 3572 info->decl = decl; 3573 info->base = base; 3574 info->offset = off; 3575 info->offset_dt = vect_unknown_def_type; 3576 info->offset_vectype = NULL_TREE; 3577 info->scale = scale; 3578 return true; 3579 } 3580 3581 /* Function vect_analyze_data_refs. 3582 3583 Find all the data references in the loop or basic block. 3584 3585 The general structure of the analysis of data refs in the vectorizer is as 3586 follows: 3587 1- vect_analyze_data_refs(loop/bb): call 3588 compute_data_dependences_for_loop/bb to find and analyze all data-refs 3589 in the loop/bb and their dependences. 3590 2- vect_analyze_dependences(): apply dependence testing using ddrs. 3591 3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok. 3592 4- vect_analyze_drs_access(): check that ref_stmt.step is ok. 
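As an informal sketch (not tied to any particular testcase), for a simple loop such as 'for (i = 0; i < N; i++) a[i] = x;' with 'short a[N]', the store to a[i] is described roughly by DR_BASE_ADDRESS = &a, DR_OFFSET = 0, DR_INIT = 0 and DR_STEP = 2 (the element size in bytes); steps 2-4 above then test the dependences, alignment and access pattern of such descriptions.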
3593 3594 */ 3595 3596 bool 3597 vect_analyze_data_refs (vec_info *vinfo, int *min_vf) 3598 { 3599 struct loop *loop = NULL; 3600 unsigned int i; 3601 struct data_reference *dr; 3602 tree scalar_type; 3603 3604 if (dump_enabled_p ()) 3605 dump_printf_loc (MSG_NOTE, vect_location, 3606 "=== vect_analyze_data_refs ===\n"); 3607 3608 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo)) 3609 loop = LOOP_VINFO_LOOP (loop_vinfo); 3610 3611 /* Go through the data-refs, check that the analysis succeeded. Update 3612 pointer from stmt_vec_info struct to DR and vectype. */ 3613 3614 vec<data_reference_p> datarefs = vinfo->datarefs; 3615 FOR_EACH_VEC_ELT (datarefs, i, dr) 3616 { 3617 gimple *stmt; 3618 stmt_vec_info stmt_info; 3619 tree base, offset, init; 3620 enum { SG_NONE, GATHER, SCATTER } gatherscatter = SG_NONE; 3621 bool simd_lane_access = false; 3622 int vf; 3623 3624 again: 3625 if (!dr || !DR_REF (dr)) 3626 { 3627 if (dump_enabled_p ()) 3628 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 3629 "not vectorized: unhandled data-ref\n"); 3630 return false; 3631 } 3632 3633 stmt = DR_STMT (dr); 3634 stmt_info = vinfo_for_stmt (stmt); 3635 3636 /* Discard clobbers from the dataref vector. We will remove 3637 clobber stmts during vectorization. */ 3638 if (gimple_clobber_p (stmt)) 3639 { 3640 free_data_ref (dr); 3641 if (i == datarefs.length () - 1) 3642 { 3643 datarefs.pop (); 3644 break; 3645 } 3646 datarefs.ordered_remove (i); 3647 dr = datarefs[i]; 3648 goto again; 3649 } 3650 3651 /* Check that analysis of the data-ref succeeded. */ 3652 if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr) 3653 || !DR_STEP (dr)) 3654 { 3655 bool maybe_gather 3656 = DR_IS_READ (dr) 3657 && !TREE_THIS_VOLATILE (DR_REF (dr)) 3658 && targetm.vectorize.builtin_gather != NULL; 3659 bool maybe_scatter 3660 = DR_IS_WRITE (dr) 3661 && !TREE_THIS_VOLATILE (DR_REF (dr)) 3662 && targetm.vectorize.builtin_scatter != NULL; 3663 bool maybe_simd_lane_access 3664 = is_a <loop_vec_info> (vinfo) && loop->simduid; 3665 3666 /* If target supports vector gather loads or scatter stores, or if 3667 this might be a SIMD lane access, see if they can't be used. */ 3668 if (is_a <loop_vec_info> (vinfo) 3669 && (maybe_gather || maybe_scatter || maybe_simd_lane_access) 3670 && !nested_in_vect_loop_p (loop, stmt)) 3671 { 3672 struct data_reference *newdr 3673 = create_data_ref (NULL, loop_containing_stmt (stmt), 3674 DR_REF (dr), stmt, maybe_scatter ? 
false : true); 3675 gcc_assert (newdr != NULL && DR_REF (newdr)); 3676 if (DR_BASE_ADDRESS (newdr) 3677 && DR_OFFSET (newdr) 3678 && DR_INIT (newdr) 3679 && DR_STEP (newdr) 3680 && integer_zerop (DR_STEP (newdr))) 3681 { 3682 if (maybe_simd_lane_access) 3683 { 3684 tree off = DR_OFFSET (newdr); 3685 STRIP_NOPS (off); 3686 if (TREE_CODE (DR_INIT (newdr)) == INTEGER_CST 3687 && TREE_CODE (off) == MULT_EXPR 3688 && tree_fits_uhwi_p (TREE_OPERAND (off, 1))) 3689 { 3690 tree step = TREE_OPERAND (off, 1); 3691 off = TREE_OPERAND (off, 0); 3692 STRIP_NOPS (off); 3693 if (CONVERT_EXPR_P (off) 3694 && TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off, 3695 0))) 3696 < TYPE_PRECISION (TREE_TYPE (off))) 3697 off = TREE_OPERAND (off, 0); 3698 if (TREE_CODE (off) == SSA_NAME) 3699 { 3700 gimple *def = SSA_NAME_DEF_STMT (off); 3701 tree reft = TREE_TYPE (DR_REF (newdr)); 3702 if (is_gimple_call (def) 3703 && gimple_call_internal_p (def) 3704 && (gimple_call_internal_fn (def) 3705 == IFN_GOMP_SIMD_LANE)) 3706 { 3707 tree arg = gimple_call_arg (def, 0); 3708 gcc_assert (TREE_CODE (arg) == SSA_NAME); 3709 arg = SSA_NAME_VAR (arg); 3710 if (arg == loop->simduid 3711 /* For now. */ 3712 && tree_int_cst_equal 3713 (TYPE_SIZE_UNIT (reft), 3714 step)) 3715 { 3716 DR_OFFSET (newdr) = ssize_int (0); 3717 DR_STEP (newdr) = step; 3718 DR_ALIGNED_TO (newdr) 3719 = size_int (BIGGEST_ALIGNMENT); 3720 dr = newdr; 3721 simd_lane_access = true; 3722 } 3723 } 3724 } 3725 } 3726 } 3727 if (!simd_lane_access && (maybe_gather || maybe_scatter)) 3728 { 3729 dr = newdr; 3730 if (maybe_gather) 3731 gatherscatter = GATHER; 3732 else 3733 gatherscatter = SCATTER; 3734 } 3735 } 3736 if (gatherscatter == SG_NONE && !simd_lane_access) 3737 free_data_ref (newdr); 3738 } 3739 3740 if (gatherscatter == SG_NONE && !simd_lane_access) 3741 { 3742 if (dump_enabled_p ()) 3743 { 3744 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 3745 "not vectorized: data ref analysis " 3746 "failed "); 3747 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0); 3748 } 3749 3750 if (is_a <bb_vec_info> (vinfo)) 3751 break; 3752 3753 return false; 3754 } 3755 } 3756 3757 if (TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST) 3758 { 3759 if (dump_enabled_p ()) 3760 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 3761 "not vectorized: base addr of dr is a " 3762 "constant\n"); 3763 3764 if (is_a <bb_vec_info> (vinfo)) 3765 break; 3766 3767 if (gatherscatter != SG_NONE || simd_lane_access) 3768 free_data_ref (dr); 3769 return false; 3770 } 3771 3772 if (TREE_THIS_VOLATILE (DR_REF (dr))) 3773 { 3774 if (dump_enabled_p ()) 3775 { 3776 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 3777 "not vectorized: volatile type "); 3778 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0); 3779 } 3780 3781 if (is_a <bb_vec_info> (vinfo)) 3782 break; 3783 3784 return false; 3785 } 3786 3787 if (stmt_can_throw_internal (stmt)) 3788 { 3789 if (dump_enabled_p ()) 3790 { 3791 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 3792 "not vectorized: statement can throw an " 3793 "exception "); 3794 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0); 3795 } 3796 3797 if (is_a <bb_vec_info> (vinfo)) 3798 break; 3799 3800 if (gatherscatter != SG_NONE || simd_lane_access) 3801 free_data_ref (dr); 3802 return false; 3803 } 3804 3805 if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF 3806 && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1))) 3807 { 3808 if (dump_enabled_p ()) 3809 { 3810 dump_printf_loc (MSG_MISSED_OPTIMIZATION, 
vect_location, 3811 "not vectorized: statement is bitfield " 3812 "access "); 3813 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0); 3814 } 3815 3816 if (is_a <bb_vec_info> (vinfo)) 3817 break; 3818 3819 if (gatherscatter != SG_NONE || simd_lane_access) 3820 free_data_ref (dr); 3821 return false; 3822 } 3823 3824 base = unshare_expr (DR_BASE_ADDRESS (dr)); 3825 offset = unshare_expr (DR_OFFSET (dr)); 3826 init = unshare_expr (DR_INIT (dr)); 3827 3828 if (is_gimple_call (stmt) 3829 && (!gimple_call_internal_p (stmt) 3830 || (gimple_call_internal_fn (stmt) != IFN_MASK_LOAD 3831 && gimple_call_internal_fn (stmt) != IFN_MASK_STORE))) 3832 { 3833 if (dump_enabled_p ()) 3834 { 3835 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 3836 "not vectorized: dr in a call "); 3837 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0); 3838 } 3839 3840 if (is_a <bb_vec_info> (vinfo)) 3841 break; 3842 3843 if (gatherscatter != SG_NONE || simd_lane_access) 3844 free_data_ref (dr); 3845 return false; 3846 } 3847 3848 /* Update DR field in stmt_vec_info struct. */ 3849 3850 /* If the dataref is in an inner-loop of the loop that is considered for 3851 vectorization, we also want to analyze the access relative to 3852 the outer-loop (DR contains information only relative to the 3853 inner-most enclosing loop). We do that by building a reference to the 3854 first location accessed by the inner-loop, and analyzing it relative to 3855 the outer-loop. */ 3856 if (loop && nested_in_vect_loop_p (loop, stmt)) 3857 { 3858 tree outer_step, outer_base, outer_init; 3859 HOST_WIDE_INT pbitsize, pbitpos; 3860 tree poffset; 3861 machine_mode pmode; 3862 int punsignedp, preversep, pvolatilep; 3863 affine_iv base_iv, offset_iv; 3864 tree dinit; 3865 3866 /* Build a reference to the first location accessed by the 3867 inner-loop: *(BASE+INIT). (The first location is actually 3868 BASE+INIT+OFFSET, but we add OFFSET separately later).
*/ 3869 tree inner_base = build_fold_indirect_ref 3870 (fold_build_pointer_plus (base, init)); 3871 3872 if (dump_enabled_p ()) 3873 { 3874 dump_printf_loc (MSG_NOTE, vect_location, 3875 "analyze in outer-loop: "); 3876 dump_generic_expr (MSG_NOTE, TDF_SLIM, inner_base); 3877 dump_printf (MSG_NOTE, "\n"); 3878 } 3879 3880 outer_base = get_inner_reference (inner_base, &pbitsize, &pbitpos, 3881 &poffset, &pmode, &punsignedp, 3882 &preversep, &pvolatilep); 3883 gcc_assert (outer_base != NULL_TREE); 3884 3885 if (pbitpos % BITS_PER_UNIT != 0) 3886 { 3887 if (dump_enabled_p ()) 3888 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 3889 "failed: bit offset alignment.\n"); 3890 return false; 3891 } 3892 3893 if (preversep) 3894 { 3895 if (dump_enabled_p ()) 3896 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 3897 "failed: reverse storage order.\n"); 3898 return false; 3899 } 3900 3901 outer_base = build_fold_addr_expr (outer_base); 3902 if (!simple_iv (loop, loop_containing_stmt (stmt), outer_base, 3903 &base_iv, false)) 3904 { 3905 if (dump_enabled_p ()) 3906 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 3907 "failed: evolution of base is not affine.\n"); 3908 return false; 3909 } 3910 3911 if (offset) 3912 { 3913 if (poffset) 3914 poffset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset), offset, 3915 poffset); 3916 else 3917 poffset = offset; 3918 } 3919 3920 if (!poffset) 3921 { 3922 offset_iv.base = ssize_int (0); 3923 offset_iv.step = ssize_int (0); 3924 } 3925 else if (!simple_iv (loop, loop_containing_stmt (stmt), poffset, 3926 &offset_iv, false)) 3927 { 3928 if (dump_enabled_p ()) 3929 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 3930 "evolution of offset is not affine.\n"); 3931 return false; 3932 } 3933 3934 outer_init = ssize_int (pbitpos / BITS_PER_UNIT); 3935 split_constant_offset (base_iv.base, &base_iv.base, &dinit); 3936 outer_init = size_binop (PLUS_EXPR, outer_init, dinit); 3937 split_constant_offset (offset_iv.base, &offset_iv.base, &dinit); 3938 outer_init = size_binop (PLUS_EXPR, outer_init, dinit); 3939 3940 outer_step = size_binop (PLUS_EXPR, 3941 fold_convert (ssizetype, base_iv.step), 3942 fold_convert (ssizetype, offset_iv.step)); 3943 3944 STMT_VINFO_DR_STEP (stmt_info) = outer_step; 3945 /* FIXME: Use canonicalize_base_object_address (base_iv.base); */ 3946 STMT_VINFO_DR_BASE_ADDRESS (stmt_info) = base_iv.base; 3947 STMT_VINFO_DR_INIT (stmt_info) = outer_init; 3948 STMT_VINFO_DR_OFFSET (stmt_info) = 3949 fold_convert (ssizetype, offset_iv.base); 3950 STMT_VINFO_DR_ALIGNED_TO (stmt_info) = 3951 size_int (highest_pow2_factor (offset_iv.base)); 3952 3953 if (dump_enabled_p ()) 3954 { 3955 dump_printf_loc (MSG_NOTE, vect_location, 3956 "\touter base_address: "); 3957 dump_generic_expr (MSG_NOTE, TDF_SLIM, 3958 STMT_VINFO_DR_BASE_ADDRESS (stmt_info)); 3959 dump_printf (MSG_NOTE, "\n\touter offset from base address: "); 3960 dump_generic_expr (MSG_NOTE, TDF_SLIM, 3961 STMT_VINFO_DR_OFFSET (stmt_info)); 3962 dump_printf (MSG_NOTE, 3963 "\n\touter constant offset from base address: "); 3964 dump_generic_expr (MSG_NOTE, TDF_SLIM, 3965 STMT_VINFO_DR_INIT (stmt_info)); 3966 dump_printf (MSG_NOTE, "\n\touter step: "); 3967 dump_generic_expr (MSG_NOTE, TDF_SLIM, 3968 STMT_VINFO_DR_STEP (stmt_info)); 3969 dump_printf (MSG_NOTE, "\n\touter aligned to: "); 3970 dump_generic_expr (MSG_NOTE, TDF_SLIM, 3971 STMT_VINFO_DR_ALIGNED_TO (stmt_info)); 3972 dump_printf (MSG_NOTE, "\n"); 3973 } 3974 } 3975 3976 if (STMT_VINFO_DATA_REF (stmt_info)) 3977 { 3978 
if (dump_enabled_p ()) 3979 { 3980 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 3981 "not vectorized: more than one data ref " 3982 "in stmt: "); 3983 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0); 3984 } 3985 3986 if (is_a <bb_vec_info> (vinfo)) 3987 break; 3988 3989 if (gatherscatter != SG_NONE || simd_lane_access) 3990 free_data_ref (dr); 3991 return false; 3992 } 3993 3994 STMT_VINFO_DATA_REF (stmt_info) = dr; 3995 if (simd_lane_access) 3996 { 3997 STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) = true; 3998 free_data_ref (datarefs[i]); 3999 datarefs[i] = dr; 4000 } 4001 4002 if (TREE_CODE (DR_BASE_ADDRESS (dr)) == ADDR_EXPR 4003 && VAR_P (TREE_OPERAND (DR_BASE_ADDRESS (dr), 0)) 4004 && DECL_NONALIASED (TREE_OPERAND (DR_BASE_ADDRESS (dr), 0))) 4005 { 4006 if (dump_enabled_p ()) 4007 { 4008 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 4009 "not vectorized: base object not addressable " 4010 "for stmt: "); 4011 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0); 4012 } 4013 if (is_a <bb_vec_info> (vinfo)) 4014 { 4015 /* In BB vectorization the ref can still participate 4016 in dependence analysis, we just can't vectorize it. */ 4017 STMT_VINFO_VECTORIZABLE (stmt_info) = false; 4018 continue; 4019 } 4020 return false; 4021 } 4022 4023 /* Set vectype for STMT. */ 4024 scalar_type = TREE_TYPE (DR_REF (dr)); 4025 STMT_VINFO_VECTYPE (stmt_info) 4026 = get_vectype_for_scalar_type (scalar_type); 4027 if (!STMT_VINFO_VECTYPE (stmt_info)) 4028 { 4029 if (dump_enabled_p ()) 4030 { 4031 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 4032 "not vectorized: no vectype for stmt: "); 4033 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0); 4034 dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: "); 4035 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS, 4036 scalar_type); 4037 dump_printf (MSG_MISSED_OPTIMIZATION, "\n"); 4038 } 4039 4040 if (is_a <bb_vec_info> (vinfo)) 4041 { 4042 /* No vector type is fine, the ref can still participate 4043 in dependence analysis, we just can't vectorize it. */ 4044 STMT_VINFO_VECTORIZABLE (stmt_info) = false; 4045 continue; 4046 } 4047 4048 if (gatherscatter != SG_NONE || simd_lane_access) 4049 { 4050 STMT_VINFO_DATA_REF (stmt_info) = NULL; 4051 if (gatherscatter != SG_NONE) 4052 free_data_ref (dr); 4053 } 4054 return false; 4055 } 4056 else 4057 { 4058 if (dump_enabled_p ()) 4059 { 4060 dump_printf_loc (MSG_NOTE, vect_location, 4061 "got vectype for stmt: "); 4062 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0); 4063 dump_generic_expr (MSG_NOTE, TDF_SLIM, 4064 STMT_VINFO_VECTYPE (stmt_info)); 4065 dump_printf (MSG_NOTE, "\n"); 4066 } 4067 } 4068 4069 /* Adjust the minimal vectorization factor according to the 4070 vector type. */ 4071 vf = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)); 4072 if (vf > *min_vf) 4073 *min_vf = vf; 4074 4075 if (gatherscatter != SG_NONE) 4076 { 4077 gather_scatter_info gs_info; 4078 if (!vect_check_gather_scatter (stmt, as_a <loop_vec_info> (vinfo), 4079 &gs_info) 4080 || !get_vectype_for_scalar_type (TREE_TYPE (gs_info.offset))) 4081 { 4082 STMT_VINFO_DATA_REF (stmt_info) = NULL; 4083 free_data_ref (dr); 4084 if (dump_enabled_p ()) 4085 { 4086 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 4087 (gatherscatter == GATHER) ? 
4088 "not vectorized: not suitable for gather " 4089 "load " : 4090 "not vectorized: not suitable for scatter " 4091 "store "); 4092 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0); 4093 } 4094 return false; 4095 } 4096 4097 free_data_ref (datarefs[i]); 4098 datarefs[i] = dr; 4099 STMT_VINFO_GATHER_SCATTER_P (stmt_info) = gatherscatter; 4100 } 4101 4102 else if (is_a <loop_vec_info> (vinfo) 4103 && TREE_CODE (DR_STEP (dr)) != INTEGER_CST) 4104 { 4105 if (nested_in_vect_loop_p (loop, stmt)) 4106 { 4107 if (dump_enabled_p ()) 4108 { 4109 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 4110 "not vectorized: not suitable for strided " 4111 "load "); 4112 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0); 4113 } 4114 return false; 4115 } 4116 STMT_VINFO_STRIDED_P (stmt_info) = true; 4117 } 4118 } 4119 4120 /* If we stopped analysis at the first dataref we could not analyze 4121 when trying to vectorize a basic-block mark the rest of the datarefs 4122 as not vectorizable and truncate the vector of datarefs. That 4123 avoids spending useless time in analyzing their dependence. */ 4124 if (i != datarefs.length ()) 4125 { 4126 gcc_assert (is_a <bb_vec_info> (vinfo)); 4127 for (unsigned j = i; j < datarefs.length (); ++j) 4128 { 4129 data_reference_p dr = datarefs[j]; 4130 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false; 4131 free_data_ref (dr); 4132 } 4133 datarefs.truncate (i); 4134 } 4135 4136 return true; 4137 } 4138 4139 4140 /* Function vect_get_new_vect_var. 4141 4142 Returns a name for a new variable. The current naming scheme appends the 4143 prefix "vect_" or "vect_p" (depending on the value of VAR_KIND) to 4144 the name of vectorizer generated variables, and appends that to NAME if 4145 provided. */ 4146 4147 tree 4148 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name) 4149 { 4150 const char *prefix; 4151 tree new_vect_var; 4152 4153 switch (var_kind) 4154 { 4155 case vect_simple_var: 4156 prefix = "vect"; 4157 break; 4158 case vect_scalar_var: 4159 prefix = "stmp"; 4160 break; 4161 case vect_mask_var: 4162 prefix = "mask"; 4163 break; 4164 case vect_pointer_var: 4165 prefix = "vectp"; 4166 break; 4167 default: 4168 gcc_unreachable (); 4169 } 4170 4171 if (name) 4172 { 4173 char* tmp = concat (prefix, "_", name, NULL); 4174 new_vect_var = create_tmp_reg (type, tmp); 4175 free (tmp); 4176 } 4177 else 4178 new_vect_var = create_tmp_reg (type, prefix); 4179 4180 return new_vect_var; 4181 } 4182 4183 /* Like vect_get_new_vect_var but return an SSA name. */ 4184 4185 tree 4186 vect_get_new_ssa_name (tree type, enum vect_var_kind var_kind, const char *name) 4187 { 4188 const char *prefix; 4189 tree new_vect_var; 4190 4191 switch (var_kind) 4192 { 4193 case vect_simple_var: 4194 prefix = "vect"; 4195 break; 4196 case vect_scalar_var: 4197 prefix = "stmp"; 4198 break; 4199 case vect_pointer_var: 4200 prefix = "vectp"; 4201 break; 4202 default: 4203 gcc_unreachable (); 4204 } 4205 4206 if (name) 4207 { 4208 char* tmp = concat (prefix, "_", name, NULL); 4209 new_vect_var = make_temp_ssa_name (type, NULL, tmp); 4210 free (tmp); 4211 } 4212 else 4213 new_vect_var = make_temp_ssa_name (type, NULL, prefix); 4214 4215 return new_vect_var; 4216 } 4217 4218 /* Duplicate ptr info and set alignment/misaligment on NAME from DR. 
*/ 4219 4220 static void 4221 vect_duplicate_ssa_name_ptr_info (tree name, data_reference *dr, 4222 stmt_vec_info stmt_info) 4223 { 4224 duplicate_ssa_name_ptr_info (name, DR_PTR_INFO (dr)); 4225 unsigned int align = TYPE_ALIGN_UNIT (STMT_VINFO_VECTYPE (stmt_info)); 4226 int misalign = DR_MISALIGNMENT (dr); 4227 if (misalign == -1) 4228 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name)); 4229 else 4230 set_ptr_info_alignment (SSA_NAME_PTR_INFO (name), align, misalign); 4231 } 4232 4233 /* Function vect_create_addr_base_for_vector_ref. 4234 4235 Create an expression that computes the address of the first memory location 4236 that will be accessed for a data reference. 4237 4238 Input: 4239 STMT: The statement containing the data reference. 4240 NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list. 4241 OFFSET: Optional. If supplied, it is added to the initial address. 4242 LOOP: Specify the loop nest relative to which the address should be 4243 computed. For example, when the dataref is in an inner-loop nested in an 4244 outer-loop that is now being vectorized, LOOP can be either the 4245 outer-loop, or the inner-loop. The first memory location accessed 4246 by the following dataref ('in' points to short): 4247 4248 for (i=0; i<N; i++) 4249 for (j=0; j<M; j++) 4250 s += in[i+j] 4251 4252 is as follows: 4253 if LOOP=i_loop: &in (relative to i_loop) 4254 if LOOP=j_loop: &in+i*2B (relative to j_loop) 4255 BYTE_OFFSET: Optional, defaulted to NULL. If supplied, it is added to the 4256 initial address. Unlike OFFSET, which is a number of elements to 4257 be added, BYTE_OFFSET is measured in bytes. 4258 4259 Output: 4260 1. Return an SSA_NAME whose value is the address of the memory location of 4261 the first vector of the data reference. 4262 2. If new_stmt_list is not NULL_TREE after return then the caller must insert 4263 these statement(s) which define the returned SSA_NAME. 4264 4265 FORNOW: We are only handling array accesses with step 1.
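As a rough illustration of the OFFSET/BYTE_OFFSET handling (continuing the example above, where 'in' points to short and LOOP=j_loop): requesting OFFSET == 4 yields an address of the form &in + i*2B + 4*2B, because OFFSET is first scaled by the element size, whereas requesting BYTE_OFFSET == 4 yields &in + i*2B + 4B, the byte offset being added as-is.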
*/ 4266 4267 tree 4268 vect_create_addr_base_for_vector_ref (gimple *stmt, 4269 gimple_seq *new_stmt_list, 4270 tree offset, 4271 struct loop *loop, 4272 tree byte_offset) 4273 { 4274 stmt_vec_info stmt_info = vinfo_for_stmt (stmt); 4275 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info); 4276 tree data_ref_base; 4277 const char *base_name; 4278 tree addr_base; 4279 tree dest; 4280 gimple_seq seq = NULL; 4281 tree base_offset; 4282 tree init; 4283 tree vect_ptr_type; 4284 tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr))); 4285 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); 4286 4287 if (loop_vinfo && loop && loop != (gimple_bb (stmt))->loop_father) 4288 { 4289 struct loop *outer_loop = LOOP_VINFO_LOOP (loop_vinfo); 4290 4291 gcc_assert (nested_in_vect_loop_p (outer_loop, stmt)); 4292 4293 data_ref_base = unshare_expr (STMT_VINFO_DR_BASE_ADDRESS (stmt_info)); 4294 base_offset = unshare_expr (STMT_VINFO_DR_OFFSET (stmt_info)); 4295 init = unshare_expr (STMT_VINFO_DR_INIT (stmt_info)); 4296 } 4297 else 4298 { 4299 data_ref_base = unshare_expr (DR_BASE_ADDRESS (dr)); 4300 base_offset = unshare_expr (DR_OFFSET (dr)); 4301 init = unshare_expr (DR_INIT (dr)); 4302 } 4303 4304 if (loop_vinfo) 4305 base_name = get_name (data_ref_base); 4306 else 4307 { 4308 base_offset = ssize_int (0); 4309 init = ssize_int (0); 4310 base_name = get_name (DR_REF (dr)); 4311 } 4312 4313 /* Create base_offset */ 4314 base_offset = size_binop (PLUS_EXPR, 4315 fold_convert (sizetype, base_offset), 4316 fold_convert (sizetype, init)); 4317 4318 if (offset) 4319 { 4320 offset = fold_build2 (MULT_EXPR, sizetype, 4321 fold_convert (sizetype, offset), step); 4322 base_offset = fold_build2 (PLUS_EXPR, sizetype, 4323 base_offset, offset); 4324 } 4325 if (byte_offset) 4326 { 4327 byte_offset = fold_convert (sizetype, byte_offset); 4328 base_offset = fold_build2 (PLUS_EXPR, sizetype, 4329 base_offset, byte_offset); 4330 } 4331 4332 /* base + base_offset */ 4333 if (loop_vinfo) 4334 addr_base = fold_build_pointer_plus (data_ref_base, base_offset); 4335 else 4336 { 4337 addr_base = build1 (ADDR_EXPR, 4338 build_pointer_type (TREE_TYPE (DR_REF (dr))), 4339 unshare_expr (DR_REF (dr))); 4340 } 4341 4342 vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info)); 4343 dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name); 4344 addr_base = force_gimple_operand (addr_base, &seq, true, dest); 4345 gimple_seq_add_seq (new_stmt_list, seq); 4346 4347 if (DR_PTR_INFO (dr) 4348 && TREE_CODE (addr_base) == SSA_NAME 4349 && !SSA_NAME_PTR_INFO (addr_base)) 4350 { 4351 vect_duplicate_ssa_name_ptr_info (addr_base, dr, stmt_info); 4352 if (offset || byte_offset) 4353 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr_base)); 4354 } 4355 4356 if (dump_enabled_p ()) 4357 { 4358 dump_printf_loc (MSG_NOTE, vect_location, "created "); 4359 dump_generic_expr (MSG_NOTE, TDF_SLIM, addr_base); 4360 dump_printf (MSG_NOTE, "\n"); 4361 } 4362 4363 return addr_base; 4364 } 4365 4366 4367 /* Function vect_create_data_ref_ptr. 4368 4369 Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first 4370 location accessed in the loop by STMT, along with the def-use update 4371 chain to appropriately advance the pointer through the loop iterations. 4372 Also set aliasing information for the pointer. This pointer is used by 4373 the callers to this function to create a memory reference expression for 4374 vector load/store access. 4375 4376 Input: 4377 1. STMT: a stmt that references memory. 
Expected to be of the form 4378 GIMPLE_ASSIGN <name, data-ref> or 4379 GIMPLE_ASSIGN <data-ref, name>. 4380 2. AGGR_TYPE: the type of the reference, which should be either a vector 4381 or an array. 4382 3. AT_LOOP: the loop where the vector memref is to be created. 4383 4. OFFSET (optional): an offset to be added to the initial address accessed 4384 by the data-ref in STMT. 4385 5. BSI: location where the new stmts are to be placed if there is no loop 4386 6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain 4387 pointing to the initial address. 4388 7. BYTE_OFFSET (optional, defaults to NULL): a byte offset to be added 4389 to the initial address accessed by the data-ref in STMT. This is 4390 similar to OFFSET, but OFFSET is counted in elements, while BYTE_OFFSET 4391 in bytes. 4392 4393 Output: 4394 1. Declare a new ptr to vector_type, and have it point to the base of the 4395 data reference (initial addressed accessed by the data reference). 4396 For example, for vector of type V8HI, the following code is generated: 4397 4398 v8hi *ap; 4399 ap = (v8hi *)initial_address; 4400 4401 if OFFSET is not supplied: 4402 initial_address = &a[init]; 4403 if OFFSET is supplied: 4404 initial_address = &a[init + OFFSET]; 4405 if BYTE_OFFSET is supplied: 4406 initial_address = &a[init] + BYTE_OFFSET; 4407 4408 Return the initial_address in INITIAL_ADDRESS. 4409 4410 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also 4411 update the pointer in each iteration of the loop. 4412 4413 Return the increment stmt that updates the pointer in PTR_INCR. 4414 4415 3. Set INV_P to true if the access pattern of the data reference in the 4416 vectorized loop is invariant. Set it to false otherwise. 4417 4418 4. Return the pointer. */ 4419 4420 tree 4421 vect_create_data_ref_ptr (gimple *stmt, tree aggr_type, struct loop *at_loop, 4422 tree offset, tree *initial_address, 4423 gimple_stmt_iterator *gsi, gimple **ptr_incr, 4424 bool only_init, bool *inv_p, tree byte_offset) 4425 { 4426 const char *base_name; 4427 stmt_vec_info stmt_info = vinfo_for_stmt (stmt); 4428 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); 4429 struct loop *loop = NULL; 4430 bool nested_in_vect_loop = false; 4431 struct loop *containing_loop = NULL; 4432 tree aggr_ptr_type; 4433 tree aggr_ptr; 4434 tree new_temp; 4435 gimple_seq new_stmt_list = NULL; 4436 edge pe = NULL; 4437 basic_block new_bb; 4438 tree aggr_ptr_init; 4439 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info); 4440 tree aptr; 4441 gimple_stmt_iterator incr_gsi; 4442 bool insert_after; 4443 tree indx_before_incr, indx_after_incr; 4444 gimple *incr; 4445 tree step; 4446 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info); 4447 4448 gcc_assert (TREE_CODE (aggr_type) == ARRAY_TYPE 4449 || TREE_CODE (aggr_type) == VECTOR_TYPE); 4450 4451 if (loop_vinfo) 4452 { 4453 loop = LOOP_VINFO_LOOP (loop_vinfo); 4454 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt); 4455 containing_loop = (gimple_bb (stmt))->loop_father; 4456 pe = loop_preheader_edge (loop); 4457 } 4458 else 4459 { 4460 gcc_assert (bb_vinfo); 4461 only_init = true; 4462 *ptr_incr = NULL; 4463 } 4464 4465 /* Check the step (evolution) of the load in LOOP, and record 4466 whether it's invariant. 
*/ 4467 if (nested_in_vect_loop) 4468 step = STMT_VINFO_DR_STEP (stmt_info); 4469 else 4470 step = DR_STEP (STMT_VINFO_DATA_REF (stmt_info)); 4471 4472 if (integer_zerop (step)) 4473 *inv_p = true; 4474 else 4475 *inv_p = false; 4476 4477 /* Create an expression for the first address accessed by this load 4478 in LOOP. */ 4479 base_name = get_name (DR_BASE_ADDRESS (dr)); 4480 4481 if (dump_enabled_p ()) 4482 { 4483 tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr)); 4484 dump_printf_loc (MSG_NOTE, vect_location, 4485 "create %s-pointer variable to type: ", 4486 get_tree_code_name (TREE_CODE (aggr_type))); 4487 dump_generic_expr (MSG_NOTE, TDF_SLIM, aggr_type); 4488 if (TREE_CODE (dr_base_type) == ARRAY_TYPE) 4489 dump_printf (MSG_NOTE, " vectorizing an array ref: "); 4490 else if (TREE_CODE (dr_base_type) == VECTOR_TYPE) 4491 dump_printf (MSG_NOTE, " vectorizing a vector ref: "); 4492 else if (TREE_CODE (dr_base_type) == RECORD_TYPE) 4493 dump_printf (MSG_NOTE, " vectorizing a record based array ref: "); 4494 else 4495 dump_printf (MSG_NOTE, " vectorizing a pointer ref: "); 4496 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_BASE_OBJECT (dr)); 4497 dump_printf (MSG_NOTE, "\n"); 4498 } 4499 4500 /* (1) Create the new aggregate-pointer variable. 4501 Vector and array types inherit the alias set of their component 4502 type by default so we need to use a ref-all pointer if the data 4503 reference does not conflict with the created aggregated data 4504 reference because it is not addressable. */ 4505 bool need_ref_all = false; 4506 if (!alias_sets_conflict_p (get_alias_set (aggr_type), 4507 get_alias_set (DR_REF (dr)))) 4508 need_ref_all = true; 4509 /* Likewise for any of the data references in the stmt group. */ 4510 else if (STMT_VINFO_GROUP_SIZE (stmt_info) > 1) 4511 { 4512 gimple *orig_stmt = STMT_VINFO_GROUP_FIRST_ELEMENT (stmt_info); 4513 do 4514 { 4515 stmt_vec_info sinfo = vinfo_for_stmt (orig_stmt); 4516 struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo); 4517 if (!alias_sets_conflict_p (get_alias_set (aggr_type), 4518 get_alias_set (DR_REF (sdr)))) 4519 { 4520 need_ref_all = true; 4521 break; 4522 } 4523 orig_stmt = STMT_VINFO_GROUP_NEXT_ELEMENT (sinfo); 4524 } 4525 while (orig_stmt); 4526 } 4527 aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode, 4528 need_ref_all); 4529 aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name); 4530 4531 4532 /* Note: If the dataref is in an inner-loop nested in LOOP, and we are 4533 vectorizing LOOP (i.e., outer-loop vectorization), we need to create two 4534 def-use update cycles for the pointer: one relative to the outer-loop 4535 (LOOP), which is what steps (3) and (4) below do. The other is relative 4536 to the inner-loop (which is the inner-most loop containing the dataref), 4537 and this is done be step (5) below. 4538 4539 When vectorizing inner-most loops, the vectorized loop (LOOP) is also the 4540 inner-most loop, and so steps (3),(4) work the same, and step (5) is 4541 redundant. Steps (3),(4) create the following: 4542 4543 vp0 = &base_addr; 4544 LOOP: vp1 = phi(vp0,vp2) 4545 ... 4546 ... 4547 vp2 = vp1 + step 4548 goto LOOP 4549 4550 If there is an inner-loop nested in loop, then step (5) will also be 4551 applied, and an additional update in the inner-loop will be created: 4552 4553 vp0 = &base_addr; 4554 LOOP: vp1 = phi(vp0,vp2) 4555 ... 4556 inner: vp3 = phi(vp1,vp4) 4557 vp4 = vp3 + inner_step 4558 if () goto inner 4559 ... 
4560 vp2 = vp1 + step 4561 if () goto LOOP */ 4562 4563 /* (2) Calculate the initial address of the aggregate-pointer, and set 4564 the aggregate-pointer to point to it before the loop. */ 4565 4566 /* Create: (&(base[init_val+offset]+byte_offset) in the loop preheader. */ 4567 4568 new_temp = vect_create_addr_base_for_vector_ref (stmt, &new_stmt_list, 4569 offset, loop, byte_offset); 4570 if (new_stmt_list) 4571 { 4572 if (pe) 4573 { 4574 new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list); 4575 gcc_assert (!new_bb); 4576 } 4577 else 4578 gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT); 4579 } 4580 4581 *initial_address = new_temp; 4582 aggr_ptr_init = new_temp; 4583 4584 /* (3) Handle the updating of the aggregate-pointer inside the loop. 4585 This is needed when ONLY_INIT is false, and also when AT_LOOP is the 4586 inner-loop nested in LOOP (during outer-loop vectorization). */ 4587 4588 /* No update in loop is required. */ 4589 if (only_init && (!loop_vinfo || at_loop == loop)) 4590 aptr = aggr_ptr_init; 4591 else 4592 { 4593 /* The step of the aggregate pointer is the type size. */ 4594 tree iv_step = TYPE_SIZE_UNIT (aggr_type); 4595 /* One exception to the above is when the scalar step of the load in 4596 LOOP is zero. In this case the step here is also zero. */ 4597 if (*inv_p) 4598 iv_step = size_zero_node; 4599 else if (tree_int_cst_sgn (step) == -1) 4600 iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step); 4601 4602 standard_iv_increment_position (loop, &incr_gsi, &insert_after); 4603 4604 create_iv (aggr_ptr_init, 4605 fold_convert (aggr_ptr_type, iv_step), 4606 aggr_ptr, loop, &incr_gsi, insert_after, 4607 &indx_before_incr, &indx_after_incr); 4608 incr = gsi_stmt (incr_gsi); 4609 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo)); 4610 4611 /* Copy the points-to information if it exists. */ 4612 if (DR_PTR_INFO (dr)) 4613 { 4614 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr, stmt_info); 4615 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr, stmt_info); 4616 } 4617 if (ptr_incr) 4618 *ptr_incr = incr; 4619 4620 aptr = indx_before_incr; 4621 } 4622 4623 if (!nested_in_vect_loop || only_init) 4624 return aptr; 4625 4626 4627 /* (4) Handle the updating of the aggregate-pointer inside the inner-loop 4628 nested in LOOP, if exists. */ 4629 4630 gcc_assert (nested_in_vect_loop); 4631 if (!only_init) 4632 { 4633 standard_iv_increment_position (containing_loop, &incr_gsi, 4634 &insert_after); 4635 create_iv (aptr, fold_convert (aggr_ptr_type, DR_STEP (dr)), aggr_ptr, 4636 containing_loop, &incr_gsi, insert_after, &indx_before_incr, 4637 &indx_after_incr); 4638 incr = gsi_stmt (incr_gsi); 4639 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo)); 4640 4641 /* Copy the points-to information if it exists. */ 4642 if (DR_PTR_INFO (dr)) 4643 { 4644 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr, stmt_info); 4645 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr, stmt_info); 4646 } 4647 if (ptr_incr) 4648 *ptr_incr = incr; 4649 4650 return indx_before_incr; 4651 } 4652 else 4653 gcc_unreachable (); 4654 } 4655 4656 4657 /* Function bump_vector_ptr 4658 4659 Increment a pointer (to a vector type) by vector-size. If requested, 4660 i.e. 
if PTR-INCR is given, then also connect the new increment stmt 4661 to the existing def-use update-chain of the pointer, by modifying 4662 the PTR_INCR as illustrated below: 4663 4664 The pointer def-use update-chain before this function: 4665 DATAREF_PTR = phi (p_0, p_2) 4666 .... 4667 PTR_INCR: p_2 = DATAREF_PTR + step 4668 4669 The pointer def-use update-chain after this function: 4670 DATAREF_PTR = phi (p_0, p_2) 4671 .... 4672 NEW_DATAREF_PTR = DATAREF_PTR + BUMP 4673 .... 4674 PTR_INCR: p_2 = NEW_DATAREF_PTR + step 4675 4676 Input: 4677 DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated 4678 in the loop. 4679 PTR_INCR - optional. The stmt that updates the pointer in each iteration of 4680 the loop. The increment amount across iterations is expected 4681 to be vector_size. 4682 BSI - location where the new update stmt is to be placed. 4683 STMT - the original scalar memory-access stmt that is being vectorized. 4684 BUMP - optional. The offset by which to bump the pointer. If not given, 4685 the offset is assumed to be vector_size. 4686 4687 Output: Return NEW_DATAREF_PTR as illustrated above. 4688 4689 */ 4690 4691 tree 4692 bump_vector_ptr (tree dataref_ptr, gimple *ptr_incr, gimple_stmt_iterator *gsi, 4693 gimple *stmt, tree bump) 4694 { 4695 stmt_vec_info stmt_info = vinfo_for_stmt (stmt); 4696 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info); 4697 tree vectype = STMT_VINFO_VECTYPE (stmt_info); 4698 tree update = TYPE_SIZE_UNIT (vectype); 4699 gassign *incr_stmt; 4700 ssa_op_iter iter; 4701 use_operand_p use_p; 4702 tree new_dataref_ptr; 4703 4704 if (bump) 4705 update = bump; 4706 4707 if (TREE_CODE (dataref_ptr) == SSA_NAME) 4708 new_dataref_ptr = copy_ssa_name (dataref_ptr); 4709 else 4710 new_dataref_ptr = make_ssa_name (TREE_TYPE (dataref_ptr)); 4711 incr_stmt = gimple_build_assign (new_dataref_ptr, POINTER_PLUS_EXPR, 4712 dataref_ptr, update); 4713 vect_finish_stmt_generation (stmt, incr_stmt, gsi); 4714 4715 /* Copy the points-to information if it exists. */ 4716 if (DR_PTR_INFO (dr)) 4717 { 4718 duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr)); 4719 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr)); 4720 } 4721 4722 if (!ptr_incr) 4723 return new_dataref_ptr; 4724 4725 /* Update the vector-pointer's cross-iteration increment. */ 4726 FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE) 4727 { 4728 tree use = USE_FROM_PTR (use_p); 4729 4730 if (use == dataref_ptr) 4731 SET_USE (use_p, new_dataref_ptr); 4732 else 4733 gcc_assert (tree_int_cst_compare (use, update) == 0); 4734 } 4735 4736 return new_dataref_ptr; 4737 } 4738 4739 4740 /* Function vect_create_destination_var. 4741 4742 Create a new temporary of type VECTYPE. */ 4743 4744 tree 4745 vect_create_destination_var (tree scalar_dest, tree vectype) 4746 { 4747 tree vec_dest; 4748 const char *name; 4749 char *new_name; 4750 tree type; 4751 enum vect_var_kind kind; 4752 4753 kind = vectype 4754 ? VECTOR_BOOLEAN_TYPE_P (vectype) 4755 ? vect_mask_var 4756 : vect_simple_var 4757 : vect_scalar_var; 4758 type = vectype ? 
vectype : TREE_TYPE (scalar_dest); 4759 4760 gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME); 4761 4762 name = get_name (scalar_dest); 4763 if (name) 4764 new_name = xasprintf ("%s_%u", name, SSA_NAME_VERSION (scalar_dest)); 4765 else 4766 new_name = xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest)); 4767 vec_dest = vect_get_new_vect_var (type, kind, new_name); 4768 free (new_name); 4769 4770 return vec_dest; 4771 } 4772 4773 /* Function vect_grouped_store_supported. 4774 4775 Returns TRUE if interleave high and interleave low permutations 4776 are supported, and FALSE otherwise. */ 4777 4778 bool 4779 vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count) 4780 { 4781 machine_mode mode = TYPE_MODE (vectype); 4782 4783 /* vect_permute_store_chain requires the group size to be equal to 3 or 4784 be a power of two. */ 4785 if (count != 3 && exact_log2 (count) == -1) 4786 { 4787 if (dump_enabled_p ()) 4788 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 4789 "the size of the group of accesses" 4790 " is not a power of 2 or not equal to 3\n"); 4791 return false; 4792 } 4793 4794 /* Check that the permutation is supported. */ 4795 if (VECTOR_MODE_P (mode)) 4796 { 4797 unsigned int i, nelt = GET_MODE_NUNITS (mode); 4798 unsigned char *sel = XALLOCAVEC (unsigned char, nelt); 4799 4800 if (count == 3) 4801 { 4802 unsigned int j0 = 0, j1 = 0, j2 = 0; 4803 unsigned int i, j; 4804 4805 for (j = 0; j < 3; j++) 4806 { 4807 int nelt0 = ((3 - j) * nelt) % 3; 4808 int nelt1 = ((3 - j) * nelt + 1) % 3; 4809 int nelt2 = ((3 - j) * nelt + 2) % 3; 4810 for (i = 0; i < nelt; i++) 4811 { 4812 if (3 * i + nelt0 < nelt) 4813 sel[3 * i + nelt0] = j0++; 4814 if (3 * i + nelt1 < nelt) 4815 sel[3 * i + nelt1] = nelt + j1++; 4816 if (3 * i + nelt2 < nelt) 4817 sel[3 * i + nelt2] = 0; 4818 } 4819 if (!can_vec_perm_p (mode, false, sel)) 4820 { 4821 if (dump_enabled_p ()) 4822 dump_printf (MSG_MISSED_OPTIMIZATION, 4823 "permutation op not supported by target.\n"); 4824 return false; 4825 } 4826 4827 for (i = 0; i < nelt; i++) 4828 { 4829 if (3 * i + nelt0 < nelt) 4830 sel[3 * i + nelt0] = 3 * i + nelt0; 4831 if (3 * i + nelt1 < nelt) 4832 sel[3 * i + nelt1] = 3 * i + nelt1; 4833 if (3 * i + nelt2 < nelt) 4834 sel[3 * i + nelt2] = nelt + j2++; 4835 } 4836 if (!can_vec_perm_p (mode, false, sel)) 4837 { 4838 if (dump_enabled_p ()) 4839 dump_printf (MSG_MISSED_OPTIMIZATION, 4840 "permutation op not supported by target.\n"); 4841 return false; 4842 } 4843 } 4844 return true; 4845 } 4846 else 4847 { 4848 /* If length is not equal to 3 then only power of 2 is supported. */ 4849 gcc_assert (pow2p_hwi (count)); 4850 4851 for (i = 0; i < nelt / 2; i++) 4852 { 4853 sel[i * 2] = i; 4854 sel[i * 2 + 1] = i + nelt; 4855 } 4856 if (can_vec_perm_p (mode, false, sel)) 4857 { 4858 for (i = 0; i < nelt; i++) 4859 sel[i] += nelt / 2; 4860 if (can_vec_perm_p (mode, false, sel)) 4861 return true; 4862 } 4863 } 4864 } 4865 4866 if (dump_enabled_p ()) 4867 dump_printf (MSG_MISSED_OPTIMIZATION, 4868 "permutation op not supported by target.\n"); 4869 return false; 4870 } 4871 4872 4873 /* Return TRUE if vec_store_lanes is available for COUNT vectors of 4874 type VECTYPE. */ 4875 4876 bool 4877 vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count) 4878 { 4879 return vect_lanes_optab_supported_p ("vec_store_lanes", 4880 vec_store_lanes_optab, 4881 vectype, count); 4882 } 4883 4884 4885 /* Function vect_permute_store_chain.
4886 4887 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be 4888 a power of 2 or equal to 3, generate interleave_high/low stmts to reorder 4889 the data correctly for the stores. Return the final references for stores 4890 in RESULT_CHAIN. 4891 4892 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8. 4893 The input is 4 vectors each containing 8 elements. We assign a number to 4894 each element, the input sequence is: 4895 4896 1st vec: 0 1 2 3 4 5 6 7 4897 2nd vec: 8 9 10 11 12 13 14 15 4898 3rd vec: 16 17 18 19 20 21 22 23 4899 4th vec: 24 25 26 27 28 29 30 31 4900 4901 The output sequence should be: 4902 4903 1st vec: 0 8 16 24 1 9 17 25 4904 2nd vec: 2 10 18 26 3 11 19 27 4905 3rd vec: 4 12 20 28 5 13 21 29 4906 4th vec: 6 14 22 30 7 15 23 31 4907 4908 i.e., we interleave the contents of the four vectors in their order. 4909 4910 We use interleave_high/low instructions to create such output. The input of 4911 each interleave_high/low operation is two vectors: 4912 1st vec 2nd vec 4913 0 1 2 3 4 5 6 7 4914 the even elements of the result vector are obtained left-to-right from the 4915 high/low elements of the first vector. The odd elements of the result are 4916 obtained left-to-right from the high/low elements of the second vector. 4917 The output of interleave_high will be: 0 4 1 5 4918 and of interleave_low: 2 6 3 7 4919 4920 4921 The permutation is done in log LENGTH stages. In each stage interleave_high 4922 and interleave_low stmts are created for each pair of vectors in DR_CHAIN, 4923 where the first argument is taken from the first half of DR_CHAIN and the 4924 second argument from its second half. 4925 In our example, 4926 4927 I1: interleave_high (1st vec, 3rd vec) 4928 I2: interleave_low (1st vec, 3rd vec) 4929 I3: interleave_high (2nd vec, 4th vec) 4930 I4: interleave_low (2nd vec, 4th vec) 4931 4932 The output for the first stage is: 4933 4934 I1: 0 16 1 17 2 18 3 19 4935 I2: 4 20 5 21 6 22 7 23 4936 I3: 8 24 9 25 10 26 11 27 4937 I4: 12 28 13 29 14 30 15 31 4938 4939 The output of the second stage, i.e. the final result is: 4940 4941 I1: 0 8 16 24 1 9 17 25 4942 I2: 2 10 18 26 3 11 19 27 4943 I3: 4 12 20 28 5 13 21 29 4944 I4: 6 14 22 30 7 15 23 31.
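As a stand-alone sketch (plain C, not vectorizer code, names made up), the
   effect of one interleave_high/interleave_low pair on two input vectors of
   NELT elements can be modelled as:

     static void
     model_interleave (const int *v1, const int *v2, int nelt,
                       int *high, int *low)
     {
       int i;
       for (i = 0; i < nelt / 2; i++)
         {
           high[2 * i] = v1[i];
           high[2 * i + 1] = v2[i];
           low[2 * i] = v1[nelt / 2 + i];
           low[2 * i + 1] = v2[nelt / 2 + i];
         }
     }

   i.e. HIGH uses the selector {0, nelt, 1, nelt + 1, ...} and LOW the
   selector {nelt/2, nelt + nelt/2, nelt/2 + 1, ...}, matching the masks
   built below. Applying this log2(LENGTH) times (twice for the example
   above), each stage pairing the first half of the chain with the second
   half, reproduces the final sequence shown above.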
*/ 4945 4946 void 4947 vect_permute_store_chain (vec<tree> dr_chain, 4948 unsigned int length, 4949 gimple *stmt, 4950 gimple_stmt_iterator *gsi, 4951 vec<tree> *result_chain) 4952 { 4953 tree vect1, vect2, high, low; 4954 gimple *perm_stmt; 4955 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt)); 4956 tree perm_mask_low, perm_mask_high; 4957 tree data_ref; 4958 tree perm3_mask_low, perm3_mask_high; 4959 unsigned int i, n, log_length = exact_log2 (length); 4960 unsigned int j, nelt = TYPE_VECTOR_SUBPARTS (vectype); 4961 unsigned char *sel = XALLOCAVEC (unsigned char, nelt); 4962 4963 result_chain->quick_grow (length); 4964 memcpy (result_chain->address (), dr_chain.address (), 4965 length * sizeof (tree)); 4966 4967 if (length == 3) 4968 { 4969 unsigned int j0 = 0, j1 = 0, j2 = 0; 4970 4971 for (j = 0; j < 3; j++) 4972 { 4973 int nelt0 = ((3 - j) * nelt) % 3; 4974 int nelt1 = ((3 - j) * nelt + 1) % 3; 4975 int nelt2 = ((3 - j) * nelt + 2) % 3; 4976 4977 for (i = 0; i < nelt; i++) 4978 { 4979 if (3 * i + nelt0 < nelt) 4980 sel[3 * i + nelt0] = j0++; 4981 if (3 * i + nelt1 < nelt) 4982 sel[3 * i + nelt1] = nelt + j1++; 4983 if (3 * i + nelt2 < nelt) 4984 sel[3 * i + nelt2] = 0; 4985 } 4986 perm3_mask_low = vect_gen_perm_mask_checked (vectype, sel); 4987 4988 for (i = 0; i < nelt; i++) 4989 { 4990 if (3 * i + nelt0 < nelt) 4991 sel[3 * i + nelt0] = 3 * i + nelt0; 4992 if (3 * i + nelt1 < nelt) 4993 sel[3 * i + nelt1] = 3 * i + nelt1; 4994 if (3 * i + nelt2 < nelt) 4995 sel[3 * i + nelt2] = nelt + j2++; 4996 } 4997 perm3_mask_high = vect_gen_perm_mask_checked (vectype, sel); 4998 4999 vect1 = dr_chain[0]; 5000 vect2 = dr_chain[1]; 5001 5002 /* Create interleaving stmt: 5003 low = VEC_PERM_EXPR <vect1, vect2, 5004 {j, nelt, *, j + 1, nelt + j + 1, *, 5005 j + 2, nelt + j + 2, *, ...}> */ 5006 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low"); 5007 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1, 5008 vect2, perm3_mask_low); 5009 vect_finish_stmt_generation (stmt, perm_stmt, gsi); 5010 5011 vect1 = data_ref; 5012 vect2 = dr_chain[2]; 5013 /* Create interleaving stmt: 5014 low = VEC_PERM_EXPR <vect1, vect2, 5015 {0, 1, nelt + j, 3, 4, nelt + j + 1, 5016 6, 7, nelt + j + 2, ...}> */ 5017 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high"); 5018 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1, 5019 vect2, perm3_mask_high); 5020 vect_finish_stmt_generation (stmt, perm_stmt, gsi); 5021 (*result_chain)[j] = data_ref; 5022 } 5023 } 5024 else 5025 { 5026 /* If length is not equal to 3 then only power of 2 is supported. 
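For instance, assuming nelt == 8, the two masks computed below would be
     {0, 8, 1, 9, 2, 10, 3, 11} for perm_mask_high and
     {4, 12, 5, 13, 6, 14, 7, 15} for perm_mask_low.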
*/ 5027 gcc_assert (pow2p_hwi (length)); 5028 5029 for (i = 0, n = nelt / 2; i < n; i++) 5030 { 5031 sel[i * 2] = i; 5032 sel[i * 2 + 1] = i + nelt; 5033 } 5034 perm_mask_high = vect_gen_perm_mask_checked (vectype, sel); 5035 5036 for (i = 0; i < nelt; i++) 5037 sel[i] += nelt / 2; 5038 perm_mask_low = vect_gen_perm_mask_checked (vectype, sel); 5039 5040 for (i = 0, n = log_length; i < n; i++) 5041 { 5042 for (j = 0; j < length/2; j++) 5043 { 5044 vect1 = dr_chain[j]; 5045 vect2 = dr_chain[j+length/2]; 5046 5047 /* Create interleaving stmt: 5048 high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1, 5049 ...}> */ 5050 high = make_temp_ssa_name (vectype, NULL, "vect_inter_high"); 5051 perm_stmt = gimple_build_assign (high, VEC_PERM_EXPR, vect1, 5052 vect2, perm_mask_high); 5053 vect_finish_stmt_generation (stmt, perm_stmt, gsi); 5054 (*result_chain)[2*j] = high; 5055 5056 /* Create interleaving stmt: 5057 low = VEC_PERM_EXPR <vect1, vect2, 5058 {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1, 5059 ...}> */ 5060 low = make_temp_ssa_name (vectype, NULL, "vect_inter_low"); 5061 perm_stmt = gimple_build_assign (low, VEC_PERM_EXPR, vect1, 5062 vect2, perm_mask_low); 5063 vect_finish_stmt_generation (stmt, perm_stmt, gsi); 5064 (*result_chain)[2*j+1] = low; 5065 } 5066 memcpy (dr_chain.address (), result_chain->address (), 5067 length * sizeof (tree)); 5068 } 5069 } 5070 } 5071 5072 /* Function vect_setup_realignment 5073 5074 This function is called when vectorizing an unaligned load using 5075 the dr_explicit_realign[_optimized] scheme. 5076 This function generates the following code at the loop prolog: 5077 5078 p = initial_addr; 5079 x msq_init = *(floor(p)); # prolog load 5080 realignment_token = call target_builtin; 5081 loop: 5082 x msq = phi (msq_init, ---) 5083 5084 The stmts marked with x are generated only for the case of 5085 dr_explicit_realign_optimized. 5086 5087 The code above sets up a new (vector) pointer, pointing to the first 5088 location accessed by STMT, and a "floor-aligned" load using that pointer. 5089 It also generates code to compute the "realignment-token" (if the relevant 5090 target hook was defined), and creates a phi-node at the loop-header bb 5091 whose arguments are the result of the prolog-load (created by this 5092 function) and the result of a load that takes place in the loop (to be 5093 created by the caller to this function). 5094 5095 For the case of dr_explicit_realign_optimized: 5096 The caller to this function uses the phi-result (msq) to create the 5097 realignment code inside the loop, and sets up the missing phi argument, 5098 as follows: 5099 loop: 5100 msq = phi (msq_init, lsq) 5101 lsq = *(floor(p')); # load in loop 5102 result = realign_load (msq, lsq, realignment_token); 5103 5104 For the case of dr_explicit_realign: 5105 loop: 5106 msq = *(floor(p)); # load in loop 5107 p' = p + (VS-1); 5108 lsq = *(floor(p')); # load in loop 5109 result = realign_load (msq, lsq, realignment_token); 5110 5111 Input: 5112 STMT - (scalar) load stmt to be vectorized. This load accesses 5113 a memory location that may be unaligned. 5114 BSI - place where new code is to be inserted. 5115 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes 5116 is used. 5117 5118 Output: 5119 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load 5120 target hook, if defined. 5121 Return value - the result of the loop-header phi node. 
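As an informal example (assuming 16-byte vectors; the numbers are only
   illustrative): if p is misaligned by 4 bytes, then floor(p), i.e. p rounded
   down to a vector boundary, yields the aligned vector containing the first
   wanted element, floor(p + VS - 1) yields the following aligned vector, and
   realign_load shifts the concatenation of the two by the 4-byte misalignment
   (encoded in REALIGNMENT_TOKEN) to produce the desired unaligned vector.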
*/ 5122 5123 tree 5124 vect_setup_realignment (gimple *stmt, gimple_stmt_iterator *gsi, 5125 tree *realignment_token, 5126 enum dr_alignment_support alignment_support_scheme, 5127 tree init_addr, 5128 struct loop **at_loop) 5129 { 5130 stmt_vec_info stmt_info = vinfo_for_stmt (stmt); 5131 tree vectype = STMT_VINFO_VECTYPE (stmt_info); 5132 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); 5133 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info); 5134 struct loop *loop = NULL; 5135 edge pe = NULL; 5136 tree scalar_dest = gimple_assign_lhs (stmt); 5137 tree vec_dest; 5138 gimple *inc; 5139 tree ptr; 5140 tree data_ref; 5141 basic_block new_bb; 5142 tree msq_init = NULL_TREE; 5143 tree new_temp; 5144 gphi *phi_stmt; 5145 tree msq = NULL_TREE; 5146 gimple_seq stmts = NULL; 5147 bool inv_p; 5148 bool compute_in_loop = false; 5149 bool nested_in_vect_loop = false; 5150 struct loop *containing_loop = (gimple_bb (stmt))->loop_father; 5151 struct loop *loop_for_initial_load = NULL; 5152 5153 if (loop_vinfo) 5154 { 5155 loop = LOOP_VINFO_LOOP (loop_vinfo); 5156 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt); 5157 } 5158 5159 gcc_assert (alignment_support_scheme == dr_explicit_realign 5160 || alignment_support_scheme == dr_explicit_realign_optimized); 5161 5162 /* We need to generate three things: 5163 1. the misalignment computation 5164 2. the extra vector load (for the optimized realignment scheme). 5165 3. the phi node for the two vectors from which the realignment is 5166 done (for the optimized realignment scheme). */ 5167 5168 /* 1. Determine where to generate the misalignment computation. 5169 5170 If INIT_ADDR is NULL_TREE, this indicates that the misalignment 5171 calculation will be generated by this function, outside the loop (in the 5172 preheader). Otherwise, INIT_ADDR had already been computed for us by the 5173 caller, inside the loop. 5174 5175 Background: If the misalignment remains fixed throughout the iterations of 5176 the loop, then both realignment schemes are applicable, and also the 5177 misalignment computation can be done outside LOOP. This is because we are 5178 vectorizing LOOP, and so the memory accesses in LOOP advance in steps that 5179 are a multiple of VS (the Vector Size), and therefore the misalignment in 5180 different vectorized LOOP iterations is always the same. 5181 The problem arises only if the memory access is in an inner-loop nested 5182 inside LOOP, which is now being vectorized using outer-loop vectorization. 5183 This is the only case when the misalignment of the memory access may not 5184 remain fixed throughout the iterations of the inner-loop (as explained in 5185 detail in vect_supportable_dr_alignment). In this case, not only is the 5186 optimized realignment scheme not applicable, but also the misalignment 5187 computation (and generation of the realignment token that is passed to 5188 REALIGN_LOAD) have to be done inside the loop. 5189 5190 In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode 5191 or not, which in turn determines if the misalignment is computed inside 5192 the inner-loop, or outside LOOP. */ 5193 5194 if (init_addr != NULL_TREE || !loop_vinfo) 5195 { 5196 compute_in_loop = true; 5197 gcc_assert (alignment_support_scheme == dr_explicit_realign); 5198 } 5199 5200 5201 /* 2. Determine where to generate the extra vector load. 
5202 5203 For the optimized realignment scheme, instead of generating two vector 5204 loads in each iteration, we generate a single extra vector load in the 5205 preheader of the loop, and in each iteration reuse the result of the 5206 vector load from the previous iteration. In case the memory access is in 5207 an inner-loop nested inside LOOP, which is now being vectorized using 5208 outer-loop vectorization, we need to determine whether this initial vector 5209 load should be generated at the preheader of the inner-loop, or can be 5210 generated at the preheader of LOOP. If the memory access has no evolution 5211 in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has 5212 to be generated inside LOOP (in the preheader of the inner-loop). */ 5213 5214 if (nested_in_vect_loop) 5215 { 5216 tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info); 5217 bool invariant_in_outerloop = 5218 (tree_int_cst_compare (outerloop_step, size_zero_node) == 0); 5219 loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner); 5220 } 5221 else 5222 loop_for_initial_load = loop; 5223 if (at_loop) 5224 *at_loop = loop_for_initial_load; 5225 5226 if (loop_for_initial_load) 5227 pe = loop_preheader_edge (loop_for_initial_load); 5228 5229 /* 3. For the case of the optimized realignment, create the first vector 5230 load at the loop preheader. */ 5231 5232 if (alignment_support_scheme == dr_explicit_realign_optimized) 5233 { 5234 /* Create msq_init = *(floor(p1)) in the loop preheader */ 5235 gassign *new_stmt; 5236 5237 gcc_assert (!compute_in_loop); 5238 vec_dest = vect_create_destination_var (scalar_dest, vectype); 5239 ptr = vect_create_data_ref_ptr (stmt, vectype, loop_for_initial_load, 5240 NULL_TREE, &init_addr, NULL, &inc, 5241 true, &inv_p); 5242 if (TREE_CODE (ptr) == SSA_NAME) 5243 new_temp = copy_ssa_name (ptr); 5244 else 5245 new_temp = make_ssa_name (TREE_TYPE (ptr)); 5246 new_stmt = gimple_build_assign 5247 (new_temp, BIT_AND_EXPR, ptr, 5248 build_int_cst (TREE_TYPE (ptr), 5249 -(HOST_WIDE_INT)TYPE_ALIGN_UNIT (vectype))); 5250 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt); 5251 gcc_assert (!new_bb); 5252 data_ref 5253 = build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp, 5254 build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0)); 5255 new_stmt = gimple_build_assign (vec_dest, data_ref); 5256 new_temp = make_ssa_name (vec_dest, new_stmt); 5257 gimple_assign_set_lhs (new_stmt, new_temp); 5258 if (pe) 5259 { 5260 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt); 5261 gcc_assert (!new_bb); 5262 } 5263 else 5264 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT); 5265 5266 msq_init = gimple_assign_lhs (new_stmt); 5267 } 5268 5269 /* 4. Create realignment token using a target builtin, if available. 5270 It is done either inside the containing loop, or before LOOP (as 5271 determined above). */ 5272 5273 if (targetm.vectorize.builtin_mask_for_load) 5274 { 5275 gcall *new_stmt; 5276 tree builtin_decl; 5277 5278 /* Compute INIT_ADDR - the initial addressed accessed by this memref. */ 5279 if (!init_addr) 5280 { 5281 /* Generate the INIT_ADDR computation outside LOOP. 
*/ 5282 init_addr = vect_create_addr_base_for_vector_ref (stmt, &stmts, 5283 NULL_TREE, loop); 5284 if (loop) 5285 { 5286 pe = loop_preheader_edge (loop); 5287 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts); 5288 gcc_assert (!new_bb); 5289 } 5290 else 5291 gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT); 5292 } 5293 5294 builtin_decl = targetm.vectorize.builtin_mask_for_load (); 5295 new_stmt = gimple_build_call (builtin_decl, 1, init_addr); 5296 vec_dest = 5297 vect_create_destination_var (scalar_dest, 5298 gimple_call_return_type (new_stmt)); 5299 new_temp = make_ssa_name (vec_dest, new_stmt); 5300 gimple_call_set_lhs (new_stmt, new_temp); 5301 5302 if (compute_in_loop) 5303 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT); 5304 else 5305 { 5306 /* Generate the misalignment computation outside LOOP. */ 5307 pe = loop_preheader_edge (loop); 5308 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt); 5309 gcc_assert (!new_bb); 5310 } 5311 5312 *realignment_token = gimple_call_lhs (new_stmt); 5313 5314 /* The result of the CALL_EXPR to this builtin is determined from 5315 the value of the parameter and no global variables are touched 5316 which makes the builtin a "const" function. Requiring the 5317 builtin to have the "const" attribute makes it unnecessary 5318 to call mark_call_clobbered. */ 5319 gcc_assert (TREE_READONLY (builtin_decl)); 5320 } 5321 5322 if (alignment_support_scheme == dr_explicit_realign) 5323 return msq; 5324 5325 gcc_assert (!compute_in_loop); 5326 gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized); 5327 5328 5329 /* 5. Create msq = phi <msq_init, lsq> in loop */ 5330 5331 pe = loop_preheader_edge (containing_loop); 5332 vec_dest = vect_create_destination_var (scalar_dest, vectype); 5333 msq = make_ssa_name (vec_dest); 5334 phi_stmt = create_phi_node (msq, containing_loop->header); 5335 add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION); 5336 5337 return msq; 5338 } 5339 5340 5341 /* Function vect_grouped_load_supported. 5342 5343 COUNT is the size of the load group (the number of statements plus the 5344 number of gaps). SINGLE_ELEMENT_P is true if there is actually 5345 only one statement, with a gap of COUNT - 1. 5346 5347 Returns true if a suitable permute exists. */ 5348 5349 bool 5350 vect_grouped_load_supported (tree vectype, bool single_element_p, 5351 unsigned HOST_WIDE_INT count) 5352 { 5353 machine_mode mode = TYPE_MODE (vectype); 5354 5355 /* If this is single-element interleaving with an element distance 5356 that leaves unused vector loads around punt - we at least create 5357 very sub-optimal code in that case (and blow up memory, 5358 see PR65518). */ 5359 if (single_element_p && count > TYPE_VECTOR_SUBPARTS (vectype)) 5360 { 5361 if (dump_enabled_p ()) 5362 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 5363 "single-element interleaving not supported " 5364 "for not adjacent vector loads\n"); 5365 return false; 5366 } 5367 5368 /* vect_permute_load_chain requires the group size to be equal to 3 or 5369 be a power of two. */ 5370 if (count != 3 && exact_log2 (count) == -1) 5371 { 5372 if (dump_enabled_p ()) 5373 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 5374 "the size of the group of accesses" 5375 " is not a power of 2 or not equal to 3\n"); 5376 return false; 5377 } 5378 5379 /* Check that the permutation is supported. 
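     As a worked illustration of the two selectors tried below for
     COUNT == 3 (an editor's example using NELT == 4 and k == 0, not
     something the code itself relies on):

       first permute, applied to <vec0, vec1>:   sel = {0, 3, 6, 0}
	 gathers lanes 0, 3 and 6 - every third scalar held by the first
	 two vectors; the last lane is a don't-care.
       second permute, applied to <tmp, vec2>:   sel = {0, 1, 2, 5}
	 keeps those lanes and adds lane 1 of vec2 (element 9), so the
	 result is {0, 3, 6, 9}, i.e. lane k of every group of three.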
*/ 5380 if (VECTOR_MODE_P (mode)) 5381 { 5382 unsigned int i, j, nelt = GET_MODE_NUNITS (mode); 5383 unsigned char *sel = XALLOCAVEC (unsigned char, nelt); 5384 5385 if (count == 3) 5386 { 5387 unsigned int k; 5388 for (k = 0; k < 3; k++) 5389 { 5390 for (i = 0; i < nelt; i++) 5391 if (3 * i + k < 2 * nelt) 5392 sel[i] = 3 * i + k; 5393 else 5394 sel[i] = 0; 5395 if (!can_vec_perm_p (mode, false, sel)) 5396 { 5397 if (dump_enabled_p ()) 5398 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 5399 "shuffle of 3 loads is not supported by" 5400 " target\n"); 5401 return false; 5402 } 5403 for (i = 0, j = 0; i < nelt; i++) 5404 if (3 * i + k < 2 * nelt) 5405 sel[i] = i; 5406 else 5407 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++); 5408 if (!can_vec_perm_p (mode, false, sel)) 5409 { 5410 if (dump_enabled_p ()) 5411 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 5412 "shuffle of 3 loads is not supported by" 5413 " target\n"); 5414 return false; 5415 } 5416 } 5417 return true; 5418 } 5419 else 5420 { 5421 /* If length is not equal to 3 then only power of 2 is supported. */ 5422 gcc_assert (pow2p_hwi (count)); 5423 for (i = 0; i < nelt; i++) 5424 sel[i] = i * 2; 5425 if (can_vec_perm_p (mode, false, sel)) 5426 { 5427 for (i = 0; i < nelt; i++) 5428 sel[i] = i * 2 + 1; 5429 if (can_vec_perm_p (mode, false, sel)) 5430 return true; 5431 } 5432 } 5433 } 5434 5435 if (dump_enabled_p ()) 5436 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 5437 "extract even/odd not supported by target\n"); 5438 return false; 5439 } 5440 5441 /* Return TRUE if vec_load_lanes is available for COUNT vectors of 5442 type VECTYPE. */ 5443 5444 bool 5445 vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count) 5446 { 5447 return vect_lanes_optab_supported_p ("vec_load_lanes", 5448 vec_load_lanes_optab, 5449 vectype, count); 5450 } 5451 5452 /* Function vect_permute_load_chain. 5453 5454 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be 5455 a power of 2 or equal to 3, generate extract_even/odd stmts to reorder 5456 the input data correctly. Return the final references for loads in 5457 RESULT_CHAIN. 5458 5459 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8. 5460 The input is 4 vectors each containing 8 elements. We assign a number to each 5461 element, the input sequence is: 5462 5463 1st vec: 0 1 2 3 4 5 6 7 5464 2nd vec: 8 9 10 11 12 13 14 15 5465 3rd vec: 16 17 18 19 20 21 22 23 5466 4th vec: 24 25 26 27 28 29 30 31 5467 5468 The output sequence should be: 5469 5470 1st vec: 0 4 8 12 16 20 24 28 5471 2nd vec: 1 5 9 13 17 21 25 29 5472 3rd vec: 2 6 10 14 18 22 26 30 5473 4th vec: 3 7 11 15 19 23 27 31 5474 5475 i.e., the first output vector should contain the first elements of each 5476 interleaving group, etc. 5477 5478 We use extract_even/odd instructions to create such output. The input of 5479 each extract_even/odd operation is two vectors 5480 1st vec 2nd vec 5481 0 1 2 3 4 5 6 7 5482 5483 and the output is the vector of extracted even/odd elements. The output of 5484 extract_even will be: 0 2 4 6 5485 and of extract_odd: 1 3 5 7 5486 5487 5488 The permutation is done in log LENGTH stages. In each stage extract_even 5489 and extract_odd stmts are created for each pair of vectors in DR_CHAIN in 5490 their order. 
In our example, 5491 5492 E1: extract_even (1st vec, 2nd vec) 5493 E2: extract_odd (1st vec, 2nd vec) 5494 E3: extract_even (3rd vec, 4th vec) 5495 E4: extract_odd (3rd vec, 4th vec) 5496 5497 The output for the first stage will be: 5498 5499 E1: 0 2 4 6 8 10 12 14 5500 E2: 1 3 5 7 9 11 13 15 5501 E3: 16 18 20 22 24 26 28 30 5502 E4: 17 19 21 23 25 27 29 31 5503 5504 In order to proceed and create the correct sequence for the next stage (or 5505 for the correct output, if the second stage is the last one, as in our 5506 example), we first put the output of extract_even operation and then the 5507 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN). 5508 The input for the second stage is: 5509 5510 1st vec (E1): 0 2 4 6 8 10 12 14 5511 2nd vec (E3): 16 18 20 22 24 26 28 30 5512 3rd vec (E2): 1 3 5 7 9 11 13 15 5513 4th vec (E4): 17 19 21 23 25 27 29 31 5514 5515 The output of the second stage: 5516 5517 E1: 0 4 8 12 16 20 24 28 5518 E2: 2 6 10 14 18 22 26 30 5519 E3: 1 5 9 13 17 21 25 29 5520 E4: 3 7 11 15 19 23 27 31 5521 5522 And RESULT_CHAIN after reordering: 5523 5524 1st vec (E1): 0 4 8 12 16 20 24 28 5525 2nd vec (E3): 1 5 9 13 17 21 25 29 5526 3rd vec (E2): 2 6 10 14 18 22 26 30 5527 4th vec (E4): 3 7 11 15 19 23 27 31. */ 5528 5529 static void 5530 vect_permute_load_chain (vec<tree> dr_chain, 5531 unsigned int length, 5532 gimple *stmt, 5533 gimple_stmt_iterator *gsi, 5534 vec<tree> *result_chain) 5535 { 5536 tree data_ref, first_vect, second_vect; 5537 tree perm_mask_even, perm_mask_odd; 5538 tree perm3_mask_low, perm3_mask_high; 5539 gimple *perm_stmt; 5540 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt)); 5541 unsigned int i, j, log_length = exact_log2 (length); 5542 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype); 5543 unsigned char *sel = XALLOCAVEC (unsigned char, nelt); 5544 5545 result_chain->quick_grow (length); 5546 memcpy (result_chain->address (), dr_chain.address (), 5547 length * sizeof (tree)); 5548 5549 if (length == 3) 5550 { 5551 unsigned int k; 5552 5553 for (k = 0; k < 3; k++) 5554 { 5555 for (i = 0; i < nelt; i++) 5556 if (3 * i + k < 2 * nelt) 5557 sel[i] = 3 * i + k; 5558 else 5559 sel[i] = 0; 5560 perm3_mask_low = vect_gen_perm_mask_checked (vectype, sel); 5561 5562 for (i = 0, j = 0; i < nelt; i++) 5563 if (3 * i + k < 2 * nelt) 5564 sel[i] = i; 5565 else 5566 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++); 5567 5568 perm3_mask_high = vect_gen_perm_mask_checked (vectype, sel); 5569 5570 first_vect = dr_chain[0]; 5571 second_vect = dr_chain[1]; 5572 5573 /* Create interleaving stmt (low part of): 5574 low = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k, 5575 ...}> */ 5576 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low"); 5577 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect, 5578 second_vect, perm3_mask_low); 5579 vect_finish_stmt_generation (stmt, perm_stmt, gsi); 5580 5581 /* Create interleaving stmt (high part of): 5582 high = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k, 5583 ...}> */ 5584 first_vect = data_ref; 5585 second_vect = dr_chain[2]; 5586 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high"); 5587 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect, 5588 second_vect, perm3_mask_high); 5589 vect_finish_stmt_generation (stmt, perm_stmt, gsi); 5590 (*result_chain)[k] = data_ref; 5591 } 5592 } 5593 else 5594 { 5595 /* If length is not equal to 3 then only power of 2 is supported. 
	 */
      gcc_assert (pow2p_hwi (length));

      for (i = 0; i < nelt; ++i)
	sel[i] = i * 2;
      perm_mask_even = vect_gen_perm_mask_checked (vectype, sel);

      for (i = 0; i < nelt; ++i)
	sel[i] = i * 2 + 1;
      perm_mask_odd = vect_gen_perm_mask_checked (vectype, sel);

      for (i = 0; i < log_length; i++)
	{
	  for (j = 0; j < length; j += 2)
	    {
	      first_vect = dr_chain[j];
	      second_vect = dr_chain[j+1];

	      /* data_ref = permute_even (first_data_ref, second_data_ref);  */
	      data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
	      perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
					       first_vect, second_vect,
					       perm_mask_even);
	      vect_finish_stmt_generation (stmt, perm_stmt, gsi);
	      (*result_chain)[j/2] = data_ref;

	      /* data_ref = permute_odd (first_data_ref, second_data_ref);  */
	      data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
	      perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
					       first_vect, second_vect,
					       perm_mask_odd);
	      vect_finish_stmt_generation (stmt, perm_stmt, gsi);
	      (*result_chain)[j/2+length/2] = data_ref;
	    }
	  memcpy (dr_chain.address (), result_chain->address (),
		  length * sizeof (tree));
	}
    }
}

/* Function vect_shift_permute_load_chain.

   Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate a
   sequence of stmts to reorder the input data accordingly.
   Return the final references for loads in RESULT_CHAIN.
   Return true if it succeeded, false otherwise.

   E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
   The input is 3 vectors each containing 8 elements.  We assign a
   number to each element, the input sequence is:

   1st vec:   0  1  2  3  4  5  6  7
   2nd vec:   8  9 10 11 12 13 14 15
   3rd vec:  16 17 18 19 20 21 22 23

   The output sequence should be:

   1st vec:  0 3 6  9 12 15 18 21
   2nd vec:  1 4 7 10 13 16 19 22
   3rd vec:  2 5 8 11 14 17 20 23

   We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.

   First we shuffle all 3 vectors to get the elements in the correct order:

   1st vec:  ( 0 3 6) ( 1 4 7) ( 2 5)
   2nd vec:  ( 8 11 14) ( 9 12 15) (10 13)
   3rd vec:  (16 19 22) (17 20 23) (18 21)

   Next we concatenate and shift the vectors 3 times:

   1st step:
     shift right by 6 the concatenation of:
     "1st vec" and "2nd vec"
       ( 0 3 6) ( 1 4 7) |( 2 5) _ ( 8 11 14) ( 9 12 15)| (10 13)
     "2nd vec" and "3rd vec"
       ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
     "3rd vec" and "1st vec"
       (16 19 22) (17 20 23) |(18 21) _ ( 0 3 6) ( 1 4 7)| ( 2 5)
			     |        New vectors          |

   So now the new vectors are:

   1st vec:  ( 2 5) ( 8 11 14) ( 9 12 15)
   2nd vec:  (10 13) (16 19 22) (17 20 23)
   3rd vec:  (18 21) ( 0 3 6) ( 1 4 7)

   2nd step:
     shift right by 5 the concatenation of:
     "1st vec" and "3rd vec"
       ( 2 5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0 3 6)| ( 1 4 7)
     "2nd vec" and "1st vec"
       (10 13) (16 19 22) |(17 20 23) _ ( 2 5) ( 8 11 14)| ( 9 12 15)
     "3rd vec" and "2nd vec"
       (18 21) ( 0 3 6) |( 1 4 7) _ (10 13) (16 19 22)| (17 20 23)
			|        New vectors          |

   So now the new vectors are:

   1st vec:  ( 9 12 15) (18 21) ( 0 3 6)
   2nd vec:  (17 20 23) ( 2 5) ( 8 11 14)
   3rd vec:  ( 1 4 7) (10 13) (16 19 22)   READY

   3rd step:
     shift right by 5 the concatenation of:
     "1st vec" and "1st vec"
       ( 9 12 15) (18 21) |( 0 3 6) _ ( 9 12 15) (18 21)| ( 0 3 6)
     shift right by 3 the concatenation of:
     "2nd vec" and "2nd vec"
	       (17 20 23) |( 2 5) ( 8 11 14) _ (17 20 23)| ( 2 5) ( 8 11 14)
			  |         New vectors          |

   So now all vectors are READY:

   1st vec:  ( 0 3 6) ( 9 12 15) (18 21)
   2nd vec:  ( 2 5) ( 8 11 14) (17 20 23)
   3rd vec:  ( 1 4 7) (10 13) (16 19 22)

   This algorithm is faster than the one in vect_permute_load_chain if:
   1. "shift of a concatenation" is faster than a general permutation.
      This is usually so.
   2. The TARGET machine can't execute vector instructions in parallel.
      This is because each step of the algorithm depends on the previous
      one.  The algorithm in vect_permute_load_chain is much more parallel.

   The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.
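
   As a quick cross-check of what the LENGTH == 3 case below computes
   (an editor's sketch in scalar terms, not code used by GCC), the
   de-interleave being implemented is simply:

     // in[0 .. 3*nelt-1] holds the three input vectors back to back,
     // out[k][i] receives lane i of the k-th output vector.
     for (unsigned k = 0; k < 3; k++)
       for (unsigned i = 0; i < nelt; i++)
	 out[k][i] = in[3 * i + k];

   i.e. output vector k gathers every third scalar starting at element k;
   the shuffles and shifts above are just a way to express these gathers
   with the permutations the target actually supports.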
5720 */ 5721 5722 static bool 5723 vect_shift_permute_load_chain (vec<tree> dr_chain, 5724 unsigned int length, 5725 gimple *stmt, 5726 gimple_stmt_iterator *gsi, 5727 vec<tree> *result_chain) 5728 { 5729 tree vect[3], vect_shift[3], data_ref, first_vect, second_vect; 5730 tree perm2_mask1, perm2_mask2, perm3_mask; 5731 tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask; 5732 gimple *perm_stmt; 5733 5734 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt)); 5735 unsigned int i; 5736 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype); 5737 unsigned char *sel = XALLOCAVEC (unsigned char, nelt); 5738 stmt_vec_info stmt_info = vinfo_for_stmt (stmt); 5739 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); 5740 5741 result_chain->quick_grow (length); 5742 memcpy (result_chain->address (), dr_chain.address (), 5743 length * sizeof (tree)); 5744 5745 if (pow2p_hwi (length) && LOOP_VINFO_VECT_FACTOR (loop_vinfo) > 4) 5746 { 5747 unsigned int j, log_length = exact_log2 (length); 5748 for (i = 0; i < nelt / 2; ++i) 5749 sel[i] = i * 2; 5750 for (i = 0; i < nelt / 2; ++i) 5751 sel[nelt / 2 + i] = i * 2 + 1; 5752 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel)) 5753 { 5754 if (dump_enabled_p ()) 5755 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 5756 "shuffle of 2 fields structure is not \ 5757 supported by target\n"); 5758 return false; 5759 } 5760 perm2_mask1 = vect_gen_perm_mask_checked (vectype, sel); 5761 5762 for (i = 0; i < nelt / 2; ++i) 5763 sel[i] = i * 2 + 1; 5764 for (i = 0; i < nelt / 2; ++i) 5765 sel[nelt / 2 + i] = i * 2; 5766 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel)) 5767 { 5768 if (dump_enabled_p ()) 5769 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 5770 "shuffle of 2 fields structure is not \ 5771 supported by target\n"); 5772 return false; 5773 } 5774 perm2_mask2 = vect_gen_perm_mask_checked (vectype, sel); 5775 5776 /* Generating permutation constant to shift all elements. 5777 For vector length 8 it is {4 5 6 7 8 9 10 11}. */ 5778 for (i = 0; i < nelt; i++) 5779 sel[i] = nelt / 2 + i; 5780 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel)) 5781 { 5782 if (dump_enabled_p ()) 5783 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 5784 "shift permutation is not supported by target\n"); 5785 return false; 5786 } 5787 shift1_mask = vect_gen_perm_mask_checked (vectype, sel); 5788 5789 /* Generating permutation constant to select vector from 2. 5790 For vector length 8 it is {0 1 2 3 12 13 14 15}. 
*/ 5791 for (i = 0; i < nelt / 2; i++) 5792 sel[i] = i; 5793 for (i = nelt / 2; i < nelt; i++) 5794 sel[i] = nelt + i; 5795 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel)) 5796 { 5797 if (dump_enabled_p ()) 5798 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 5799 "select is not supported by target\n"); 5800 return false; 5801 } 5802 select_mask = vect_gen_perm_mask_checked (vectype, sel); 5803 5804 for (i = 0; i < log_length; i++) 5805 { 5806 for (j = 0; j < length; j += 2) 5807 { 5808 first_vect = dr_chain[j]; 5809 second_vect = dr_chain[j + 1]; 5810 5811 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2"); 5812 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, 5813 first_vect, first_vect, 5814 perm2_mask1); 5815 vect_finish_stmt_generation (stmt, perm_stmt, gsi); 5816 vect[0] = data_ref; 5817 5818 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2"); 5819 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, 5820 second_vect, second_vect, 5821 perm2_mask2); 5822 vect_finish_stmt_generation (stmt, perm_stmt, gsi); 5823 vect[1] = data_ref; 5824 5825 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift"); 5826 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, 5827 vect[0], vect[1], shift1_mask); 5828 vect_finish_stmt_generation (stmt, perm_stmt, gsi); 5829 (*result_chain)[j/2 + length/2] = data_ref; 5830 5831 data_ref = make_temp_ssa_name (vectype, NULL, "vect_select"); 5832 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, 5833 vect[0], vect[1], select_mask); 5834 vect_finish_stmt_generation (stmt, perm_stmt, gsi); 5835 (*result_chain)[j/2] = data_ref; 5836 } 5837 memcpy (dr_chain.address (), result_chain->address (), 5838 length * sizeof (tree)); 5839 } 5840 return true; 5841 } 5842 if (length == 3 && LOOP_VINFO_VECT_FACTOR (loop_vinfo) > 2) 5843 { 5844 unsigned int k = 0, l = 0; 5845 5846 /* Generating permutation constant to get all elements in rigth order. 5847 For vector length 8 it is {0 3 6 1 4 7 2 5}. */ 5848 for (i = 0; i < nelt; i++) 5849 { 5850 if (3 * k + (l % 3) >= nelt) 5851 { 5852 k = 0; 5853 l += (3 - (nelt % 3)); 5854 } 5855 sel[i] = 3 * k + (l % 3); 5856 k++; 5857 } 5858 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel)) 5859 { 5860 if (dump_enabled_p ()) 5861 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 5862 "shuffle of 3 fields structure is not \ 5863 supported by target\n"); 5864 return false; 5865 } 5866 perm3_mask = vect_gen_perm_mask_checked (vectype, sel); 5867 5868 /* Generating permutation constant to shift all elements. 5869 For vector length 8 it is {6 7 8 9 10 11 12 13}. */ 5870 for (i = 0; i < nelt; i++) 5871 sel[i] = 2 * (nelt / 3) + (nelt % 3) + i; 5872 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel)) 5873 { 5874 if (dump_enabled_p ()) 5875 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 5876 "shift permutation is not supported by target\n"); 5877 return false; 5878 } 5879 shift1_mask = vect_gen_perm_mask_checked (vectype, sel); 5880 5881 /* Generating permutation constant to shift all elements. 5882 For vector length 8 it is {5 6 7 8 9 10 11 12}. 
*/ 5883 for (i = 0; i < nelt; i++) 5884 sel[i] = 2 * (nelt / 3) + 1 + i; 5885 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel)) 5886 { 5887 if (dump_enabled_p ()) 5888 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 5889 "shift permutation is not supported by target\n"); 5890 return false; 5891 } 5892 shift2_mask = vect_gen_perm_mask_checked (vectype, sel); 5893 5894 /* Generating permutation constant to shift all elements. 5895 For vector length 8 it is {3 4 5 6 7 8 9 10}. */ 5896 for (i = 0; i < nelt; i++) 5897 sel[i] = (nelt / 3) + (nelt % 3) / 2 + i; 5898 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel)) 5899 { 5900 if (dump_enabled_p ()) 5901 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 5902 "shift permutation is not supported by target\n"); 5903 return false; 5904 } 5905 shift3_mask = vect_gen_perm_mask_checked (vectype, sel); 5906 5907 /* Generating permutation constant to shift all elements. 5908 For vector length 8 it is {5 6 7 8 9 10 11 12}. */ 5909 for (i = 0; i < nelt; i++) 5910 sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i; 5911 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel)) 5912 { 5913 if (dump_enabled_p ()) 5914 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 5915 "shift permutation is not supported by target\n"); 5916 return false; 5917 } 5918 shift4_mask = vect_gen_perm_mask_checked (vectype, sel); 5919 5920 for (k = 0; k < 3; k++) 5921 { 5922 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3"); 5923 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, 5924 dr_chain[k], dr_chain[k], 5925 perm3_mask); 5926 vect_finish_stmt_generation (stmt, perm_stmt, gsi); 5927 vect[k] = data_ref; 5928 } 5929 5930 for (k = 0; k < 3; k++) 5931 { 5932 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1"); 5933 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, 5934 vect[k % 3], vect[(k + 1) % 3], 5935 shift1_mask); 5936 vect_finish_stmt_generation (stmt, perm_stmt, gsi); 5937 vect_shift[k] = data_ref; 5938 } 5939 5940 for (k = 0; k < 3; k++) 5941 { 5942 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2"); 5943 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, 5944 vect_shift[(4 - k) % 3], 5945 vect_shift[(3 - k) % 3], 5946 shift2_mask); 5947 vect_finish_stmt_generation (stmt, perm_stmt, gsi); 5948 vect[k] = data_ref; 5949 } 5950 5951 (*result_chain)[3 - (nelt % 3)] = vect[2]; 5952 5953 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3"); 5954 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[0], 5955 vect[0], shift3_mask); 5956 vect_finish_stmt_generation (stmt, perm_stmt, gsi); 5957 (*result_chain)[nelt % 3] = data_ref; 5958 5959 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4"); 5960 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[1], 5961 vect[1], shift4_mask); 5962 vect_finish_stmt_generation (stmt, perm_stmt, gsi); 5963 (*result_chain)[0] = data_ref; 5964 return true; 5965 } 5966 return false; 5967 } 5968 5969 /* Function vect_transform_grouped_load. 5970 5971 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements 5972 to perform their permutation and ascribe the result vectorized statements to 5973 the scalar statements. 5974 */ 5975 5976 void 5977 vect_transform_grouped_load (gimple *stmt, vec<tree> dr_chain, int size, 5978 gimple_stmt_iterator *gsi) 5979 { 5980 machine_mode mode; 5981 vec<tree> result_chain = vNULL; 5982 5983 /* DR_CHAIN contains input data-refs that are a part of the interleaving. 
5984 RESULT_CHAIN is the output of vect_permute_load_chain, it contains permuted 5985 vectors, that are ready for vector computation. */ 5986 result_chain.create (size); 5987 5988 /* If reassociation width for vector type is 2 or greater target machine can 5989 execute 2 or more vector instructions in parallel. Otherwise try to 5990 get chain for loads group using vect_shift_permute_load_chain. */ 5991 mode = TYPE_MODE (STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt))); 5992 if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1 5993 || pow2p_hwi (size) 5994 || !vect_shift_permute_load_chain (dr_chain, size, stmt, 5995 gsi, &result_chain)) 5996 vect_permute_load_chain (dr_chain, size, stmt, gsi, &result_chain); 5997 vect_record_grouped_load_vectors (stmt, result_chain); 5998 result_chain.release (); 5999 } 6000 6001 /* RESULT_CHAIN contains the output of a group of grouped loads that were 6002 generated as part of the vectorization of STMT. Assign the statement 6003 for each vector to the associated scalar statement. */ 6004 6005 void 6006 vect_record_grouped_load_vectors (gimple *stmt, vec<tree> result_chain) 6007 { 6008 gimple *first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)); 6009 gimple *next_stmt, *new_stmt; 6010 unsigned int i, gap_count; 6011 tree tmp_data_ref; 6012 6013 /* Put a permuted data-ref in the VECTORIZED_STMT field. 6014 Since we scan the chain starting from it's first node, their order 6015 corresponds the order of data-refs in RESULT_CHAIN. */ 6016 next_stmt = first_stmt; 6017 gap_count = 1; 6018 FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref) 6019 { 6020 if (!next_stmt) 6021 break; 6022 6023 /* Skip the gaps. Loads created for the gaps will be removed by dead 6024 code elimination pass later. No need to check for the first stmt in 6025 the group, since it always exists. 6026 GROUP_GAP is the number of steps in elements from the previous 6027 access (if there is no gap GROUP_GAP is 1). We skip loads that 6028 correspond to the gaps. */ 6029 if (next_stmt != first_stmt 6030 && gap_count < GROUP_GAP (vinfo_for_stmt (next_stmt))) 6031 { 6032 gap_count++; 6033 continue; 6034 } 6035 6036 while (next_stmt) 6037 { 6038 new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref); 6039 /* We assume that if VEC_STMT is not NULL, this is a case of multiple 6040 copies, and we put the new vector statement in the first available 6041 RELATED_STMT. */ 6042 if (!STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt))) 6043 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)) = new_stmt; 6044 else 6045 { 6046 if (!GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt))) 6047 { 6048 gimple *prev_stmt = 6049 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)); 6050 gimple *rel_stmt = 6051 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt)); 6052 while (rel_stmt) 6053 { 6054 prev_stmt = rel_stmt; 6055 rel_stmt = 6056 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (rel_stmt)); 6057 } 6058 6059 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt)) = 6060 new_stmt; 6061 } 6062 } 6063 6064 next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt)); 6065 gap_count = 1; 6066 /* If NEXT_STMT accesses the same DR as the previous statement, 6067 put the same TMP_DATA_REF as its vectorized statement; otherwise 6068 get the next data-ref from RESULT_CHAIN. */ 6069 if (!next_stmt || !GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt))) 6070 break; 6071 } 6072 } 6073 } 6074 6075 /* Function vect_force_dr_alignment_p. 6076 6077 Returns whether the alignment of a DECL can be forced to be aligned 6078 on ALIGNMENT bit boundary. 
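
   (A hedged sketch of the intended use, with hypothetical names - the
   real caller is the alignment-enhancement code elsewhere in the
   vectorizer:

     if (vect_can_force_dr_alignment_p (base_decl, TYPE_ALIGN (vectype)))
       {
	 SET_DECL_ALIGN (base_decl, TYPE_ALIGN (vectype));
	 DECL_USER_ALIGN (base_decl) = 1;
       }

   i.e. the declaration's alignment is only raised when the symbol table
   allows it to be increased and the request does not exceed the file- or
   stack-alignment limit checked below.)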
 */

bool
vect_can_force_dr_alignment_p (const_tree decl, unsigned int alignment)
{
  if (!VAR_P (decl))
    return false;

  if (decl_in_symtab_p (decl)
      && !symtab_node::get (decl)->can_increase_alignment_p ())
    return false;

  if (TREE_STATIC (decl))
    return (alignment <= MAX_OFILE_ALIGNMENT);
  else
    return (alignment <= MAX_STACK_ALIGNMENT);
}


/* Return whether the data reference DR is supported with respect to its
   alignment.
   If CHECK_ALIGNED_ACCESSES is TRUE, check whether the access is supported
   even if it is aligned, i.e., check whether it is possible to vectorize it
   with different alignment.  */

enum dr_alignment_support
vect_supportable_dr_alignment (struct data_reference *dr,
			       bool check_aligned_accesses)
{
  gimple *stmt = DR_STMT (dr);
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
  machine_mode mode = TYPE_MODE (vectype);
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
  struct loop *vect_loop = NULL;
  bool nested_in_vect_loop = false;

  if (aligned_access_p (dr) && !check_aligned_accesses)
    return dr_aligned;

  /* For now assume all conditional loads/stores support unaligned
     access without any special code.  */
  if (is_gimple_call (stmt)
      && gimple_call_internal_p (stmt)
      && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
	  || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
    return dr_unaligned_supported;

  if (loop_vinfo)
    {
      vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
      nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt);
    }

  /* Possibly unaligned access.  */

  /* We can choose between using the implicit realignment scheme (generating
     a misaligned_move stmt) and the explicit realignment scheme (generating
     aligned loads with a REALIGN_LOAD).  There are two variants to the
     explicit realignment scheme: optimized, and unoptimized.
     We can optimize the realignment only if the step between consecutive
     vector loads is equal to the vector size.  Since the vector memory
     accesses advance in steps of VS (Vector Size) in the vectorized loop, it
     is guaranteed that the misalignment amount remains the same throughout
     the execution of the vectorized loop.  Therefore, we can create the
     "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
     at the loop preheader.

     However, in the case of outer-loop vectorization, when vectorizing a
     memory access in the inner-loop nested within the LOOP that is now being
     vectorized, while it is guaranteed that the misalignment of the
     vectorized memory access will remain the same in different outer-loop
     iterations, it is *not* guaranteed that it will remain the same
     throughout the execution of the inner-loop.  This is because the
     inner-loop advances with the original scalar step (and not in steps
     of VS).  If the inner-loop step happens to be a multiple of VS, then
     the misalignment remains fixed and we can use the optimized realignment
     scheme.
For example: 6155 6156 for (i=0; i<N; i++) 6157 for (j=0; j<M; j++) 6158 s += a[i+j]; 6159 6160 When vectorizing the i-loop in the above example, the step between 6161 consecutive vector loads is 1, and so the misalignment does not remain 6162 fixed across the execution of the inner-loop, and the realignment cannot 6163 be optimized (as illustrated in the following pseudo vectorized loop): 6164 6165 for (i=0; i<N; i+=4) 6166 for (j=0; j<M; j++){ 6167 vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...} 6168 // when j is {0,1,2,3,4,5,6,7,...} respectively. 6169 // (assuming that we start from an aligned address). 6170 } 6171 6172 We therefore have to use the unoptimized realignment scheme: 6173 6174 for (i=0; i<N; i+=4) 6175 for (j=k; j<M; j+=4) 6176 vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming 6177 // that the misalignment of the initial address is 6178 // 0). 6179 6180 The loop can then be vectorized as follows: 6181 6182 for (k=0; k<4; k++){ 6183 rt = get_realignment_token (&vp[k]); 6184 for (i=0; i<N; i+=4){ 6185 v1 = vp[i+k]; 6186 for (j=k; j<M; j+=4){ 6187 v2 = vp[i+j+VS-1]; 6188 va = REALIGN_LOAD <v1,v2,rt>; 6189 vs += va; 6190 v1 = v2; 6191 } 6192 } 6193 } */ 6194 6195 if (DR_IS_READ (dr)) 6196 { 6197 bool is_packed = false; 6198 tree type = (TREE_TYPE (DR_REF (dr))); 6199 6200 if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing 6201 && (!targetm.vectorize.builtin_mask_for_load 6202 || targetm.vectorize.builtin_mask_for_load ())) 6203 { 6204 tree vectype = STMT_VINFO_VECTYPE (stmt_info); 6205 6206 /* If we are doing SLP then the accesses need not have the 6207 same alignment, instead it depends on the SLP group size. */ 6208 if (loop_vinfo 6209 && STMT_SLP_TYPE (stmt_info) 6210 && (LOOP_VINFO_VECT_FACTOR (loop_vinfo) 6211 * GROUP_SIZE (vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info))) 6212 % TYPE_VECTOR_SUBPARTS (vectype) != 0)) 6213 ; 6214 else if (!loop_vinfo 6215 || (nested_in_vect_loop 6216 && (TREE_INT_CST_LOW (DR_STEP (dr)) 6217 != GET_MODE_SIZE (TYPE_MODE (vectype))))) 6218 return dr_explicit_realign; 6219 else 6220 return dr_explicit_realign_optimized; 6221 } 6222 if (!known_alignment_for_access_p (dr)) 6223 is_packed = not_size_aligned (DR_REF (dr)); 6224 6225 if (targetm.vectorize.support_vector_misalignment 6226 (mode, type, DR_MISALIGNMENT (dr), is_packed)) 6227 /* Can't software pipeline the loads, but can at least do them. */ 6228 return dr_unaligned_supported; 6229 } 6230 else 6231 { 6232 bool is_packed = false; 6233 tree type = (TREE_TYPE (DR_REF (dr))); 6234 6235 if (!known_alignment_for_access_p (dr)) 6236 is_packed = not_size_aligned (DR_REF (dr)); 6237 6238 if (targetm.vectorize.support_vector_misalignment 6239 (mode, type, DR_MISALIGNMENT (dr), is_packed)) 6240 return dr_unaligned_supported; 6241 } 6242 6243 /* Unsupported. */ 6244 return dr_unaligned_unsupported; 6245 } 6246
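
/* Editor's note - an illustrative summary of the decision made by
   vect_supportable_dr_alignment above, written as a tiny standalone
   sketch (hypothetical names, not GCC API); it only restates the read
   path for a loop access:

     enum realign_kind { ALIGNED, REALIGN, REALIGN_OPTIMIZED,
			 UNALIGNED_OK, UNSUPPORTED };

     static enum realign_kind
     classify_load (int aligned, int target_has_realign_load,
		    int nested_in_outer_loop, long step_bytes,
		    long vector_size_bytes, int misalign_ok)
     {
       if (aligned)
	 return ALIGNED;
       if (target_has_realign_load)
	 // The realignment token can be hoisted to the preheader only
	 // when the misalignment is invariant, i.e. the step between
	 // consecutive vector loads equals the vector size.
	 return (nested_in_outer_loop && step_bytes != vector_size_bytes)
		? REALIGN : REALIGN_OPTIMIZED;
       return misalign_ok ? UNALIGNED_OK : UNSUPPORTED;
     }

   The real routine additionally special-cases SLP group sizes, masked
   loads/stores, basic-block vectorization and packed accesses, and
   handles the store side, which never uses REALIGN_LOAD.  */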