1 /* Data References Analysis and Manipulation Utilities for Vectorization. 2 Copyright (C) 2003-2013 Free Software Foundation, Inc. 3 Contributed by Dorit Naishlos <dorit@il.ibm.com> 4 and Ira Rosen <irar@il.ibm.com> 5 6 This file is part of GCC. 7 8 GCC is free software; you can redistribute it and/or modify it under 9 the terms of the GNU General Public License as published by the Free 10 Software Foundation; either version 3, or (at your option) any later 11 version. 12 13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY 14 WARRANTY; without even the implied warranty of MERCHANTABILITY or 15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 16 for more details. 17 18 You should have received a copy of the GNU General Public License 19 along with GCC; see the file COPYING3. If not see 20 <http://www.gnu.org/licenses/>. */ 21 22 #include "config.h" 23 #include "system.h" 24 #include "coretypes.h" 25 #include "dumpfile.h" 26 #include "tm.h" 27 #include "ggc.h" 28 #include "tree.h" 29 #include "tm_p.h" 30 #include "target.h" 31 #include "basic-block.h" 32 #include "gimple-pretty-print.h" 33 #include "tree-flow.h" 34 #include "dumpfile.h" 35 #include "cfgloop.h" 36 #include "tree-chrec.h" 37 #include "tree-scalar-evolution.h" 38 #include "tree-vectorizer.h" 39 #include "diagnostic-core.h" 40 41 /* Need to include rtl.h, expr.h, etc. for optabs. */ 42 #include "expr.h" 43 #include "optabs.h" 44 45 /* Return true if load- or store-lanes optab OPTAB is implemented for 46 COUNT vectors of type VECTYPE. NAME is the name of OPTAB. */ 47 48 static bool 49 vect_lanes_optab_supported_p (const char *name, convert_optab optab, 50 tree vectype, unsigned HOST_WIDE_INT count) 51 { 52 enum machine_mode mode, array_mode; 53 bool limit_p; 54 55 mode = TYPE_MODE (vectype); 56 limit_p = !targetm.array_mode_supported_p (mode, count); 57 array_mode = mode_for_size (count * GET_MODE_BITSIZE (mode), 58 MODE_INT, limit_p); 59 60 if (array_mode == BLKmode) 61 { 62 if (dump_enabled_p ()) 63 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 64 "no array mode for %s[" HOST_WIDE_INT_PRINT_DEC "]", 65 GET_MODE_NAME (mode), count); 66 return false; 67 } 68 69 if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing) 70 { 71 if (dump_enabled_p ()) 72 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 73 "cannot use %s<%s><%s>", name, 74 GET_MODE_NAME (array_mode), GET_MODE_NAME (mode)); 75 return false; 76 } 77 78 if (dump_enabled_p ()) 79 dump_printf_loc (MSG_NOTE, vect_location, 80 "can use %s<%s><%s>", name, GET_MODE_NAME (array_mode), 81 GET_MODE_NAME (mode)); 82 83 return true; 84 } 85 86 87 /* Return the smallest scalar part of STMT. 88 This is used to determine the vectype of the stmt. We generally set the 89 vectype according to the type of the result (lhs). For stmts whose 90 result-type is different than the type of the arguments (e.g., demotion, 91 promotion), vectype will be reset appropriately (later). Note that we have 92 to visit the smallest datatype in this function, because that determines the 93 VF. If the smallest datatype in the loop is present only as the rhs of a 94 promotion operation - we'd miss it. 95 Such a case, where a variable of this datatype does not appear in the lhs 96 anywhere in the loop, can only occur if it's an invariant: e.g.: 97 'int_x = (int) short_inv', which we'd expect to have been optimized away by 98 invariant motion. 
However, we cannot rely on invariant motion to always 99 take invariants out of the loop, and so in the case of promotion we also 100 have to check the rhs. 101 LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding 102 types. */ 103 104 tree 105 vect_get_smallest_scalar_type (gimple stmt, HOST_WIDE_INT *lhs_size_unit, 106 HOST_WIDE_INT *rhs_size_unit) 107 { 108 tree scalar_type = gimple_expr_type (stmt); 109 HOST_WIDE_INT lhs, rhs; 110 111 lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type)); 112 113 if (is_gimple_assign (stmt) 114 && (gimple_assign_cast_p (stmt) 115 || gimple_assign_rhs_code (stmt) == WIDEN_MULT_EXPR 116 || gimple_assign_rhs_code (stmt) == WIDEN_LSHIFT_EXPR 117 || gimple_assign_rhs_code (stmt) == FLOAT_EXPR)) 118 { 119 tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (stmt)); 120 121 rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type)); 122 if (rhs < lhs) 123 scalar_type = rhs_type; 124 } 125 126 *lhs_size_unit = lhs; 127 *rhs_size_unit = rhs; 128 return scalar_type; 129 } 130 131 132 /* Find the place of the data-ref in STMT in the interleaving chain that starts 133 from FIRST_STMT. Return -1 if the data-ref is not a part of the chain. */ 134 135 int 136 vect_get_place_in_interleaving_chain (gimple stmt, gimple first_stmt) 137 { 138 gimple next_stmt = first_stmt; 139 int result = 0; 140 141 if (first_stmt != GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt))) 142 return -1; 143 144 while (next_stmt && next_stmt != stmt) 145 { 146 result++; 147 next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt)); 148 } 149 150 if (next_stmt) 151 return result; 152 else 153 return -1; 154 } 155 156 157 /* Function vect_insert_into_interleaving_chain. 158 159 Insert DRA into the interleaving chain of DRB according to DRA's INIT. */ 160 161 static void 162 vect_insert_into_interleaving_chain (struct data_reference *dra, 163 struct data_reference *drb) 164 { 165 gimple prev, next; 166 tree next_init; 167 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra)); 168 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb)); 169 170 prev = GROUP_FIRST_ELEMENT (stmtinfo_b); 171 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (prev)); 172 while (next) 173 { 174 next_init = DR_INIT (STMT_VINFO_DATA_REF (vinfo_for_stmt (next))); 175 if (tree_int_cst_compare (next_init, DR_INIT (dra)) > 0) 176 { 177 /* Insert here. */ 178 GROUP_NEXT_ELEMENT (vinfo_for_stmt (prev)) = DR_STMT (dra); 179 GROUP_NEXT_ELEMENT (stmtinfo_a) = next; 180 return; 181 } 182 prev = next; 183 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (prev)); 184 } 185 186 /* We got to the end of the list. Insert here. */ 187 GROUP_NEXT_ELEMENT (vinfo_for_stmt (prev)) = DR_STMT (dra); 188 GROUP_NEXT_ELEMENT (stmtinfo_a) = NULL; 189 } 190 191 192 /* Function vect_update_interleaving_chain. 193 194 For two data-refs DRA and DRB that are a part of a chain interleaved data 195 accesses, update the interleaving chain. DRB's INIT is smaller than DRA's. 196 197 There are four possible cases: 198 1. New stmts - both DRA and DRB are not a part of any chain: 199 FIRST_DR = DRB 200 NEXT_DR (DRB) = DRA 201 2. DRB is a part of a chain and DRA is not: 202 no need to update FIRST_DR 203 no need to insert DRB 204 insert DRA according to init 205 3. DRA is a part of a chain and DRB is not: 206 if (init of FIRST_DR > init of DRB) 207 FIRST_DR = DRB 208 NEXT(FIRST_DR) = previous FIRST_DR 209 else 210 insert DRB according to its init 211 4. 
both DRA and DRB are in some interleaving chains: 212 choose the chain with the smallest init of FIRST_DR 213 insert the nodes of the second chain into the first one. */ 214 215 static void 216 vect_update_interleaving_chain (struct data_reference *drb, 217 struct data_reference *dra) 218 { 219 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra)); 220 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb)); 221 tree next_init, init_dra_chain, init_drb_chain; 222 gimple first_a, first_b; 223 tree node_init; 224 gimple node, prev, next, first_stmt; 225 226 /* 1. New stmts - both DRA and DRB are not a part of any chain. */ 227 if (!GROUP_FIRST_ELEMENT (stmtinfo_a) && !GROUP_FIRST_ELEMENT (stmtinfo_b)) 228 { 229 GROUP_FIRST_ELEMENT (stmtinfo_a) = DR_STMT (drb); 230 GROUP_FIRST_ELEMENT (stmtinfo_b) = DR_STMT (drb); 231 GROUP_NEXT_ELEMENT (stmtinfo_b) = DR_STMT (dra); 232 return; 233 } 234 235 /* 2. DRB is a part of a chain and DRA is not. */ 236 if (!GROUP_FIRST_ELEMENT (stmtinfo_a) && GROUP_FIRST_ELEMENT (stmtinfo_b)) 237 { 238 GROUP_FIRST_ELEMENT (stmtinfo_a) = GROUP_FIRST_ELEMENT (stmtinfo_b); 239 /* Insert DRA into the chain of DRB. */ 240 vect_insert_into_interleaving_chain (dra, drb); 241 return; 242 } 243 244 /* 3. DRA is a part of a chain and DRB is not. */ 245 if (GROUP_FIRST_ELEMENT (stmtinfo_a) && !GROUP_FIRST_ELEMENT (stmtinfo_b)) 246 { 247 gimple old_first_stmt = GROUP_FIRST_ELEMENT (stmtinfo_a); 248 tree init_old = DR_INIT (STMT_VINFO_DATA_REF (vinfo_for_stmt ( 249 old_first_stmt))); 250 gimple tmp; 251 252 if (tree_int_cst_compare (init_old, DR_INIT (drb)) > 0) 253 { 254 /* DRB's init is smaller than the init of the stmt previously marked 255 as the first stmt of the interleaving chain of DRA. Therefore, we 256 update FIRST_STMT and put DRB in the head of the list. */ 257 GROUP_FIRST_ELEMENT (stmtinfo_b) = DR_STMT (drb); 258 GROUP_NEXT_ELEMENT (stmtinfo_b) = old_first_stmt; 259 260 /* Update all the stmts in the list to point to the new FIRST_STMT. */ 261 tmp = old_first_stmt; 262 while (tmp) 263 { 264 GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp)) = DR_STMT (drb); 265 tmp = GROUP_NEXT_ELEMENT (vinfo_for_stmt (tmp)); 266 } 267 } 268 else 269 { 270 /* Insert DRB in the list of DRA. */ 271 vect_insert_into_interleaving_chain (drb, dra); 272 GROUP_FIRST_ELEMENT (stmtinfo_b) = GROUP_FIRST_ELEMENT (stmtinfo_a); 273 } 274 return; 275 } 276 277 /* 4. both DRA and DRB are in some interleaving chains. */ 278 first_a = GROUP_FIRST_ELEMENT (stmtinfo_a); 279 first_b = GROUP_FIRST_ELEMENT (stmtinfo_b); 280 if (first_a == first_b) 281 return; 282 init_dra_chain = DR_INIT (STMT_VINFO_DATA_REF (vinfo_for_stmt (first_a))); 283 init_drb_chain = DR_INIT (STMT_VINFO_DATA_REF (vinfo_for_stmt (first_b))); 284 285 if (tree_int_cst_compare (init_dra_chain, init_drb_chain) > 0) 286 { 287 /* Insert the nodes of DRA chain into the DRB chain. 288 After inserting a node, continue from this node of the DRB chain (don't 289 start from the beginning. */ 290 node = GROUP_FIRST_ELEMENT (stmtinfo_a); 291 prev = GROUP_FIRST_ELEMENT (stmtinfo_b); 292 first_stmt = first_b; 293 } 294 else 295 { 296 /* Insert the nodes of DRB chain into the DRA chain. 297 After inserting a node, continue from this node of the DRA chain (don't 298 start from the beginning. 
*/ 299 node = GROUP_FIRST_ELEMENT (stmtinfo_b); 300 prev = GROUP_FIRST_ELEMENT (stmtinfo_a); 301 first_stmt = first_a; 302 } 303 304 while (node) 305 { 306 node_init = DR_INIT (STMT_VINFO_DATA_REF (vinfo_for_stmt (node))); 307 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (prev)); 308 while (next) 309 { 310 next_init = DR_INIT (STMT_VINFO_DATA_REF (vinfo_for_stmt (next))); 311 if (tree_int_cst_compare (next_init, node_init) > 0) 312 { 313 /* Insert here. */ 314 GROUP_NEXT_ELEMENT (vinfo_for_stmt (prev)) = node; 315 GROUP_NEXT_ELEMENT (vinfo_for_stmt (node)) = next; 316 prev = node; 317 break; 318 } 319 prev = next; 320 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (prev)); 321 } 322 if (!next) 323 { 324 /* We got to the end of the list. Insert here. */ 325 GROUP_NEXT_ELEMENT (vinfo_for_stmt (prev)) = node; 326 GROUP_NEXT_ELEMENT (vinfo_for_stmt (node)) = NULL; 327 prev = node; 328 } 329 GROUP_FIRST_ELEMENT (vinfo_for_stmt (node)) = first_stmt; 330 node = GROUP_NEXT_ELEMENT (vinfo_for_stmt (node)); 331 } 332 } 333 334 /* Check dependence between DRA and DRB for basic block vectorization. 335 If the accesses share same bases and offsets, we can compare their initial 336 constant offsets to decide whether they differ or not. In case of a read- 337 write dependence we check that the load is before the store to ensure that 338 vectorization will not change the order of the accesses. */ 339 340 static bool 341 vect_drs_dependent_in_basic_block (struct data_reference *dra, 342 struct data_reference *drb) 343 { 344 HOST_WIDE_INT type_size_a, type_size_b, init_a, init_b; 345 gimple earlier_stmt; 346 347 /* We only call this function for pairs of loads and stores, but we verify 348 it here. */ 349 if (DR_IS_READ (dra) == DR_IS_READ (drb)) 350 { 351 if (DR_IS_READ (dra)) 352 return false; 353 else 354 return true; 355 } 356 357 /* Check that the data-refs have same bases and offsets. If not, we can't 358 determine if they are dependent. */ 359 if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0) 360 || !dr_equal_offsets_p (dra, drb)) 361 return true; 362 363 /* Check the types. */ 364 type_size_a = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)))); 365 type_size_b = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)))); 366 367 if (type_size_a != type_size_b 368 || !types_compatible_p (TREE_TYPE (DR_REF (dra)), 369 TREE_TYPE (DR_REF (drb)))) 370 return true; 371 372 init_a = TREE_INT_CST_LOW (DR_INIT (dra)); 373 init_b = TREE_INT_CST_LOW (DR_INIT (drb)); 374 375 /* Two different locations - no dependence. */ 376 if (init_a != init_b) 377 return false; 378 379 /* We have a read-write dependence. Check that the load is before the store. 380 When we vectorize basic blocks, vector load can be only before 381 corresponding scalar load, and vector store can be only after its 382 corresponding scalar store. So the order of the acceses is preserved in 383 case the load is before the store. */ 384 earlier_stmt = get_earlier_stmt (DR_STMT (dra), DR_STMT (drb)); 385 if (DR_IS_READ (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt)))) 386 return false; 387 388 return true; 389 } 390 391 392 /* Function vect_check_interleaving. 393 394 Check if DRA and DRB are a part of interleaving. In case they are, insert 395 DRA and DRB in an interleaving chain. 
*/ 396 397 static bool 398 vect_check_interleaving (struct data_reference *dra, 399 struct data_reference *drb) 400 { 401 HOST_WIDE_INT type_size_a, type_size_b, diff_mod_size, step, init_a, init_b; 402 403 /* Check that the data-refs have same first location (except init) and they 404 are both either store or load (not load and store). */ 405 if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0) 406 || !dr_equal_offsets_p (dra, drb) 407 || !tree_int_cst_compare (DR_INIT (dra), DR_INIT (drb)) 408 || DR_IS_READ (dra) != DR_IS_READ (drb)) 409 return false; 410 411 /* Check: 412 1. data-refs are of the same type 413 2. their steps are equal 414 3. the step (if greater than zero) is greater than the difference between 415 data-refs' inits. */ 416 type_size_a = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)))); 417 type_size_b = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)))); 418 419 if (type_size_a != type_size_b 420 || tree_int_cst_compare (DR_STEP (dra), DR_STEP (drb)) 421 || !types_compatible_p (TREE_TYPE (DR_REF (dra)), 422 TREE_TYPE (DR_REF (drb)))) 423 return false; 424 425 init_a = TREE_INT_CST_LOW (DR_INIT (dra)); 426 init_b = TREE_INT_CST_LOW (DR_INIT (drb)); 427 step = TREE_INT_CST_LOW (DR_STEP (dra)); 428 429 if (init_a > init_b) 430 { 431 /* If init_a == init_b + the size of the type * k, we have an interleaving, 432 and DRB is accessed before DRA. */ 433 diff_mod_size = (init_a - init_b) % type_size_a; 434 435 if (step && (init_a - init_b) > step) 436 return false; 437 438 if (diff_mod_size == 0) 439 { 440 vect_update_interleaving_chain (drb, dra); 441 if (dump_enabled_p ()) 442 { 443 dump_printf_loc (MSG_NOTE, vect_location, 444 "Detected interleaving "); 445 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra)); 446 dump_printf (MSG_NOTE, " and "); 447 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb)); 448 } 449 return true; 450 } 451 } 452 else 453 { 454 /* If init_b == init_a + the size of the type * k, we have an 455 interleaving, and DRA is accessed before DRB. */ 456 diff_mod_size = (init_b - init_a) % type_size_a; 457 458 if (step && (init_b - init_a) > step) 459 return false; 460 461 if (diff_mod_size == 0) 462 { 463 vect_update_interleaving_chain (dra, drb); 464 if (dump_enabled_p ()) 465 { 466 dump_printf_loc (MSG_NOTE, vect_location, 467 "Detected interleaving "); 468 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra)); 469 dump_printf (MSG_NOTE, " and "); 470 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb)); 471 } 472 return true; 473 } 474 } 475 476 return false; 477 } 478 479 /* Check if data references pointed by DR_I and DR_J are same or 480 belong to same interleaving group. Return FALSE if drs are 481 different, otherwise return TRUE. */ 482 483 static bool 484 vect_same_range_drs (data_reference_p dr_i, data_reference_p dr_j) 485 { 486 gimple stmt_i = DR_STMT (dr_i); 487 gimple stmt_j = DR_STMT (dr_j); 488 489 if (operand_equal_p (DR_REF (dr_i), DR_REF (dr_j), 0) 490 || (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_i)) 491 && GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_j)) 492 && (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_i)) 493 == GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_j))))) 494 return true; 495 else 496 return false; 497 } 498 499 /* If address ranges represented by DDR_I and DDR_J are equal, 500 return TRUE, otherwise return FALSE. 
*/ 501 502 static bool 503 vect_vfa_range_equal (ddr_p ddr_i, ddr_p ddr_j) 504 { 505 if ((vect_same_range_drs (DDR_A (ddr_i), DDR_A (ddr_j)) 506 && vect_same_range_drs (DDR_B (ddr_i), DDR_B (ddr_j))) 507 || (vect_same_range_drs (DDR_A (ddr_i), DDR_B (ddr_j)) 508 && vect_same_range_drs (DDR_B (ddr_i), DDR_A (ddr_j)))) 509 return true; 510 else 511 return false; 512 } 513 514 /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be 515 tested at run-time. Return TRUE if DDR was successfully inserted. 516 Return false if versioning is not supported. */ 517 518 static bool 519 vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo) 520 { 521 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); 522 523 if ((unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS) == 0) 524 return false; 525 526 if (dump_enabled_p ()) 527 { 528 dump_printf_loc (MSG_NOTE, vect_location, 529 "mark for run-time aliasing test between "); 530 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_A (ddr))); 531 dump_printf (MSG_NOTE, " and "); 532 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_B (ddr))); 533 } 534 535 if (optimize_loop_nest_for_size_p (loop)) 536 { 537 if (dump_enabled_p ()) 538 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 539 "versioning not supported when optimizing for size."); 540 return false; 541 } 542 543 /* FORNOW: We don't support versioning with outer-loop vectorization. */ 544 if (loop->inner) 545 { 546 if (dump_enabled_p ()) 547 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 548 "versioning not yet supported for outer-loops."); 549 return false; 550 } 551 552 /* FORNOW: We don't support creating runtime alias tests for non-constant 553 step. */ 554 if (TREE_CODE (DR_STEP (DDR_A (ddr))) != INTEGER_CST 555 || TREE_CODE (DR_STEP (DDR_B (ddr))) != INTEGER_CST) 556 { 557 if (dump_enabled_p ()) 558 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 559 "versioning not yet supported for non-constant " 560 "step"); 561 return false; 562 } 563 564 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr); 565 return true; 566 } 567 568 569 /* Function vect_analyze_data_ref_dependence. 570 571 Return TRUE if there (might) exist a dependence between a memory-reference 572 DRA and a memory-reference DRB. When versioning for alias may check a 573 dependence at run-time, return FALSE. Adjust *MAX_VF according to 574 the data dependence. */ 575 576 static bool 577 vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr, 578 loop_vec_info loop_vinfo, int *max_vf) 579 { 580 unsigned int i; 581 struct loop *loop = NULL; 582 struct data_reference *dra = DDR_A (ddr); 583 struct data_reference *drb = DDR_B (ddr); 584 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra)); 585 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb)); 586 lambda_vector dist_v; 587 unsigned int loop_depth; 588 589 /* Don't bother to analyze statements marked as unvectorizable. */ 590 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a) 591 || !STMT_VINFO_VECTORIZABLE (stmtinfo_b)) 592 return false; 593 594 if (DDR_ARE_DEPENDENT (ddr) == chrec_known) 595 { 596 /* Independent data accesses. 
*/ 597 vect_check_interleaving (dra, drb); 598 return false; 599 } 600 601 if (loop_vinfo) 602 loop = LOOP_VINFO_LOOP (loop_vinfo); 603 604 if ((DR_IS_READ (dra) && DR_IS_READ (drb) && loop_vinfo) || dra == drb) 605 return false; 606 607 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know) 608 { 609 gimple earlier_stmt; 610 611 if (loop_vinfo) 612 { 613 if (dump_enabled_p ()) 614 { 615 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 616 "versioning for alias required: " 617 "can't determine dependence between "); 618 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, 619 DR_REF (dra)); 620 dump_printf (MSG_MISSED_OPTIMIZATION, " and "); 621 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, 622 DR_REF (drb)); 623 } 624 625 /* Add to list of ddrs that need to be tested at run-time. */ 626 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo); 627 } 628 629 /* When vectorizing a basic block unknown depnedence can still mean 630 grouped access. */ 631 if (vect_check_interleaving (dra, drb)) 632 return false; 633 634 /* Read-read is OK (we need this check here, after checking for 635 interleaving). */ 636 if (DR_IS_READ (dra) && DR_IS_READ (drb)) 637 return false; 638 639 if (dump_enabled_p ()) 640 { 641 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 642 "can't determine dependence between "); 643 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra)); 644 dump_printf (MSG_MISSED_OPTIMIZATION, " and "); 645 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb)); 646 } 647 648 /* We do not vectorize basic blocks with write-write dependencies. */ 649 if (DR_IS_WRITE (dra) && DR_IS_WRITE (drb)) 650 return true; 651 652 /* Check that it's not a load-after-store dependence. */ 653 earlier_stmt = get_earlier_stmt (DR_STMT (dra), DR_STMT (drb)); 654 if (DR_IS_WRITE (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt)))) 655 return true; 656 657 return false; 658 } 659 660 /* Versioning for alias is not yet supported for basic block SLP, and 661 dependence distance is unapplicable, hence, in case of known data 662 dependence, basic block vectorization is impossible for now. */ 663 if (!loop_vinfo) 664 { 665 if (dra != drb && vect_check_interleaving (dra, drb)) 666 return false; 667 668 if (dump_enabled_p ()) 669 { 670 dump_printf_loc (MSG_NOTE, vect_location, 671 "determined dependence between "); 672 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra)); 673 dump_printf (MSG_NOTE, " and "); 674 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb)); 675 } 676 677 /* Do not vectorize basic blcoks with write-write dependences. */ 678 if (DR_IS_WRITE (dra) && DR_IS_WRITE (drb)) 679 return true; 680 681 /* Check if this dependence is allowed in basic block vectorization. */ 682 return vect_drs_dependent_in_basic_block (dra, drb); 683 } 684 685 /* Loop-based vectorization and known data dependence. */ 686 if (DDR_NUM_DIST_VECTS (ddr) == 0) 687 { 688 if (dump_enabled_p ()) 689 { 690 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 691 "versioning for alias required: " 692 "bad dist vector for "); 693 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra)); 694 dump_printf (MSG_MISSED_OPTIMIZATION, " and "); 695 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb)); 696 } 697 /* Add to list of ddrs that need to be tested at run-time. 
*/ 698 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo); 699 } 700 701 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr)); 702 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v) 703 { 704 int dist = dist_v[loop_depth]; 705 706 if (dump_enabled_p ()) 707 dump_printf_loc (MSG_NOTE, vect_location, 708 "dependence distance = %d.", dist); 709 710 if (dist == 0) 711 { 712 if (dump_enabled_p ()) 713 { 714 dump_printf_loc (MSG_NOTE, vect_location, 715 "dependence distance == 0 between "); 716 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra)); 717 dump_printf (MSG_NOTE, " and "); 718 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb)); 719 } 720 721 /* For interleaving, mark that there is a read-write dependency if 722 necessary. We check before that one of the data-refs is store. */ 723 if (DR_IS_READ (dra)) 724 GROUP_READ_WRITE_DEPENDENCE (stmtinfo_a) = true; 725 else 726 { 727 if (DR_IS_READ (drb)) 728 GROUP_READ_WRITE_DEPENDENCE (stmtinfo_b) = true; 729 } 730 731 continue; 732 } 733 734 if (dist > 0 && DDR_REVERSED_P (ddr)) 735 { 736 /* If DDR_REVERSED_P the order of the data-refs in DDR was 737 reversed (to make distance vector positive), and the actual 738 distance is negative. */ 739 if (dump_enabled_p ()) 740 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 741 "dependence distance negative."); 742 /* Record a negative dependence distance to later limit the 743 amount of stmt copying / unrolling we can perform. 744 Only need to handle read-after-write dependence. */ 745 if (DR_IS_READ (drb) 746 && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0 747 || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist)) 748 STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist; 749 continue; 750 } 751 752 if (abs (dist) >= 2 753 && abs (dist) < *max_vf) 754 { 755 /* The dependence distance requires reduction of the maximal 756 vectorization factor. */ 757 *max_vf = abs (dist); 758 if (dump_enabled_p ()) 759 dump_printf_loc (MSG_NOTE, vect_location, 760 "adjusting maximal vectorization factor to %i", 761 *max_vf); 762 } 763 764 if (abs (dist) >= *max_vf) 765 { 766 /* Dependence distance does not create dependence, as far as 767 vectorization is concerned, in this case. */ 768 if (dump_enabled_p ()) 769 dump_printf_loc (MSG_NOTE, vect_location, 770 "dependence distance >= VF."); 771 continue; 772 } 773 774 if (dump_enabled_p ()) 775 { 776 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 777 "not vectorized, possible dependence " 778 "between data-refs "); 779 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra)); 780 dump_printf (MSG_NOTE, " and "); 781 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb)); 782 } 783 784 return true; 785 } 786 787 return false; 788 } 789 790 /* Function vect_analyze_data_ref_dependences. 791 792 Examine all the data references in the loop, and make sure there do not 793 exist any data dependences between them. Set *MAX_VF according to 794 the maximum vectorization factor the data dependences allow. 
*/ 795 796 bool 797 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo, 798 bb_vec_info bb_vinfo, int *max_vf) 799 { 800 unsigned int i; 801 vec<ddr_p> ddrs = vNULL; 802 struct data_dependence_relation *ddr; 803 804 if (dump_enabled_p ()) 805 dump_printf_loc (MSG_NOTE, vect_location, 806 "=== vect_analyze_dependences ==="); 807 if (loop_vinfo) 808 ddrs = LOOP_VINFO_DDRS (loop_vinfo); 809 else 810 ddrs = BB_VINFO_DDRS (bb_vinfo); 811 812 FOR_EACH_VEC_ELT (ddrs, i, ddr) 813 if (vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf)) 814 return false; 815 816 return true; 817 } 818 819 820 /* Function vect_compute_data_ref_alignment 821 822 Compute the misalignment of the data reference DR. 823 824 Output: 825 1. If during the misalignment computation it is found that the data reference 826 cannot be vectorized then false is returned. 827 2. DR_MISALIGNMENT (DR) is defined. 828 829 FOR NOW: No analysis is actually performed. Misalignment is calculated 830 only for trivial cases. TODO. */ 831 832 static bool 833 vect_compute_data_ref_alignment (struct data_reference *dr) 834 { 835 gimple stmt = DR_STMT (dr); 836 stmt_vec_info stmt_info = vinfo_for_stmt (stmt); 837 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); 838 struct loop *loop = NULL; 839 tree ref = DR_REF (dr); 840 tree vectype; 841 tree base, base_addr; 842 bool base_aligned; 843 tree misalign; 844 tree aligned_to, alignment; 845 846 if (dump_enabled_p ()) 847 dump_printf_loc (MSG_NOTE, vect_location, 848 "vect_compute_data_ref_alignment:"); 849 850 if (loop_vinfo) 851 loop = LOOP_VINFO_LOOP (loop_vinfo); 852 853 /* Initialize misalignment to unknown. */ 854 SET_DR_MISALIGNMENT (dr, -1); 855 856 /* Strided loads perform only component accesses, misalignment information 857 is irrelevant for them. */ 858 if (STMT_VINFO_STRIDE_LOAD_P (stmt_info)) 859 return true; 860 861 misalign = DR_INIT (dr); 862 aligned_to = DR_ALIGNED_TO (dr); 863 base_addr = DR_BASE_ADDRESS (dr); 864 vectype = STMT_VINFO_VECTYPE (stmt_info); 865 866 /* In case the dataref is in an inner-loop of the loop that is being 867 vectorized (LOOP), we use the base and misalignment information 868 relative to the outer-loop (LOOP). This is ok only if the misalignment 869 stays the same throughout the execution of the inner-loop, which is why 870 we have to check that the stride of the dataref in the inner-loop evenly 871 divides by the vector size. */ 872 if (loop && nested_in_vect_loop_p (loop, stmt)) 873 { 874 tree step = DR_STEP (dr); 875 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step); 876 877 if (dr_step % GET_MODE_SIZE (TYPE_MODE (vectype)) == 0) 878 { 879 if (dump_enabled_p ()) 880 dump_printf_loc (MSG_NOTE, vect_location, 881 "inner step divides the vector-size."); 882 misalign = STMT_VINFO_DR_INIT (stmt_info); 883 aligned_to = STMT_VINFO_DR_ALIGNED_TO (stmt_info); 884 base_addr = STMT_VINFO_DR_BASE_ADDRESS (stmt_info); 885 } 886 else 887 { 888 if (dump_enabled_p ()) 889 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 890 "inner step doesn't divide the vector-size."); 891 misalign = NULL_TREE; 892 } 893 } 894 895 /* Similarly, if we're doing basic-block vectorization, we can only use 896 base and misalignment information relative to an innermost loop if the 897 misalignment stays the same throughout the execution of the loop. 898 As above, this is the case if the stride of the dataref evenly divides 899 by the vector size. 
*/ 900 if (!loop) 901 { 902 tree step = DR_STEP (dr); 903 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step); 904 905 if (dr_step % GET_MODE_SIZE (TYPE_MODE (vectype)) != 0) 906 { 907 if (dump_enabled_p ()) 908 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 909 "SLP: step doesn't divide the vector-size."); 910 misalign = NULL_TREE; 911 } 912 } 913 914 base = build_fold_indirect_ref (base_addr); 915 alignment = ssize_int (TYPE_ALIGN (vectype)/BITS_PER_UNIT); 916 917 if ((aligned_to && tree_int_cst_compare (aligned_to, alignment) < 0) 918 || !misalign) 919 { 920 if (dump_enabled_p ()) 921 { 922 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 923 "Unknown alignment for access: "); 924 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, base); 925 } 926 return true; 927 } 928 929 if ((DECL_P (base) 930 && tree_int_cst_compare (ssize_int (DECL_ALIGN_UNIT (base)), 931 alignment) >= 0) 932 || (TREE_CODE (base_addr) == SSA_NAME 933 && tree_int_cst_compare (ssize_int (TYPE_ALIGN_UNIT (TREE_TYPE ( 934 TREE_TYPE (base_addr)))), 935 alignment) >= 0) 936 || (get_pointer_alignment (base_addr) >= TYPE_ALIGN (vectype))) 937 base_aligned = true; 938 else 939 base_aligned = false; 940 941 if (!base_aligned) 942 { 943 /* Do not change the alignment of global variables here if 944 flag_section_anchors is enabled as we already generated 945 RTL for other functions. Most global variables should 946 have been aligned during the IPA increase_alignment pass. */ 947 if (!vect_can_force_dr_alignment_p (base, TYPE_ALIGN (vectype)) 948 || (TREE_STATIC (base) && flag_section_anchors)) 949 { 950 if (dump_enabled_p ()) 951 { 952 dump_printf_loc (MSG_NOTE, vect_location, 953 "can't force alignment of ref: "); 954 dump_generic_expr (MSG_NOTE, TDF_SLIM, ref); 955 } 956 return true; 957 } 958 959 /* Force the alignment of the decl. 960 NOTE: This is the only change to the code we make during 961 the analysis phase, before deciding to vectorize the loop. */ 962 if (dump_enabled_p ()) 963 { 964 dump_printf_loc (MSG_NOTE, vect_location, "force alignment of "); 965 dump_generic_expr (MSG_NOTE, TDF_SLIM, ref); 966 } 967 968 DECL_ALIGN (base) = TYPE_ALIGN (vectype); 969 DECL_USER_ALIGN (base) = 1; 970 } 971 972 /* At this point we assume that the base is aligned. */ 973 gcc_assert (base_aligned 974 || (TREE_CODE (base) == VAR_DECL 975 && DECL_ALIGN (base) >= TYPE_ALIGN (vectype))); 976 977 /* If this is a backward running DR then first access in the larger 978 vectype actually is N-1 elements before the address in the DR. 979 Adjust misalign accordingly. */ 980 if (tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0) 981 { 982 tree offset = ssize_int (TYPE_VECTOR_SUBPARTS (vectype) - 1); 983 /* DR_STEP(dr) is the same as -TYPE_SIZE of the scalar type, 984 otherwise we wouldn't be here. */ 985 offset = fold_build2 (MULT_EXPR, ssizetype, offset, DR_STEP (dr)); 986 /* PLUS because DR_STEP was negative. */ 987 misalign = size_binop (PLUS_EXPR, misalign, offset); 988 } 989 990 /* Modulo alignment. */ 991 misalign = size_binop (FLOOR_MOD_EXPR, misalign, alignment); 992 993 if (!host_integerp (misalign, 1)) 994 { 995 /* Negative or overflowed misalignment value. 
*/ 996 if (dump_enabled_p ()) 997 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 998 "unexpected misalign value"); 999 return false; 1000 } 1001 1002 SET_DR_MISALIGNMENT (dr, TREE_INT_CST_LOW (misalign)); 1003 1004 if (dump_enabled_p ()) 1005 { 1006 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 1007 "misalign = %d bytes of ref ", DR_MISALIGNMENT (dr)); 1008 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref); 1009 } 1010 1011 return true; 1012 } 1013 1014 1015 /* Function vect_compute_data_refs_alignment 1016 1017 Compute the misalignment of data references in the loop. 1018 Return FALSE if a data reference is found that cannot be vectorized. */ 1019 1020 static bool 1021 vect_compute_data_refs_alignment (loop_vec_info loop_vinfo, 1022 bb_vec_info bb_vinfo) 1023 { 1024 vec<data_reference_p> datarefs; 1025 struct data_reference *dr; 1026 unsigned int i; 1027 1028 if (loop_vinfo) 1029 datarefs = LOOP_VINFO_DATAREFS (loop_vinfo); 1030 else 1031 datarefs = BB_VINFO_DATAREFS (bb_vinfo); 1032 1033 FOR_EACH_VEC_ELT (datarefs, i, dr) 1034 if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) 1035 && !vect_compute_data_ref_alignment (dr)) 1036 { 1037 if (bb_vinfo) 1038 { 1039 /* Mark unsupported statement as unvectorizable. */ 1040 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false; 1041 continue; 1042 } 1043 else 1044 return false; 1045 } 1046 1047 return true; 1048 } 1049 1050 1051 /* Function vect_update_misalignment_for_peel 1052 1053 DR - the data reference whose misalignment is to be adjusted. 1054 DR_PEEL - the data reference whose misalignment is being made 1055 zero in the vector loop by the peel. 1056 NPEEL - the number of iterations in the peel loop if the misalignment 1057 of DR_PEEL is known at compile time. */ 1058 1059 static void 1060 vect_update_misalignment_for_peel (struct data_reference *dr, 1061 struct data_reference *dr_peel, int npeel) 1062 { 1063 unsigned int i; 1064 vec<dr_p> same_align_drs; 1065 struct data_reference *current_dr; 1066 int dr_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr)))); 1067 int dr_peel_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr_peel)))); 1068 stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr)); 1069 stmt_vec_info peel_stmt_info = vinfo_for_stmt (DR_STMT (dr_peel)); 1070 1071 /* For interleaved data accesses the step in the loop must be multiplied by 1072 the size of the interleaving group. */ 1073 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)) 1074 dr_size *= GROUP_SIZE (vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info))); 1075 if (STMT_VINFO_GROUPED_ACCESS (peel_stmt_info)) 1076 dr_peel_size *= GROUP_SIZE (peel_stmt_info); 1077 1078 /* It can be assumed that the data refs with the same alignment as dr_peel 1079 are aligned in the vector loop. */ 1080 same_align_drs 1081 = STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (DR_STMT (dr_peel))); 1082 FOR_EACH_VEC_ELT (same_align_drs, i, current_dr) 1083 { 1084 if (current_dr != dr) 1085 continue; 1086 gcc_assert (DR_MISALIGNMENT (dr) / dr_size == 1087 DR_MISALIGNMENT (dr_peel) / dr_peel_size); 1088 SET_DR_MISALIGNMENT (dr, 0); 1089 return; 1090 } 1091 1092 if (known_alignment_for_access_p (dr) 1093 && known_alignment_for_access_p (dr_peel)) 1094 { 1095 bool negative = tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0; 1096 int misal = DR_MISALIGNMENT (dr); 1097 tree vectype = STMT_VINFO_VECTYPE (stmt_info); 1098 misal += negative ? 
-npeel * dr_size : npeel * dr_size; 1099 misal &= (TYPE_ALIGN (vectype) / BITS_PER_UNIT) - 1; 1100 SET_DR_MISALIGNMENT (dr, misal); 1101 return; 1102 } 1103 1104 if (dump_enabled_p ()) 1105 dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment to -1."); 1106 SET_DR_MISALIGNMENT (dr, -1); 1107 } 1108 1109 1110 /* Function vect_verify_datarefs_alignment 1111 1112 Return TRUE if all data references in the loop can be 1113 handled with respect to alignment. */ 1114 1115 bool 1116 vect_verify_datarefs_alignment (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo) 1117 { 1118 vec<data_reference_p> datarefs; 1119 struct data_reference *dr; 1120 enum dr_alignment_support supportable_dr_alignment; 1121 unsigned int i; 1122 1123 if (loop_vinfo) 1124 datarefs = LOOP_VINFO_DATAREFS (loop_vinfo); 1125 else 1126 datarefs = BB_VINFO_DATAREFS (bb_vinfo); 1127 1128 FOR_EACH_VEC_ELT (datarefs, i, dr) 1129 { 1130 gimple stmt = DR_STMT (dr); 1131 stmt_vec_info stmt_info = vinfo_for_stmt (stmt); 1132 1133 if (!STMT_VINFO_RELEVANT_P (stmt_info)) 1134 continue; 1135 1136 /* For interleaving, only the alignment of the first access matters. 1137 Skip statements marked as not vectorizable. */ 1138 if ((STMT_VINFO_GROUPED_ACCESS (stmt_info) 1139 && GROUP_FIRST_ELEMENT (stmt_info) != stmt) 1140 || !STMT_VINFO_VECTORIZABLE (stmt_info)) 1141 continue; 1142 1143 /* Strided loads perform only component accesses, alignment is 1144 irrelevant for them. */ 1145 if (STMT_VINFO_STRIDE_LOAD_P (stmt_info)) 1146 continue; 1147 1148 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false); 1149 if (!supportable_dr_alignment) 1150 { 1151 if (dump_enabled_p ()) 1152 { 1153 if (DR_IS_READ (dr)) 1154 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 1155 "not vectorized: unsupported unaligned load."); 1156 else 1157 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 1158 "not vectorized: unsupported unaligned " 1159 "store."); 1160 1161 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, 1162 DR_REF (dr)); 1163 } 1164 return false; 1165 } 1166 if (supportable_dr_alignment != dr_aligned && dump_enabled_p ()) 1167 dump_printf_loc (MSG_NOTE, vect_location, 1168 "Vectorizing an unaligned access."); 1169 } 1170 return true; 1171 } 1172 1173 /* Given an memory reference EXP return whether its alignment is less 1174 than its size. */ 1175 1176 static bool 1177 not_size_aligned (tree exp) 1178 { 1179 if (!host_integerp (TYPE_SIZE (TREE_TYPE (exp)), 1)) 1180 return true; 1181 1182 return (TREE_INT_CST_LOW (TYPE_SIZE (TREE_TYPE (exp))) 1183 > get_object_alignment (exp)); 1184 } 1185 1186 /* Function vector_alignment_reachable_p 1187 1188 Return true if vector alignment for DR is reachable by peeling 1189 a few loop iterations. Return false otherwise. */ 1190 1191 static bool 1192 vector_alignment_reachable_p (struct data_reference *dr) 1193 { 1194 gimple stmt = DR_STMT (dr); 1195 stmt_vec_info stmt_info = vinfo_for_stmt (stmt); 1196 tree vectype = STMT_VINFO_VECTYPE (stmt_info); 1197 1198 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)) 1199 { 1200 /* For interleaved access we peel only if number of iterations in 1201 the prolog loop ({VF - misalignment}), is a multiple of the 1202 number of the interleaved accesses. */ 1203 int elem_size, mis_in_elements; 1204 int nelements = TYPE_VECTOR_SUBPARTS (vectype); 1205 1206 /* FORNOW: handle only known alignment. 
*/ 1207 if (!known_alignment_for_access_p (dr)) 1208 return false; 1209 1210 elem_size = GET_MODE_SIZE (TYPE_MODE (vectype)) / nelements; 1211 mis_in_elements = DR_MISALIGNMENT (dr) / elem_size; 1212 1213 if ((nelements - mis_in_elements) % GROUP_SIZE (stmt_info)) 1214 return false; 1215 } 1216 1217 /* If misalignment is known at the compile time then allow peeling 1218 only if natural alignment is reachable through peeling. */ 1219 if (known_alignment_for_access_p (dr) && !aligned_access_p (dr)) 1220 { 1221 HOST_WIDE_INT elmsize = 1222 int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype))); 1223 if (dump_enabled_p ()) 1224 { 1225 dump_printf_loc (MSG_NOTE, vect_location, 1226 "data size =" HOST_WIDE_INT_PRINT_DEC, elmsize); 1227 dump_printf (MSG_NOTE, 1228 ". misalignment = %d. ", DR_MISALIGNMENT (dr)); 1229 } 1230 if (DR_MISALIGNMENT (dr) % elmsize) 1231 { 1232 if (dump_enabled_p ()) 1233 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 1234 "data size does not divide the misalignment.\n"); 1235 return false; 1236 } 1237 } 1238 1239 if (!known_alignment_for_access_p (dr)) 1240 { 1241 tree type = TREE_TYPE (DR_REF (dr)); 1242 bool is_packed = not_size_aligned (DR_REF (dr)); 1243 if (dump_enabled_p ()) 1244 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 1245 "Unknown misalignment, is_packed = %d",is_packed); 1246 if (targetm.vectorize.vector_alignment_reachable (type, is_packed)) 1247 return true; 1248 else 1249 return false; 1250 } 1251 1252 return true; 1253 } 1254 1255 1256 /* Calculate the cost of the memory access represented by DR. */ 1257 1258 static void 1259 vect_get_data_access_cost (struct data_reference *dr, 1260 unsigned int *inside_cost, 1261 unsigned int *outside_cost, 1262 stmt_vector_for_cost *body_cost_vec) 1263 { 1264 gimple stmt = DR_STMT (dr); 1265 stmt_vec_info stmt_info = vinfo_for_stmt (stmt); 1266 int nunits = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)); 1267 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); 1268 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo); 1269 int ncopies = vf / nunits; 1270 1271 if (DR_IS_READ (dr)) 1272 vect_get_load_cost (dr, ncopies, true, inside_cost, outside_cost, 1273 NULL, body_cost_vec, false); 1274 else 1275 vect_get_store_cost (dr, ncopies, inside_cost, body_cost_vec); 1276 1277 if (dump_enabled_p ()) 1278 dump_printf_loc (MSG_NOTE, vect_location, 1279 "vect_get_data_access_cost: inside_cost = %d, " 1280 "outside_cost = %d.", *inside_cost, *outside_cost); 1281 } 1282 1283 1284 static hashval_t 1285 vect_peeling_hash (const void *elem) 1286 { 1287 const struct _vect_peel_info *peel_info; 1288 1289 peel_info = (const struct _vect_peel_info *) elem; 1290 return (hashval_t) peel_info->npeel; 1291 } 1292 1293 1294 static int 1295 vect_peeling_hash_eq (const void *elem1, const void *elem2) 1296 { 1297 const struct _vect_peel_info *a, *b; 1298 1299 a = (const struct _vect_peel_info *) elem1; 1300 b = (const struct _vect_peel_info *) elem2; 1301 return (a->npeel == b->npeel); 1302 } 1303 1304 1305 /* Insert DR into peeling hash table with NPEEL as key. 
*/ 1306 1307 static void 1308 vect_peeling_hash_insert (loop_vec_info loop_vinfo, struct data_reference *dr, 1309 int npeel) 1310 { 1311 struct _vect_peel_info elem, *slot; 1312 void **new_slot; 1313 bool supportable_dr_alignment = vect_supportable_dr_alignment (dr, true); 1314 1315 elem.npeel = npeel; 1316 slot = (vect_peel_info) htab_find (LOOP_VINFO_PEELING_HTAB (loop_vinfo), 1317 &elem); 1318 if (slot) 1319 slot->count++; 1320 else 1321 { 1322 slot = XNEW (struct _vect_peel_info); 1323 slot->npeel = npeel; 1324 slot->dr = dr; 1325 slot->count = 1; 1326 new_slot = htab_find_slot (LOOP_VINFO_PEELING_HTAB (loop_vinfo), slot, 1327 INSERT); 1328 *new_slot = slot; 1329 } 1330 1331 if (!supportable_dr_alignment && !flag_vect_cost_model) 1332 slot->count += VECT_MAX_COST; 1333 } 1334 1335 1336 /* Traverse peeling hash table to find peeling option that aligns maximum 1337 number of data accesses. */ 1338 1339 static int 1340 vect_peeling_hash_get_most_frequent (void **slot, void *data) 1341 { 1342 vect_peel_info elem = (vect_peel_info) *slot; 1343 vect_peel_extended_info max = (vect_peel_extended_info) data; 1344 1345 if (elem->count > max->peel_info.count 1346 || (elem->count == max->peel_info.count 1347 && max->peel_info.npeel > elem->npeel)) 1348 { 1349 max->peel_info.npeel = elem->npeel; 1350 max->peel_info.count = elem->count; 1351 max->peel_info.dr = elem->dr; 1352 } 1353 1354 return 1; 1355 } 1356 1357 1358 /* Traverse peeling hash table and calculate cost for each peeling option. 1359 Find the one with the lowest cost. */ 1360 1361 static int 1362 vect_peeling_hash_get_lowest_cost (void **slot, void *data) 1363 { 1364 vect_peel_info elem = (vect_peel_info) *slot; 1365 vect_peel_extended_info min = (vect_peel_extended_info) data; 1366 int save_misalignment, dummy; 1367 unsigned int inside_cost = 0, outside_cost = 0, i; 1368 gimple stmt = DR_STMT (elem->dr); 1369 stmt_vec_info stmt_info = vinfo_for_stmt (stmt); 1370 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); 1371 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo); 1372 struct data_reference *dr; 1373 stmt_vector_for_cost prologue_cost_vec, body_cost_vec, epilogue_cost_vec; 1374 int single_iter_cost; 1375 1376 prologue_cost_vec.create (2); 1377 body_cost_vec.create (2); 1378 epilogue_cost_vec.create (2); 1379 1380 FOR_EACH_VEC_ELT (datarefs, i, dr) 1381 { 1382 stmt = DR_STMT (dr); 1383 stmt_info = vinfo_for_stmt (stmt); 1384 /* For interleaving, only the alignment of the first access 1385 matters. */ 1386 if (STMT_VINFO_GROUPED_ACCESS (stmt_info) 1387 && GROUP_FIRST_ELEMENT (stmt_info) != stmt) 1388 continue; 1389 1390 save_misalignment = DR_MISALIGNMENT (dr); 1391 vect_update_misalignment_for_peel (dr, elem->dr, elem->npeel); 1392 vect_get_data_access_cost (dr, &inside_cost, &outside_cost, 1393 &body_cost_vec); 1394 SET_DR_MISALIGNMENT (dr, save_misalignment); 1395 } 1396 1397 single_iter_cost = vect_get_single_scalar_iteration_cost (loop_vinfo); 1398 outside_cost += vect_get_known_peeling_cost (loop_vinfo, elem->npeel, 1399 &dummy, single_iter_cost, 1400 &prologue_cost_vec, 1401 &epilogue_cost_vec); 1402 1403 /* Prologue and epilogue costs are added to the target model later. 1404 These costs depend only on the scalar iteration cost, the 1405 number of peeling iterations finally chosen, and the number of 1406 misaligned statements. So discard the information found here. 
*/ 1407 prologue_cost_vec.release (); 1408 epilogue_cost_vec.release (); 1409 1410 if (inside_cost < min->inside_cost 1411 || (inside_cost == min->inside_cost && outside_cost < min->outside_cost)) 1412 { 1413 min->inside_cost = inside_cost; 1414 min->outside_cost = outside_cost; 1415 min->body_cost_vec.release (); 1416 min->body_cost_vec = body_cost_vec; 1417 min->peel_info.dr = elem->dr; 1418 min->peel_info.npeel = elem->npeel; 1419 } 1420 else 1421 body_cost_vec.release (); 1422 1423 return 1; 1424 } 1425 1426 1427 /* Choose best peeling option by traversing peeling hash table and either 1428 choosing an option with the lowest cost (if cost model is enabled) or the 1429 option that aligns as many accesses as possible. */ 1430 1431 static struct data_reference * 1432 vect_peeling_hash_choose_best_peeling (loop_vec_info loop_vinfo, 1433 unsigned int *npeel, 1434 stmt_vector_for_cost *body_cost_vec) 1435 { 1436 struct _vect_peel_extended_info res; 1437 1438 res.peel_info.dr = NULL; 1439 res.body_cost_vec = stmt_vector_for_cost(); 1440 1441 if (flag_vect_cost_model) 1442 { 1443 res.inside_cost = INT_MAX; 1444 res.outside_cost = INT_MAX; 1445 htab_traverse (LOOP_VINFO_PEELING_HTAB (loop_vinfo), 1446 vect_peeling_hash_get_lowest_cost, &res); 1447 } 1448 else 1449 { 1450 res.peel_info.count = 0; 1451 htab_traverse (LOOP_VINFO_PEELING_HTAB (loop_vinfo), 1452 vect_peeling_hash_get_most_frequent, &res); 1453 } 1454 1455 *npeel = res.peel_info.npeel; 1456 *body_cost_vec = res.body_cost_vec; 1457 return res.peel_info.dr; 1458 } 1459 1460 1461 /* Function vect_enhance_data_refs_alignment 1462 1463 This pass will use loop versioning and loop peeling in order to enhance 1464 the alignment of data references in the loop. 1465 1466 FOR NOW: we assume that whatever versioning/peeling takes place, only the 1467 original loop is to be vectorized. Any other loops that are created by 1468 the transformations performed in this pass - are not supposed to be 1469 vectorized. This restriction will be relaxed. 1470 1471 This pass will require a cost model to guide it whether to apply peeling 1472 or versioning or a combination of the two. For example, the scheme that 1473 intel uses when given a loop with several memory accesses, is as follows: 1474 choose one memory access ('p') which alignment you want to force by doing 1475 peeling. Then, either (1) generate a loop in which 'p' is aligned and all 1476 other accesses are not necessarily aligned, or (2) use loop versioning to 1477 generate one loop in which all accesses are aligned, and another loop in 1478 which only 'p' is necessarily aligned. 1479 1480 ("Automatic Intra-Register Vectorization for the Intel Architecture", 1481 Aart J.C. Bik, Milind Girkar, Paul M. Grey and Ximmin Tian, International 1482 Journal of Parallel Programming, Vol. 30, No. 2, April 2002.) 1483 1484 Devising a cost model is the most critical aspect of this work. It will 1485 guide us on which access to peel for, whether to use loop versioning, how 1486 many versions to create, etc. The cost model will probably consist of 1487 generic considerations as well as target specific considerations (on 1488 powerpc for example, misaligned stores are more painful than misaligned 1489 loads). 
1490 1491 Here are the general steps involved in alignment enhancements: 1492 1493 -- original loop, before alignment analysis: 1494 for (i=0; i<N; i++){ 1495 x = q[i]; # DR_MISALIGNMENT(q) = unknown 1496 p[i] = y; # DR_MISALIGNMENT(p) = unknown 1497 } 1498 1499 -- After vect_compute_data_refs_alignment: 1500 for (i=0; i<N; i++){ 1501 x = q[i]; # DR_MISALIGNMENT(q) = 3 1502 p[i] = y; # DR_MISALIGNMENT(p) = unknown 1503 } 1504 1505 -- Possibility 1: we do loop versioning: 1506 if (p is aligned) { 1507 for (i=0; i<N; i++){ # loop 1A 1508 x = q[i]; # DR_MISALIGNMENT(q) = 3 1509 p[i] = y; # DR_MISALIGNMENT(p) = 0 1510 } 1511 } 1512 else { 1513 for (i=0; i<N; i++){ # loop 1B 1514 x = q[i]; # DR_MISALIGNMENT(q) = 3 1515 p[i] = y; # DR_MISALIGNMENT(p) = unaligned 1516 } 1517 } 1518 1519 -- Possibility 2: we do loop peeling: 1520 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized). 1521 x = q[i]; 1522 p[i] = y; 1523 } 1524 for (i = 3; i < N; i++){ # loop 2A 1525 x = q[i]; # DR_MISALIGNMENT(q) = 0 1526 p[i] = y; # DR_MISALIGNMENT(p) = unknown 1527 } 1528 1529 -- Possibility 3: combination of loop peeling and versioning: 1530 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized). 1531 x = q[i]; 1532 p[i] = y; 1533 } 1534 if (p is aligned) { 1535 for (i = 3; i<N; i++){ # loop 3A 1536 x = q[i]; # DR_MISALIGNMENT(q) = 0 1537 p[i] = y; # DR_MISALIGNMENT(p) = 0 1538 } 1539 } 1540 else { 1541 for (i = 3; i<N; i++){ # loop 3B 1542 x = q[i]; # DR_MISALIGNMENT(q) = 0 1543 p[i] = y; # DR_MISALIGNMENT(p) = unaligned 1544 } 1545 } 1546 1547 These loops are later passed to loop_transform to be vectorized. The 1548 vectorizer will use the alignment information to guide the transformation 1549 (whether to generate regular loads/stores, or with special handling for 1550 misalignment). */ 1551 1552 bool 1553 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo) 1554 { 1555 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo); 1556 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); 1557 enum dr_alignment_support supportable_dr_alignment; 1558 struct data_reference *dr0 = NULL, *first_store = NULL; 1559 struct data_reference *dr; 1560 unsigned int i, j; 1561 bool do_peeling = false; 1562 bool do_versioning = false; 1563 bool stat; 1564 gimple stmt; 1565 stmt_vec_info stmt_info; 1566 int vect_versioning_for_alias_required; 1567 unsigned int npeel = 0; 1568 bool all_misalignments_unknown = true; 1569 unsigned int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo); 1570 unsigned possible_npeel_number = 1; 1571 tree vectype; 1572 unsigned int nelements, mis, same_align_drs_max = 0; 1573 stmt_vector_for_cost body_cost_vec = stmt_vector_for_cost(); 1574 1575 if (dump_enabled_p ()) 1576 dump_printf_loc (MSG_NOTE, vect_location, 1577 "=== vect_enhance_data_refs_alignment ==="); 1578 1579 /* While cost model enhancements are expected in the future, the high level 1580 view of the code at this time is as follows: 1581 1582 A) If there is a misaligned access then see if peeling to align 1583 this access can make all data references satisfy 1584 vect_supportable_dr_alignment. If so, update data structures 1585 as needed and return true. 1586 1587 B) If peeling wasn't possible and there is a data reference with an 1588 unknown misalignment that does not satisfy vect_supportable_dr_alignment 1589 then see if loop versioning checks can be used to make all data 1590 references satisfy vect_supportable_dr_alignment. If so, update 1591 data structures as needed and return true. 
1592 1593 C) If neither peeling nor versioning were successful then return false if 1594 any data reference does not satisfy vect_supportable_dr_alignment. 1595 1596 D) Return true (all data references satisfy vect_supportable_dr_alignment). 1597 1598 Note, Possibility 3 above (which is peeling and versioning together) is not 1599 being done at this time. */ 1600 1601 /* (1) Peeling to force alignment. */ 1602 1603 /* (1.1) Decide whether to perform peeling, and how many iterations to peel: 1604 Considerations: 1605 + How many accesses will become aligned due to the peeling 1606 - How many accesses will become unaligned due to the peeling, 1607 and the cost of misaligned accesses. 1608 - The cost of peeling (the extra runtime checks, the increase 1609 in code size). */ 1610 1611 FOR_EACH_VEC_ELT (datarefs, i, dr) 1612 { 1613 stmt = DR_STMT (dr); 1614 stmt_info = vinfo_for_stmt (stmt); 1615 1616 if (!STMT_VINFO_RELEVANT_P (stmt_info)) 1617 continue; 1618 1619 /* For interleaving, only the alignment of the first access 1620 matters. */ 1621 if (STMT_VINFO_GROUPED_ACCESS (stmt_info) 1622 && GROUP_FIRST_ELEMENT (stmt_info) != stmt) 1623 continue; 1624 1625 /* For invariant accesses there is nothing to enhance. */ 1626 if (integer_zerop (DR_STEP (dr))) 1627 continue; 1628 1629 /* Strided loads perform only component accesses, alignment is 1630 irrelevant for them. */ 1631 if (STMT_VINFO_STRIDE_LOAD_P (stmt_info)) 1632 continue; 1633 1634 supportable_dr_alignment = vect_supportable_dr_alignment (dr, true); 1635 do_peeling = vector_alignment_reachable_p (dr); 1636 if (do_peeling) 1637 { 1638 if (known_alignment_for_access_p (dr)) 1639 { 1640 unsigned int npeel_tmp; 1641 bool negative = tree_int_cst_compare (DR_STEP (dr), 1642 size_zero_node) < 0; 1643 1644 /* Save info about DR in the hash table. */ 1645 if (!LOOP_VINFO_PEELING_HTAB (loop_vinfo)) 1646 LOOP_VINFO_PEELING_HTAB (loop_vinfo) = 1647 htab_create (1, vect_peeling_hash, 1648 vect_peeling_hash_eq, free); 1649 1650 vectype = STMT_VINFO_VECTYPE (stmt_info); 1651 nelements = TYPE_VECTOR_SUBPARTS (vectype); 1652 mis = DR_MISALIGNMENT (dr) / GET_MODE_SIZE (TYPE_MODE ( 1653 TREE_TYPE (DR_REF (dr)))); 1654 npeel_tmp = (negative 1655 ? (mis - nelements) : (nelements - mis)) 1656 & (nelements - 1); 1657 1658 /* For multiple types, it is possible that the bigger type access 1659 will have more than one peeling option. E.g., a loop with two 1660 types: one of size (vector size / 4), and the other one of 1661 size (vector size / 8). Vectorization factor will 8. If both 1662 access are misaligned by 3, the first one needs one scalar 1663 iteration to be aligned, and the second one needs 5. But the 1664 the first one will be aligned also by peeling 5 scalar 1665 iterations, and in that case both accesses will be aligned. 1666 Hence, except for the immediate peeling amount, we also want 1667 to try to add full vector size, while we don't exceed 1668 vectorization factor. 1669 We do this automtically for cost model, since we calculate cost 1670 for every peeling option. */ 1671 if (!flag_vect_cost_model) 1672 possible_npeel_number = vf /nelements; 1673 1674 /* Handle the aligned case. We may decide to align some other 1675 access, making DR unaligned. 
*/ 1676 if (DR_MISALIGNMENT (dr) == 0) 1677 { 1678 npeel_tmp = 0; 1679 if (!flag_vect_cost_model) 1680 possible_npeel_number++; 1681 } 1682 1683 for (j = 0; j < possible_npeel_number; j++) 1684 { 1685 gcc_assert (npeel_tmp <= vf); 1686 vect_peeling_hash_insert (loop_vinfo, dr, npeel_tmp); 1687 npeel_tmp += nelements; 1688 } 1689 1690 all_misalignments_unknown = false; 1691 /* Data-ref that was chosen for the case that all the 1692 misalignments are unknown is not relevant anymore, since we 1693 have a data-ref with known alignment. */ 1694 dr0 = NULL; 1695 } 1696 else 1697 { 1698 /* If we don't know all the misalignment values, we prefer 1699 peeling for data-ref that has maximum number of data-refs 1700 with the same alignment, unless the target prefers to align 1701 stores over load. */ 1702 if (all_misalignments_unknown) 1703 { 1704 if (same_align_drs_max 1705 < STMT_VINFO_SAME_ALIGN_REFS (stmt_info).length () 1706 || !dr0) 1707 { 1708 same_align_drs_max 1709 = STMT_VINFO_SAME_ALIGN_REFS (stmt_info).length (); 1710 dr0 = dr; 1711 } 1712 1713 if (!first_store && DR_IS_WRITE (dr)) 1714 first_store = dr; 1715 } 1716 1717 /* If there are both known and unknown misaligned accesses in the 1718 loop, we choose peeling amount according to the known 1719 accesses. */ 1720 1721 1722 if (!supportable_dr_alignment) 1723 { 1724 dr0 = dr; 1725 if (!first_store && DR_IS_WRITE (dr)) 1726 first_store = dr; 1727 } 1728 } 1729 } 1730 else 1731 { 1732 if (!aligned_access_p (dr)) 1733 { 1734 if (dump_enabled_p ()) 1735 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 1736 "vector alignment may not be reachable"); 1737 break; 1738 } 1739 } 1740 } 1741 1742 vect_versioning_for_alias_required 1743 = LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo); 1744 1745 /* Temporarily, if versioning for alias is required, we disable peeling 1746 until we support peeling and versioning. Often peeling for alignment 1747 will require peeling for loop-bound, which in turn requires that we 1748 know how to adjust the loop ivs after the loop. */ 1749 if (vect_versioning_for_alias_required 1750 || !vect_can_advance_ivs_p (loop_vinfo) 1751 || !slpeel_can_duplicate_loop_p (loop, single_exit (loop))) 1752 do_peeling = false; 1753 1754 if (do_peeling && all_misalignments_unknown 1755 && vect_supportable_dr_alignment (dr0, false)) 1756 { 1757 1758 /* Check if the target requires to prefer stores over loads, i.e., if 1759 misaligned stores are more expensive than misaligned loads (taking 1760 drs with same alignment into account). */ 1761 if (first_store && DR_IS_READ (dr0)) 1762 { 1763 unsigned int load_inside_cost = 0, load_outside_cost = 0; 1764 unsigned int store_inside_cost = 0, store_outside_cost = 0; 1765 unsigned int load_inside_penalty = 0, load_outside_penalty = 0; 1766 unsigned int store_inside_penalty = 0, store_outside_penalty = 0; 1767 stmt_vector_for_cost dummy; 1768 dummy.create (2); 1769 1770 vect_get_data_access_cost (dr0, &load_inside_cost, &load_outside_cost, 1771 &dummy); 1772 vect_get_data_access_cost (first_store, &store_inside_cost, 1773 &store_outside_cost, &dummy); 1774 1775 dummy.release (); 1776 1777 /* Calculate the penalty for leaving FIRST_STORE unaligned (by 1778 aligning the load DR0). 
*/ 1779 load_inside_penalty = store_inside_cost; 1780 load_outside_penalty = store_outside_cost; 1781 for (i = 0; 1782 STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt ( 1783 DR_STMT (first_store))).iterate (i, &dr); 1784 i++) 1785 if (DR_IS_READ (dr)) 1786 { 1787 load_inside_penalty += load_inside_cost; 1788 load_outside_penalty += load_outside_cost; 1789 } 1790 else 1791 { 1792 load_inside_penalty += store_inside_cost; 1793 load_outside_penalty += store_outside_cost; 1794 } 1795 1796 /* Calculate the penalty for leaving DR0 unaligned (by 1797 aligning the FIRST_STORE). */ 1798 store_inside_penalty = load_inside_cost; 1799 store_outside_penalty = load_outside_cost; 1800 for (i = 0; 1801 STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt ( 1802 DR_STMT (dr0))).iterate (i, &dr); 1803 i++) 1804 if (DR_IS_READ (dr)) 1805 { 1806 store_inside_penalty += load_inside_cost; 1807 store_outside_penalty += load_outside_cost; 1808 } 1809 else 1810 { 1811 store_inside_penalty += store_inside_cost; 1812 store_outside_penalty += store_outside_cost; 1813 } 1814 1815 if (load_inside_penalty > store_inside_penalty 1816 || (load_inside_penalty == store_inside_penalty 1817 && load_outside_penalty > store_outside_penalty)) 1818 dr0 = first_store; 1819 } 1820 1821 /* In case there are only loads with different unknown misalignments, use 1822 peeling only if it may help to align other accesses in the loop. */ 1823 if (!first_store 1824 && !STMT_VINFO_SAME_ALIGN_REFS ( 1825 vinfo_for_stmt (DR_STMT (dr0))).length () 1826 && vect_supportable_dr_alignment (dr0, false) 1827 != dr_unaligned_supported) 1828 do_peeling = false; 1829 } 1830 1831 if (do_peeling && !dr0) 1832 { 1833 /* Peeling is possible, but there is no data access that is not supported 1834 unless aligned. So we try to choose the best possible peeling. */ 1835 1836 /* We should get here only if there are drs with known misalignment. */ 1837 gcc_assert (!all_misalignments_unknown); 1838 1839 /* Choose the best peeling from the hash table. */ 1840 dr0 = vect_peeling_hash_choose_best_peeling (loop_vinfo, &npeel, 1841 &body_cost_vec); 1842 if (!dr0 || !npeel) 1843 do_peeling = false; 1844 } 1845 1846 if (do_peeling) 1847 { 1848 stmt = DR_STMT (dr0); 1849 stmt_info = vinfo_for_stmt (stmt); 1850 vectype = STMT_VINFO_VECTYPE (stmt_info); 1851 nelements = TYPE_VECTOR_SUBPARTS (vectype); 1852 1853 if (known_alignment_for_access_p (dr0)) 1854 { 1855 bool negative = tree_int_cst_compare (DR_STEP (dr0), 1856 size_zero_node) < 0; 1857 if (!npeel) 1858 { 1859 /* Since it's known at compile time, compute the number of 1860 iterations in the peeled loop (the peeling factor) for use in 1861 updating DR_MISALIGNMENT values. The peeling factor is the 1862 vectorization factor minus the misalignment as an element 1863 count. */ 1864 mis = DR_MISALIGNMENT (dr0); 1865 mis /= GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr0)))); 1866 npeel = ((negative ? mis - nelements : nelements - mis) 1867 & (nelements - 1)); 1868 } 1869 1870 /* For interleaved data access every iteration accesses all the 1871 members of the group, therefore we divide the number of iterations 1872 by the group size. */ 1873 stmt_info = vinfo_for_stmt (DR_STMT (dr0)); 1874 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)) 1875 npeel /= GROUP_SIZE (stmt_info); 1876 1877 if (dump_enabled_p ()) 1878 dump_printf_loc (MSG_NOTE, vect_location, 1879 "Try peeling by %d", npeel); 1880 } 1881 1882 /* Ensure that all data refs can be vectorized after the peel. 
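         E.g. (illustrative, positive step): with 16-byte vectors, peeling
         NPEEL = 3 iterations shifts a 4-byte data-ref with known misalignment
         8 to (8 + 3 * 4) & 15 = 4; the check below temporarily applies that
         adjustment to each DR, asks whether the resulting alignment is still
         supportable, and then restores the saved misalignment.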
*/ 1883 FOR_EACH_VEC_ELT (datarefs, i, dr) 1884 { 1885 int save_misalignment; 1886 1887 if (dr == dr0) 1888 continue; 1889 1890 stmt = DR_STMT (dr); 1891 stmt_info = vinfo_for_stmt (stmt); 1892 /* For interleaving, only the alignment of the first access 1893 matters. */ 1894 if (STMT_VINFO_GROUPED_ACCESS (stmt_info) 1895 && GROUP_FIRST_ELEMENT (stmt_info) != stmt) 1896 continue; 1897 1898 /* Strided loads perform only component accesses, alignment is 1899 irrelevant for them. */ 1900 if (STMT_VINFO_STRIDE_LOAD_P (stmt_info)) 1901 continue; 1902 1903 save_misalignment = DR_MISALIGNMENT (dr); 1904 vect_update_misalignment_for_peel (dr, dr0, npeel); 1905 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false); 1906 SET_DR_MISALIGNMENT (dr, save_misalignment); 1907 1908 if (!supportable_dr_alignment) 1909 { 1910 do_peeling = false; 1911 break; 1912 } 1913 } 1914 1915 if (do_peeling && known_alignment_for_access_p (dr0) && npeel == 0) 1916 { 1917 stat = vect_verify_datarefs_alignment (loop_vinfo, NULL); 1918 if (!stat) 1919 do_peeling = false; 1920 else 1921 { 1922 body_cost_vec.release (); 1923 return stat; 1924 } 1925 } 1926 1927 if (do_peeling) 1928 { 1929 stmt_info_for_cost *si; 1930 void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo); 1931 1932 /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i. 1933 If the misalignment of DR_i is identical to that of dr0 then set 1934 DR_MISALIGNMENT (DR_i) to zero. If the misalignment of DR_i and 1935 dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i) 1936 by the peeling factor times the element size of DR_i (MOD the 1937 vectorization factor times the size). Otherwise, the 1938 misalignment of DR_i must be set to unknown. */ 1939 FOR_EACH_VEC_ELT (datarefs, i, dr) 1940 if (dr != dr0) 1941 vect_update_misalignment_for_peel (dr, dr0, npeel); 1942 1943 LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0; 1944 if (npeel) 1945 LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel; 1946 else 1947 LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo) = DR_MISALIGNMENT (dr0); 1948 SET_DR_MISALIGNMENT (dr0, 0); 1949 if (dump_enabled_p ()) 1950 { 1951 dump_printf_loc (MSG_NOTE, vect_location, 1952 "Alignment of access forced using peeling."); 1953 dump_printf_loc (MSG_NOTE, vect_location, 1954 "Peeling for alignment will be applied."); 1955 } 1956 /* We've delayed passing the inside-loop peeling costs to the 1957 target cost model until we were sure peeling would happen. 1958 Do so now. */ 1959 if (body_cost_vec.exists ()) 1960 { 1961 FOR_EACH_VEC_ELT (body_cost_vec, i, si) 1962 { 1963 struct _stmt_vec_info *stmt_info 1964 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL; 1965 (void) add_stmt_cost (data, si->count, si->kind, stmt_info, 1966 si->misalign, vect_body); 1967 } 1968 body_cost_vec.release (); 1969 } 1970 1971 stat = vect_verify_datarefs_alignment (loop_vinfo, NULL); 1972 gcc_assert (stat); 1973 return stat; 1974 } 1975 } 1976 1977 body_cost_vec.release (); 1978 1979 /* (2) Versioning to force alignment. */ 1980 1981 /* Try versioning if: 1982 1) flag_tree_vect_loop_version is TRUE 1983 2) optimize loop for speed 1984 3) there is at least one unsupported misaligned data ref with an unknown 1985 misalignment, and 1986 4) all misaligned data refs with a known misalignment are supported, and 1987 5) the number of runtime alignment checks is within reason. 
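     The runtime test generated for the chosen references is roughly
       (((uintptr_t) addr_1 | ... | (uintptr_t) addr_n) & mask) == 0
     which selects the vectorized loop version when it holds and the scalar
     version otherwise; MASK is built from the vector size below (e.g. 15 for
     16-byte vectors).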
*/ 1988 1989 do_versioning = 1990 flag_tree_vect_loop_version 1991 && optimize_loop_nest_for_speed_p (loop) 1992 && (!loop->inner); /* FORNOW */ 1993 1994 if (do_versioning) 1995 { 1996 FOR_EACH_VEC_ELT (datarefs, i, dr) 1997 { 1998 stmt = DR_STMT (dr); 1999 stmt_info = vinfo_for_stmt (stmt); 2000 2001 /* For interleaving, only the alignment of the first access 2002 matters. */ 2003 if (aligned_access_p (dr) 2004 || (STMT_VINFO_GROUPED_ACCESS (stmt_info) 2005 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)) 2006 continue; 2007 2008 /* Strided loads perform only component accesses, alignment is 2009 irrelevant for them. */ 2010 if (STMT_VINFO_STRIDE_LOAD_P (stmt_info)) 2011 continue; 2012 2013 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false); 2014 2015 if (!supportable_dr_alignment) 2016 { 2017 gimple stmt; 2018 int mask; 2019 tree vectype; 2020 2021 if (known_alignment_for_access_p (dr) 2022 || LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length () 2023 >= (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS)) 2024 { 2025 do_versioning = false; 2026 break; 2027 } 2028 2029 stmt = DR_STMT (dr); 2030 vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt)); 2031 gcc_assert (vectype); 2032 2033 /* The rightmost bits of an aligned address must be zeros. 2034 Construct the mask needed for this test. For example, 2035 GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the 2036 mask must be 15 = 0xf. */ 2037 mask = GET_MODE_SIZE (TYPE_MODE (vectype)) - 1; 2038 2039 /* FORNOW: use the same mask to test all potentially unaligned 2040 references in the loop. The vectorizer currently supports 2041 a single vector size, see the reference to 2042 GET_MODE_NUNITS (TYPE_MODE (vectype)) where the 2043 vectorization factor is computed. */ 2044 gcc_assert (!LOOP_VINFO_PTR_MASK (loop_vinfo) 2045 || LOOP_VINFO_PTR_MASK (loop_vinfo) == mask); 2046 LOOP_VINFO_PTR_MASK (loop_vinfo) = mask; 2047 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push ( 2048 DR_STMT (dr)); 2049 } 2050 } 2051 2052 /* Versioning requires at least one misaligned data reference. */ 2053 if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)) 2054 do_versioning = false; 2055 else if (!do_versioning) 2056 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0); 2057 } 2058 2059 if (do_versioning) 2060 { 2061 vec<gimple> may_misalign_stmts 2062 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo); 2063 gimple stmt; 2064 2065 /* It can now be assumed that the data references in the statements 2066 in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version 2067 of the loop being vectorized. */ 2068 FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt) 2069 { 2070 stmt_vec_info stmt_info = vinfo_for_stmt (stmt); 2071 dr = STMT_VINFO_DATA_REF (stmt_info); 2072 SET_DR_MISALIGNMENT (dr, 0); 2073 if (dump_enabled_p ()) 2074 dump_printf_loc (MSG_NOTE, vect_location, 2075 "Alignment of access forced using versioning."); 2076 } 2077 2078 if (dump_enabled_p ()) 2079 dump_printf_loc (MSG_NOTE, vect_location, 2080 "Versioning for alignment will be applied."); 2081 2082 /* Peeling and versioning can't be done together at this time. */ 2083 gcc_assert (! (do_peeling && do_versioning)); 2084 2085 stat = vect_verify_datarefs_alignment (loop_vinfo, NULL); 2086 gcc_assert (stat); 2087 return stat; 2088 } 2089 2090 /* This point is reached if neither peeling nor versioning is being done. */ 2091 gcc_assert (! 
(do_peeling || do_versioning)); 2092 2093 stat = vect_verify_datarefs_alignment (loop_vinfo, NULL); 2094 return stat; 2095 } 2096 2097 2098 /* Function vect_find_same_alignment_drs. 2099 2100 Update group and alignment relations according to the chosen 2101 vectorization factor. */ 2102 2103 static void 2104 vect_find_same_alignment_drs (struct data_dependence_relation *ddr, 2105 loop_vec_info loop_vinfo) 2106 { 2107 unsigned int i; 2108 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); 2109 int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo); 2110 struct data_reference *dra = DDR_A (ddr); 2111 struct data_reference *drb = DDR_B (ddr); 2112 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra)); 2113 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb)); 2114 int dra_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dra)))); 2115 int drb_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (drb)))); 2116 lambda_vector dist_v; 2117 unsigned int loop_depth; 2118 2119 if (DDR_ARE_DEPENDENT (ddr) == chrec_known) 2120 return; 2121 2122 if (dra == drb) 2123 return; 2124 2125 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know) 2126 return; 2127 2128 /* Loop-based vectorization and known data dependence. */ 2129 if (DDR_NUM_DIST_VECTS (ddr) == 0) 2130 return; 2131 2132 /* Data-dependence analysis reports a distance vector of zero 2133 for data-references that overlap only in the first iteration 2134 but have different sign step (see PR45764). 2135 So as a sanity check require equal DR_STEP. */ 2136 if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0)) 2137 return; 2138 2139 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr)); 2140 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v) 2141 { 2142 int dist = dist_v[loop_depth]; 2143 2144 if (dump_enabled_p ()) 2145 dump_printf_loc (MSG_NOTE, vect_location, 2146 "dependence distance = %d.", dist); 2147 2148 /* Same loop iteration. */ 2149 if (dist == 0 2150 || (dist % vectorization_factor == 0 && dra_size == drb_size)) 2151 { 2152 /* Two references with distance zero have the same alignment. */ 2153 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_a).safe_push (drb); 2154 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_b).safe_push (dra); 2155 if (dump_enabled_p ()) 2156 { 2157 dump_printf_loc (MSG_NOTE, vect_location, 2158 "accesses have the same alignment."); 2159 dump_printf (MSG_NOTE, 2160 "dependence distance modulo vf == 0 between "); 2161 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra)); 2162 dump_printf (MSG_NOTE, " and "); 2163 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb)); 2164 } 2165 } 2166 } 2167 } 2168 2169 2170 /* Function vect_analyze_data_refs_alignment 2171 2172 Analyze the alignment of the data-references in the loop. 2173 Return FALSE if a data reference is found that cannot be vectorized. */ 2174 2175 bool 2176 vect_analyze_data_refs_alignment (loop_vec_info loop_vinfo, 2177 bb_vec_info bb_vinfo) 2178 { 2179 if (dump_enabled_p ()) 2180 dump_printf_loc (MSG_NOTE, vect_location, 2181 "=== vect_analyze_data_refs_alignment ==="); 2182 2183 /* Mark groups of data references with same alignment using 2184 data dependence information. 
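   For example (illustrative), with a vectorization factor of 4 and the int
   accesses a[i] and a[i+4] the dependence distance is 4; since 4 % VF == 0
   and both references have the same size, they share the same misalignment,
   so each is recorded in the other's SAME_ALIGN_REFS list.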
*/ 2185 if (loop_vinfo) 2186 { 2187 vec<ddr_p> ddrs = LOOP_VINFO_DDRS (loop_vinfo); 2188 struct data_dependence_relation *ddr; 2189 unsigned int i; 2190 2191 FOR_EACH_VEC_ELT (ddrs, i, ddr) 2192 vect_find_same_alignment_drs (ddr, loop_vinfo); 2193 } 2194 2195 if (!vect_compute_data_refs_alignment (loop_vinfo, bb_vinfo)) 2196 { 2197 if (dump_enabled_p ()) 2198 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 2199 "not vectorized: can't calculate alignment " 2200 "for data ref."); 2201 return false; 2202 } 2203 2204 return true; 2205 } 2206 2207 2208 /* Analyze groups of accesses: check that DR belongs to a group of 2209 accesses of legal size, step, etc. Detect gaps, single element 2210 interleaving, and other special cases. Set grouped access info. 2211 Collect groups of strided stores for further use in SLP analysis. */ 2212 2213 static bool 2214 vect_analyze_group_access (struct data_reference *dr) 2215 { 2216 tree step = DR_STEP (dr); 2217 tree scalar_type = TREE_TYPE (DR_REF (dr)); 2218 HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type)); 2219 gimple stmt = DR_STMT (dr); 2220 stmt_vec_info stmt_info = vinfo_for_stmt (stmt); 2221 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); 2222 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info); 2223 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step); 2224 HOST_WIDE_INT groupsize, last_accessed_element = 1; 2225 bool slp_impossible = false; 2226 struct loop *loop = NULL; 2227 2228 if (loop_vinfo) 2229 loop = LOOP_VINFO_LOOP (loop_vinfo); 2230 2231 /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the 2232 size of the interleaving group (including gaps). */ 2233 groupsize = dr_step / type_size; 2234 2235 /* Not consecutive access is possible only if it is a part of interleaving. */ 2236 if (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt))) 2237 { 2238 /* Check if it this DR is a part of interleaving, and is a single 2239 element of the group that is accessed in the loop. */ 2240 2241 /* Gaps are supported only for loads. STEP must be a multiple of the type 2242 size. The size of the group must be a power of 2. */ 2243 if (DR_IS_READ (dr) 2244 && (dr_step % type_size) == 0 2245 && groupsize > 0 2246 && exact_log2 (groupsize) != -1) 2247 { 2248 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = stmt; 2249 GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize; 2250 if (dump_enabled_p ()) 2251 { 2252 dump_printf_loc (MSG_NOTE, vect_location, 2253 "Detected single element interleaving "); 2254 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr)); 2255 dump_printf (MSG_NOTE, " step "); 2256 dump_generic_expr (MSG_NOTE, TDF_SLIM, step); 2257 } 2258 2259 if (loop_vinfo) 2260 { 2261 if (dump_enabled_p ()) 2262 dump_printf_loc (MSG_NOTE, vect_location, 2263 "Data access with gaps requires scalar " 2264 "epilogue loop"); 2265 if (loop->inner) 2266 { 2267 if (dump_enabled_p ()) 2268 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 2269 "Peeling for outer loop is not" 2270 " supported"); 2271 return false; 2272 } 2273 2274 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = true; 2275 } 2276 2277 return true; 2278 } 2279 2280 if (dump_enabled_p ()) 2281 { 2282 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 2283 "not consecutive access "); 2284 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0); 2285 } 2286 2287 if (bb_vinfo) 2288 { 2289 /* Mark the statement as unvectorizable. 
*/ 2290 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false; 2291 return true; 2292 } 2293 2294 return false; 2295 } 2296 2297 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) == stmt) 2298 { 2299 /* First stmt in the interleaving chain. Check the chain. */ 2300 gimple next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt)); 2301 struct data_reference *data_ref = dr; 2302 unsigned int count = 1; 2303 tree next_step; 2304 tree prev_init = DR_INIT (data_ref); 2305 gimple prev = stmt; 2306 HOST_WIDE_INT diff, count_in_bytes, gaps = 0; 2307 2308 while (next) 2309 { 2310 /* Skip same data-refs. In case that two or more stmts share 2311 data-ref (supported only for loads), we vectorize only the first 2312 stmt, and the rest get their vectorized loads from the first 2313 one. */ 2314 if (!tree_int_cst_compare (DR_INIT (data_ref), 2315 DR_INIT (STMT_VINFO_DATA_REF ( 2316 vinfo_for_stmt (next))))) 2317 { 2318 if (DR_IS_WRITE (data_ref)) 2319 { 2320 if (dump_enabled_p ()) 2321 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 2322 "Two store stmts share the same dr."); 2323 return false; 2324 } 2325 2326 /* Check that there is no load-store dependencies for this loads 2327 to prevent a case of load-store-load to the same location. */ 2328 if (GROUP_READ_WRITE_DEPENDENCE (vinfo_for_stmt (next)) 2329 || GROUP_READ_WRITE_DEPENDENCE (vinfo_for_stmt (prev))) 2330 { 2331 if (dump_enabled_p ()) 2332 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 2333 "READ_WRITE dependence in interleaving."); 2334 return false; 2335 } 2336 2337 /* For load use the same data-ref load. */ 2338 GROUP_SAME_DR_STMT (vinfo_for_stmt (next)) = prev; 2339 2340 prev = next; 2341 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next)); 2342 continue; 2343 } 2344 2345 prev = next; 2346 2347 /* Check that all the accesses have the same STEP. */ 2348 next_step = DR_STEP (STMT_VINFO_DATA_REF (vinfo_for_stmt (next))); 2349 if (tree_int_cst_compare (step, next_step)) 2350 { 2351 if (dump_enabled_p ()) 2352 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 2353 "not consecutive access in interleaving"); 2354 return false; 2355 } 2356 2357 data_ref = STMT_VINFO_DATA_REF (vinfo_for_stmt (next)); 2358 /* Check that the distance between two accesses is equal to the type 2359 size. Otherwise, we have gaps. */ 2360 diff = (TREE_INT_CST_LOW (DR_INIT (data_ref)) 2361 - TREE_INT_CST_LOW (prev_init)) / type_size; 2362 if (diff != 1) 2363 { 2364 /* FORNOW: SLP of accesses with gaps is not supported. */ 2365 slp_impossible = true; 2366 if (DR_IS_WRITE (data_ref)) 2367 { 2368 if (dump_enabled_p ()) 2369 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 2370 "interleaved store with gaps"); 2371 return false; 2372 } 2373 2374 gaps += diff - 1; 2375 } 2376 2377 last_accessed_element += diff; 2378 2379 /* Store the gap from the previous member of the group. If there is no 2380 gap in the access, GROUP_GAP is always 1. */ 2381 GROUP_GAP (vinfo_for_stmt (next)) = diff; 2382 2383 prev_init = DR_INIT (data_ref); 2384 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next)); 2385 /* Count the number of data-refs in the chain. */ 2386 count++; 2387 } 2388 2389 /* COUNT is the number of accesses found, we multiply it by the size of 2390 the type to get COUNT_IN_BYTES. */ 2391 count_in_bytes = type_size * count; 2392 2393 /* Check that the size of the interleaving (including gaps) is not 2394 greater than STEP. 
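         E.g. (illustrative), the int accesses a[3*i] and a[3*i+2] give
         STEP = 12, COUNT_IN_BYTES = 8 and one element of gap, so
         8 + 4 <= 12 and the group is accepted; accesses a[2*i] and a[2*i+3]
         would give STEP = 8 but 8 + 2 * 4 > 8, so the group is rejected here.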
*/ 2395 if (dr_step && dr_step < count_in_bytes + gaps * type_size) 2396 { 2397 if (dump_enabled_p ()) 2398 { 2399 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 2400 "interleaving size is greater than step for "); 2401 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dr)); 2402 } 2403 return false; 2404 } 2405 2406 /* Check that the size of the interleaving is equal to STEP for stores, 2407 i.e., that there are no gaps. */ 2408 if (dr_step && dr_step != count_in_bytes) 2409 { 2410 if (DR_IS_READ (dr)) 2411 { 2412 slp_impossible = true; 2413 /* There is a gap after the last load in the group. This gap is a 2414 difference between the groupsize and the number of elements. 2415 When there is no gap, this difference should be 0. */ 2416 GROUP_GAP (vinfo_for_stmt (stmt)) = groupsize - count; 2417 } 2418 else 2419 { 2420 if (dump_enabled_p ()) 2421 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 2422 "interleaved store with gaps"); 2423 return false; 2424 } 2425 } 2426 2427 /* Check that STEP is a multiple of type size. */ 2428 if (dr_step && (dr_step % type_size) != 0) 2429 { 2430 if (dump_enabled_p ()) 2431 { 2432 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 2433 "step is not a multiple of type size: step "); 2434 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, step); 2435 dump_printf (MSG_MISSED_OPTIMIZATION, " size "); 2436 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, 2437 TYPE_SIZE_UNIT (scalar_type)); 2438 } 2439 return false; 2440 } 2441 2442 if (groupsize == 0) 2443 groupsize = count; 2444 2445 GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize; 2446 if (dump_enabled_p ()) 2447 dump_printf_loc (MSG_NOTE, vect_location, 2448 "Detected interleaving of size %d", (int)groupsize); 2449 2450 /* SLP: create an SLP data structure for every interleaving group of 2451 stores for further analysis in vect_analyse_slp. */ 2452 if (DR_IS_WRITE (dr) && !slp_impossible) 2453 { 2454 if (loop_vinfo) 2455 LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt); 2456 if (bb_vinfo) 2457 BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt); 2458 } 2459 2460 /* There is a gap in the end of the group. */ 2461 if (groupsize - last_accessed_element > 0 && loop_vinfo) 2462 { 2463 if (dump_enabled_p ()) 2464 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 2465 "Data access with gaps requires scalar " 2466 "epilogue loop"); 2467 if (loop->inner) 2468 { 2469 if (dump_enabled_p ()) 2470 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 2471 "Peeling for outer loop is not supported"); 2472 return false; 2473 } 2474 2475 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = true; 2476 } 2477 } 2478 2479 return true; 2480 } 2481 2482 2483 /* Analyze the access pattern of the data-reference DR. 2484 In case of non-consecutive accesses call vect_analyze_group_access() to 2485 analyze groups of accesses. 
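   For example, b[i] in
     for (i = 0; i < n; i++)
       ... = b[i];
   is the simple consecutive case (STEP equals the element size), while the
   pair a[2*i], a[2*i+1] is not consecutive and is instead analyzed as an
   interleaving group of size 2.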
*/ 2486 2487 static bool 2488 vect_analyze_data_ref_access (struct data_reference *dr) 2489 { 2490 tree step = DR_STEP (dr); 2491 tree scalar_type = TREE_TYPE (DR_REF (dr)); 2492 gimple stmt = DR_STMT (dr); 2493 stmt_vec_info stmt_info = vinfo_for_stmt (stmt); 2494 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); 2495 struct loop *loop = NULL; 2496 2497 if (loop_vinfo) 2498 loop = LOOP_VINFO_LOOP (loop_vinfo); 2499 2500 if (loop_vinfo && !step) 2501 { 2502 if (dump_enabled_p ()) 2503 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 2504 "bad data-ref access in loop"); 2505 return false; 2506 } 2507 2508 /* Allow invariant loads in not nested loops. */ 2509 if (loop_vinfo && integer_zerop (step)) 2510 { 2511 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL; 2512 if (nested_in_vect_loop_p (loop, stmt)) 2513 { 2514 if (dump_enabled_p ()) 2515 dump_printf_loc (MSG_NOTE, vect_location, 2516 "zero step in inner loop of nest"); 2517 return false; 2518 } 2519 return DR_IS_READ (dr); 2520 } 2521 2522 if (loop && nested_in_vect_loop_p (loop, stmt)) 2523 { 2524 /* Interleaved accesses are not yet supported within outer-loop 2525 vectorization for references in the inner-loop. */ 2526 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL; 2527 2528 /* For the rest of the analysis we use the outer-loop step. */ 2529 step = STMT_VINFO_DR_STEP (stmt_info); 2530 if (integer_zerop (step)) 2531 { 2532 if (dump_enabled_p ()) 2533 dump_printf_loc (MSG_NOTE, vect_location, 2534 "zero step in outer loop."); 2535 if (DR_IS_READ (dr)) 2536 return true; 2537 else 2538 return false; 2539 } 2540 } 2541 2542 /* Consecutive? */ 2543 if (TREE_CODE (step) == INTEGER_CST) 2544 { 2545 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step); 2546 if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type)) 2547 || (dr_step < 0 2548 && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step))) 2549 { 2550 /* Mark that it is not interleaving. */ 2551 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL; 2552 return true; 2553 } 2554 } 2555 2556 if (loop && nested_in_vect_loop_p (loop, stmt)) 2557 { 2558 if (dump_enabled_p ()) 2559 dump_printf_loc (MSG_NOTE, vect_location, 2560 "grouped access in outer loop."); 2561 return false; 2562 } 2563 2564 /* Assume this is a DR handled by non-constant strided load case. */ 2565 if (TREE_CODE (step) != INTEGER_CST) 2566 return STMT_VINFO_STRIDE_LOAD_P (stmt_info); 2567 2568 /* Not consecutive access - check if it's a part of interleaving group. */ 2569 return vect_analyze_group_access (dr); 2570 } 2571 2572 2573 /* Function vect_analyze_data_ref_accesses. 2574 2575 Analyze the access pattern of all the data references in the loop. 2576 2577 FORNOW: the only access pattern that is considered vectorizable is a 2578 simple step 1 (consecutive) access. 2579 2580 FORNOW: handle only arrays and pointer accesses. 
*/ 2581 2582 bool 2583 vect_analyze_data_ref_accesses (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo) 2584 { 2585 unsigned int i; 2586 vec<data_reference_p> datarefs; 2587 struct data_reference *dr; 2588 2589 if (dump_enabled_p ()) 2590 dump_printf_loc (MSG_NOTE, vect_location, 2591 "=== vect_analyze_data_ref_accesses ==="); 2592 2593 if (loop_vinfo) 2594 datarefs = LOOP_VINFO_DATAREFS (loop_vinfo); 2595 else 2596 datarefs = BB_VINFO_DATAREFS (bb_vinfo); 2597 2598 FOR_EACH_VEC_ELT (datarefs, i, dr) 2599 if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) 2600 && !vect_analyze_data_ref_access (dr)) 2601 { 2602 if (dump_enabled_p ()) 2603 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 2604 "not vectorized: complicated access pattern."); 2605 2606 if (bb_vinfo) 2607 { 2608 /* Mark the statement as not vectorizable. */ 2609 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false; 2610 continue; 2611 } 2612 else 2613 return false; 2614 } 2615 2616 return true; 2617 } 2618 2619 /* Function vect_prune_runtime_alias_test_list. 2620 2621 Prune a list of ddrs to be tested at run-time by versioning for alias. 2622 Return FALSE if resulting list of ddrs is longer then allowed by 2623 PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE. */ 2624 2625 bool 2626 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo) 2627 { 2628 vec<ddr_p> ddrs = 2629 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo); 2630 unsigned i, j; 2631 2632 if (dump_enabled_p ()) 2633 dump_printf_loc (MSG_NOTE, vect_location, 2634 "=== vect_prune_runtime_alias_test_list ==="); 2635 2636 for (i = 0; i < ddrs.length (); ) 2637 { 2638 bool found; 2639 ddr_p ddr_i; 2640 2641 ddr_i = ddrs[i]; 2642 found = false; 2643 2644 for (j = 0; j < i; j++) 2645 { 2646 ddr_p ddr_j = ddrs[j]; 2647 2648 if (vect_vfa_range_equal (ddr_i, ddr_j)) 2649 { 2650 if (dump_enabled_p ()) 2651 { 2652 dump_printf_loc (MSG_NOTE, vect_location, 2653 "found equal ranges "); 2654 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_A (ddr_i))); 2655 dump_printf (MSG_NOTE, ", "); 2656 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_B (ddr_i))); 2657 dump_printf (MSG_NOTE, " and "); 2658 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_A (ddr_j))); 2659 dump_printf (MSG_NOTE, ", "); 2660 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_B (ddr_j))); 2661 } 2662 found = true; 2663 break; 2664 } 2665 } 2666 2667 if (found) 2668 { 2669 ddrs.ordered_remove (i); 2670 continue; 2671 } 2672 i++; 2673 } 2674 2675 if (ddrs.length () > 2676 (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS)) 2677 { 2678 if (dump_enabled_p ()) 2679 { 2680 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 2681 "disable versioning for alias - max number of " 2682 "generated checks exceeded."); 2683 } 2684 2685 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).truncate (0); 2686 2687 return false; 2688 } 2689 2690 return true; 2691 } 2692 2693 /* Check whether a non-affine read in stmt is suitable for gather load 2694 and if so, return a builtin decl for that operation. 
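   For example (illustrative):
     for (i = 0; i < n; i++)
       sum += data[idx[i]];
   the non-affine read data[idx[i]] has address &data + (sizetype) idx[i] * 4
   for 4-byte elements and is a gather candidate with BASE = &data,
   OFF = idx[i] and SCALE = 4, provided the target implements a matching
   gather builtin.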
*/

tree
vect_check_gather (gimple stmt, loop_vec_info loop_vinfo, tree *basep,
		   tree *offp, int *scalep)
{
  HOST_WIDE_INT scale = 1, pbitpos, pbitsize;
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
  tree offtype = NULL_TREE;
  tree decl, base, off;
  enum machine_mode pmode;
  int punsignedp, pvolatilep;

  /* The gather builtins need an address of the form
     loop_invariant + vector * {1, 2, 4, 8}
     or
     loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
     Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
     of loop invariants/SSA_NAMEs defined in the loop, with casts,
     multiplications and additions in it.  To get a vector, we need
     a single SSA_NAME that will be defined in the loop and will
     contain everything that is not loop invariant and that can be
     vectorized.  The following code attempts to find such a preexisting
     SSA_NAME OFF and put the loop invariants into a tree BASE
     that can be gimplified before the loop.  */
  base = get_inner_reference (DR_REF (dr), &pbitsize, &pbitpos, &off,
			      &pmode, &punsignedp, &pvolatilep, false);
  gcc_assert (base != NULL_TREE && (pbitpos % BITS_PER_UNIT) == 0);

  if (TREE_CODE (base) == MEM_REF)
    {
      if (!integer_zerop (TREE_OPERAND (base, 1)))
	{
	  if (off == NULL_TREE)
	    {
	      double_int moff = mem_ref_offset (base);
	      off = double_int_to_tree (sizetype, moff);
	    }
	  else
	    off = size_binop (PLUS_EXPR, off,
			      fold_convert (sizetype, TREE_OPERAND (base, 1)));
	}
      base = TREE_OPERAND (base, 0);
    }
  else
    base = build_fold_addr_expr (base);

  if (off == NULL_TREE)
    off = size_zero_node;

  /* If base is not loop invariant, then if off is 0 we start with just the
     constant offset in the loop invariant BASE and continue with base as OFF;
     otherwise give up.
     We could handle that case by gimplifying the addition of base + off
     into some SSA_NAME and using that as off, but for now punt.  */
  if (!expr_invariant_in_loop_p (loop, base))
    {
      if (!integer_zerop (off))
	return NULL_TREE;
      off = base;
      base = size_int (pbitpos / BITS_PER_UNIT);
    }
  /* Otherwise put base + constant offset into the loop invariant BASE
     and continue with OFF.  */
  else
    {
      base = fold_convert (sizetype, base);
      base = size_binop (PLUS_EXPR, base, size_int (pbitpos / BITS_PER_UNIT));
    }

  /* OFF at this point may be either a SSA_NAME or some tree expression
     from get_inner_reference.  Try to peel off loop invariants from it
     into BASE as long as possible.
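     For example (illustrative), if OFF is (p + i) * 4 with p loop invariant,
     the MULT_EXPR case records SCALE = 4 and continues with p + i, and the
     PLUS_EXPR case then folds p * 4 into BASE and leaves OFF = i.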
*/ 2769 STRIP_NOPS (off); 2770 while (offtype == NULL_TREE) 2771 { 2772 enum tree_code code; 2773 tree op0, op1, add = NULL_TREE; 2774 2775 if (TREE_CODE (off) == SSA_NAME) 2776 { 2777 gimple def_stmt = SSA_NAME_DEF_STMT (off); 2778 2779 if (expr_invariant_in_loop_p (loop, off)) 2780 return NULL_TREE; 2781 2782 if (gimple_code (def_stmt) != GIMPLE_ASSIGN) 2783 break; 2784 2785 op0 = gimple_assign_rhs1 (def_stmt); 2786 code = gimple_assign_rhs_code (def_stmt); 2787 op1 = gimple_assign_rhs2 (def_stmt); 2788 } 2789 else 2790 { 2791 if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS) 2792 return NULL_TREE; 2793 code = TREE_CODE (off); 2794 extract_ops_from_tree (off, &code, &op0, &op1); 2795 } 2796 switch (code) 2797 { 2798 case POINTER_PLUS_EXPR: 2799 case PLUS_EXPR: 2800 if (expr_invariant_in_loop_p (loop, op0)) 2801 { 2802 add = op0; 2803 off = op1; 2804 do_add: 2805 add = fold_convert (sizetype, add); 2806 if (scale != 1) 2807 add = size_binop (MULT_EXPR, add, size_int (scale)); 2808 base = size_binop (PLUS_EXPR, base, add); 2809 continue; 2810 } 2811 if (expr_invariant_in_loop_p (loop, op1)) 2812 { 2813 add = op1; 2814 off = op0; 2815 goto do_add; 2816 } 2817 break; 2818 case MINUS_EXPR: 2819 if (expr_invariant_in_loop_p (loop, op1)) 2820 { 2821 add = fold_convert (sizetype, op1); 2822 add = size_binop (MINUS_EXPR, size_zero_node, add); 2823 off = op0; 2824 goto do_add; 2825 } 2826 break; 2827 case MULT_EXPR: 2828 if (scale == 1 && host_integerp (op1, 0)) 2829 { 2830 scale = tree_low_cst (op1, 0); 2831 off = op0; 2832 continue; 2833 } 2834 break; 2835 case SSA_NAME: 2836 off = op0; 2837 continue; 2838 CASE_CONVERT: 2839 if (!POINTER_TYPE_P (TREE_TYPE (op0)) 2840 && !INTEGRAL_TYPE_P (TREE_TYPE (op0))) 2841 break; 2842 if (TYPE_PRECISION (TREE_TYPE (op0)) 2843 == TYPE_PRECISION (TREE_TYPE (off))) 2844 { 2845 off = op0; 2846 continue; 2847 } 2848 if (TYPE_PRECISION (TREE_TYPE (op0)) 2849 < TYPE_PRECISION (TREE_TYPE (off))) 2850 { 2851 off = op0; 2852 offtype = TREE_TYPE (off); 2853 STRIP_NOPS (off); 2854 continue; 2855 } 2856 break; 2857 default: 2858 break; 2859 } 2860 break; 2861 } 2862 2863 /* If at the end OFF still isn't a SSA_NAME or isn't 2864 defined in the loop, punt. */ 2865 if (TREE_CODE (off) != SSA_NAME 2866 || expr_invariant_in_loop_p (loop, off)) 2867 return NULL_TREE; 2868 2869 if (offtype == NULL_TREE) 2870 offtype = TREE_TYPE (off); 2871 2872 decl = targetm.vectorize.builtin_gather (STMT_VINFO_VECTYPE (stmt_info), 2873 offtype, scale); 2874 if (decl == NULL_TREE) 2875 return NULL_TREE; 2876 2877 if (basep) 2878 *basep = base; 2879 if (offp) 2880 *offp = off; 2881 if (scalep) 2882 *scalep = scale; 2883 return decl; 2884 } 2885 2886 /* Check wether a non-affine load in STMT (being in the loop referred to 2887 in LOOP_VINFO) is suitable for handling as strided load. That is the case 2888 if its address is a simple induction variable. If so return the base 2889 of that induction variable in *BASEP and the (loop-invariant) step 2890 in *STEPP, both only when that pointer is non-zero. 2891 2892 This handles ARRAY_REFs (with variant index) and MEM_REFs (with variant 2893 base pointer) only. 
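   For example (illustrative):
     for (i = 0; i < n; i++)
       {
         ... = a[k];
         k += stride;
       }
   where STRIDE is loop invariant: the index k is a simple induction variable,
   so the load can be handled as a strided load even though DR_STEP is not a
   compile-time constant.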
*/ 2894 2895 static bool 2896 vect_check_strided_load (gimple stmt, loop_vec_info loop_vinfo) 2897 { 2898 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); 2899 stmt_vec_info stmt_info = vinfo_for_stmt (stmt); 2900 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info); 2901 tree base, off; 2902 affine_iv iv; 2903 2904 if (!DR_IS_READ (dr)) 2905 return false; 2906 2907 base = DR_REF (dr); 2908 2909 if (TREE_CODE (base) == ARRAY_REF) 2910 { 2911 off = TREE_OPERAND (base, 1); 2912 base = TREE_OPERAND (base, 0); 2913 } 2914 else if (TREE_CODE (base) == MEM_REF) 2915 { 2916 off = TREE_OPERAND (base, 0); 2917 base = TREE_OPERAND (base, 1); 2918 } 2919 else 2920 return false; 2921 2922 if (TREE_CODE (off) != SSA_NAME) 2923 return false; 2924 2925 if (!expr_invariant_in_loop_p (loop, base) 2926 || !simple_iv (loop, loop_containing_stmt (stmt), off, &iv, true)) 2927 return false; 2928 2929 return true; 2930 } 2931 2932 /* Function vect_analyze_data_refs. 2933 2934 Find all the data references in the loop or basic block. 2935 2936 The general structure of the analysis of data refs in the vectorizer is as 2937 follows: 2938 1- vect_analyze_data_refs(loop/bb): call 2939 compute_data_dependences_for_loop/bb to find and analyze all data-refs 2940 in the loop/bb and their dependences. 2941 2- vect_analyze_dependences(): apply dependence testing using ddrs. 2942 3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok. 2943 4- vect_analyze_drs_access(): check that ref_stmt.step is ok. 2944 2945 */ 2946 2947 bool 2948 vect_analyze_data_refs (loop_vec_info loop_vinfo, 2949 bb_vec_info bb_vinfo, 2950 int *min_vf) 2951 { 2952 struct loop *loop = NULL; 2953 basic_block bb = NULL; 2954 unsigned int i; 2955 vec<data_reference_p> datarefs; 2956 struct data_reference *dr; 2957 tree scalar_type; 2958 bool res, stop_bb_analysis = false; 2959 2960 if (dump_enabled_p ()) 2961 dump_printf_loc (MSG_NOTE, vect_location, 2962 "=== vect_analyze_data_refs ===\n"); 2963 2964 if (loop_vinfo) 2965 { 2966 loop = LOOP_VINFO_LOOP (loop_vinfo); 2967 res = compute_data_dependences_for_loop 2968 (loop, true, 2969 &LOOP_VINFO_LOOP_NEST (loop_vinfo), 2970 &LOOP_VINFO_DATAREFS (loop_vinfo), 2971 &LOOP_VINFO_DDRS (loop_vinfo)); 2972 2973 if (!res) 2974 { 2975 if (dump_enabled_p ()) 2976 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 2977 "not vectorized: loop contains function calls" 2978 " or data references that cannot be analyzed"); 2979 return false; 2980 } 2981 2982 datarefs = LOOP_VINFO_DATAREFS (loop_vinfo); 2983 } 2984 else 2985 { 2986 gimple_stmt_iterator gsi; 2987 2988 bb = BB_VINFO_BB (bb_vinfo); 2989 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi)) 2990 { 2991 gimple stmt = gsi_stmt (gsi); 2992 if (!find_data_references_in_stmt (NULL, stmt, 2993 &BB_VINFO_DATAREFS (bb_vinfo))) 2994 { 2995 /* Mark the rest of the basic-block as unvectorizable. 
*/ 2996 for (; !gsi_end_p (gsi); gsi_next (&gsi)) 2997 { 2998 stmt = gsi_stmt (gsi); 2999 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (stmt)) = false; 3000 } 3001 break; 3002 } 3003 } 3004 if (!compute_all_dependences (BB_VINFO_DATAREFS (bb_vinfo), 3005 &BB_VINFO_DDRS (bb_vinfo), 3006 vNULL, true)) 3007 { 3008 if (dump_enabled_p ()) 3009 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 3010 "not vectorized: basic block contains function" 3011 " calls or data references that cannot be" 3012 " analyzed"); 3013 return false; 3014 } 3015 3016 datarefs = BB_VINFO_DATAREFS (bb_vinfo); 3017 } 3018 3019 /* Go through the data-refs, check that the analysis succeeded. Update 3020 pointer from stmt_vec_info struct to DR and vectype. */ 3021 3022 FOR_EACH_VEC_ELT (datarefs, i, dr) 3023 { 3024 gimple stmt; 3025 stmt_vec_info stmt_info; 3026 tree base, offset, init; 3027 bool gather = false; 3028 int vf; 3029 3030 if (!dr || !DR_REF (dr)) 3031 { 3032 if (dump_enabled_p ()) 3033 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 3034 "not vectorized: unhandled data-ref "); 3035 return false; 3036 } 3037 3038 stmt = DR_STMT (dr); 3039 stmt_info = vinfo_for_stmt (stmt); 3040 3041 if (stop_bb_analysis) 3042 { 3043 STMT_VINFO_VECTORIZABLE (stmt_info) = false; 3044 continue; 3045 } 3046 3047 /* Check that analysis of the data-ref succeeded. */ 3048 if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr) 3049 || !DR_STEP (dr)) 3050 { 3051 /* If target supports vector gather loads, see if they can't 3052 be used. */ 3053 if (loop_vinfo 3054 && DR_IS_READ (dr) 3055 && !TREE_THIS_VOLATILE (DR_REF (dr)) 3056 && targetm.vectorize.builtin_gather != NULL 3057 && !nested_in_vect_loop_p (loop, stmt)) 3058 { 3059 struct data_reference *newdr 3060 = create_data_ref (NULL, loop_containing_stmt (stmt), 3061 DR_REF (dr), stmt, true); 3062 gcc_assert (newdr != NULL && DR_REF (newdr)); 3063 if (DR_BASE_ADDRESS (newdr) 3064 && DR_OFFSET (newdr) 3065 && DR_INIT (newdr) 3066 && DR_STEP (newdr) 3067 && integer_zerop (DR_STEP (newdr))) 3068 { 3069 dr = newdr; 3070 gather = true; 3071 } 3072 else 3073 free_data_ref (newdr); 3074 } 3075 3076 if (!gather) 3077 { 3078 if (dump_enabled_p ()) 3079 { 3080 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 3081 "not vectorized: data ref analysis " 3082 "failed "); 3083 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0); 3084 } 3085 3086 if (bb_vinfo) 3087 { 3088 STMT_VINFO_VECTORIZABLE (stmt_info) = false; 3089 stop_bb_analysis = true; 3090 continue; 3091 } 3092 3093 return false; 3094 } 3095 } 3096 3097 if (TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST) 3098 { 3099 if (dump_enabled_p ()) 3100 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 3101 "not vectorized: base addr of dr is a " 3102 "constant"); 3103 3104 if (bb_vinfo) 3105 { 3106 STMT_VINFO_VECTORIZABLE (stmt_info) = false; 3107 stop_bb_analysis = true; 3108 continue; 3109 } 3110 3111 if (gather) 3112 free_data_ref (dr); 3113 return false; 3114 } 3115 3116 if (TREE_THIS_VOLATILE (DR_REF (dr))) 3117 { 3118 if (dump_enabled_p ()) 3119 { 3120 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 3121 "not vectorized: volatile type "); 3122 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0); 3123 } 3124 3125 if (bb_vinfo) 3126 { 3127 STMT_VINFO_VECTORIZABLE (stmt_info) = false; 3128 stop_bb_analysis = true; 3129 continue; 3130 } 3131 3132 return false; 3133 } 3134 3135 if (stmt_can_throw_internal (stmt)) 3136 { 3137 if (dump_enabled_p ()) 3138 { 3139 dump_printf_loc 
(MSG_MISSED_OPTIMIZATION, vect_location, 3140 "not vectorized: statement can throw an " 3141 "exception "); 3142 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0); 3143 } 3144 3145 if (bb_vinfo) 3146 { 3147 STMT_VINFO_VECTORIZABLE (stmt_info) = false; 3148 stop_bb_analysis = true; 3149 continue; 3150 } 3151 3152 if (gather) 3153 free_data_ref (dr); 3154 return false; 3155 } 3156 3157 if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF 3158 && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1))) 3159 { 3160 if (dump_enabled_p ()) 3161 { 3162 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 3163 "not vectorized: statement is bitfield " 3164 "access "); 3165 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0); 3166 } 3167 3168 if (bb_vinfo) 3169 { 3170 STMT_VINFO_VECTORIZABLE (stmt_info) = false; 3171 stop_bb_analysis = true; 3172 continue; 3173 } 3174 3175 if (gather) 3176 free_data_ref (dr); 3177 return false; 3178 } 3179 3180 base = unshare_expr (DR_BASE_ADDRESS (dr)); 3181 offset = unshare_expr (DR_OFFSET (dr)); 3182 init = unshare_expr (DR_INIT (dr)); 3183 3184 if (is_gimple_call (stmt)) 3185 { 3186 if (dump_enabled_p ()) 3187 { 3188 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 3189 "not vectorized: dr in a call "); 3190 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0); 3191 } 3192 3193 if (bb_vinfo) 3194 { 3195 STMT_VINFO_VECTORIZABLE (stmt_info) = false; 3196 stop_bb_analysis = true; 3197 continue; 3198 } 3199 3200 if (gather) 3201 free_data_ref (dr); 3202 return false; 3203 } 3204 3205 /* Update DR field in stmt_vec_info struct. */ 3206 3207 /* If the dataref is in an inner-loop of the loop that is considered for 3208 for vectorization, we also want to analyze the access relative to 3209 the outer-loop (DR contains information only relative to the 3210 inner-most enclosing loop). We do that by building a reference to the 3211 first location accessed by the inner-loop, and analyze it relative to 3212 the outer-loop. */ 3213 if (loop && nested_in_vect_loop_p (loop, stmt)) 3214 { 3215 tree outer_step, outer_base, outer_init; 3216 HOST_WIDE_INT pbitsize, pbitpos; 3217 tree poffset; 3218 enum machine_mode pmode; 3219 int punsignedp, pvolatilep; 3220 affine_iv base_iv, offset_iv; 3221 tree dinit; 3222 3223 /* Build a reference to the first location accessed by the 3224 inner-loop: *(BASE+INIT). (The first location is actually 3225 BASE+INIT+OFFSET, but we add OFFSET separately later). 
*/ 3226 tree inner_base = build_fold_indirect_ref 3227 (fold_build_pointer_plus (base, init)); 3228 3229 if (dump_enabled_p ()) 3230 { 3231 dump_printf_loc (MSG_NOTE, vect_location, 3232 "analyze in outer-loop: "); 3233 dump_generic_expr (MSG_NOTE, TDF_SLIM, inner_base); 3234 } 3235 3236 outer_base = get_inner_reference (inner_base, &pbitsize, &pbitpos, 3237 &poffset, &pmode, &punsignedp, &pvolatilep, false); 3238 gcc_assert (outer_base != NULL_TREE); 3239 3240 if (pbitpos % BITS_PER_UNIT != 0) 3241 { 3242 if (dump_enabled_p ()) 3243 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 3244 "failed: bit offset alignment.\n"); 3245 return false; 3246 } 3247 3248 outer_base = build_fold_addr_expr (outer_base); 3249 if (!simple_iv (loop, loop_containing_stmt (stmt), outer_base, 3250 &base_iv, false)) 3251 { 3252 if (dump_enabled_p ()) 3253 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 3254 "failed: evolution of base is not affine.\n"); 3255 return false; 3256 } 3257 3258 if (offset) 3259 { 3260 if (poffset) 3261 poffset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset), offset, 3262 poffset); 3263 else 3264 poffset = offset; 3265 } 3266 3267 if (!poffset) 3268 { 3269 offset_iv.base = ssize_int (0); 3270 offset_iv.step = ssize_int (0); 3271 } 3272 else if (!simple_iv (loop, loop_containing_stmt (stmt), poffset, 3273 &offset_iv, false)) 3274 { 3275 if (dump_enabled_p ()) 3276 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 3277 "evolution of offset is not affine.\n"); 3278 return false; 3279 } 3280 3281 outer_init = ssize_int (pbitpos / BITS_PER_UNIT); 3282 split_constant_offset (base_iv.base, &base_iv.base, &dinit); 3283 outer_init = size_binop (PLUS_EXPR, outer_init, dinit); 3284 split_constant_offset (offset_iv.base, &offset_iv.base, &dinit); 3285 outer_init = size_binop (PLUS_EXPR, outer_init, dinit); 3286 3287 outer_step = size_binop (PLUS_EXPR, 3288 fold_convert (ssizetype, base_iv.step), 3289 fold_convert (ssizetype, offset_iv.step)); 3290 3291 STMT_VINFO_DR_STEP (stmt_info) = outer_step; 3292 /* FIXME: Use canonicalize_base_object_address (base_iv.base); */ 3293 STMT_VINFO_DR_BASE_ADDRESS (stmt_info) = base_iv.base; 3294 STMT_VINFO_DR_INIT (stmt_info) = outer_init; 3295 STMT_VINFO_DR_OFFSET (stmt_info) = 3296 fold_convert (ssizetype, offset_iv.base); 3297 STMT_VINFO_DR_ALIGNED_TO (stmt_info) = 3298 size_int (highest_pow2_factor (offset_iv.base)); 3299 3300 if (dump_enabled_p ()) 3301 { 3302 dump_printf_loc (MSG_NOTE, vect_location, 3303 "\touter base_address: "); 3304 dump_generic_expr (MSG_NOTE, TDF_SLIM, 3305 STMT_VINFO_DR_BASE_ADDRESS (stmt_info)); 3306 dump_printf (MSG_NOTE, "\n\touter offset from base address: "); 3307 dump_generic_expr (MSG_NOTE, TDF_SLIM, 3308 STMT_VINFO_DR_OFFSET (stmt_info)); 3309 dump_printf (MSG_NOTE, 3310 "\n\touter constant offset from base address: "); 3311 dump_generic_expr (MSG_NOTE, TDF_SLIM, 3312 STMT_VINFO_DR_INIT (stmt_info)); 3313 dump_printf (MSG_NOTE, "\n\touter step: "); 3314 dump_generic_expr (MSG_NOTE, TDF_SLIM, 3315 STMT_VINFO_DR_STEP (stmt_info)); 3316 dump_printf (MSG_NOTE, "\n\touter aligned to: "); 3317 dump_generic_expr (MSG_NOTE, TDF_SLIM, 3318 STMT_VINFO_DR_ALIGNED_TO (stmt_info)); 3319 } 3320 } 3321 3322 if (STMT_VINFO_DATA_REF (stmt_info)) 3323 { 3324 if (dump_enabled_p ()) 3325 { 3326 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 3327 "not vectorized: more than one data ref " 3328 "in stmt: "); 3329 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0); 3330 } 3331 3332 if (bb_vinfo) 3333 { 3334 
STMT_VINFO_VECTORIZABLE (stmt_info) = false; 3335 stop_bb_analysis = true; 3336 continue; 3337 } 3338 3339 if (gather) 3340 free_data_ref (dr); 3341 return false; 3342 } 3343 3344 STMT_VINFO_DATA_REF (stmt_info) = dr; 3345 3346 /* Set vectype for STMT. */ 3347 scalar_type = TREE_TYPE (DR_REF (dr)); 3348 STMT_VINFO_VECTYPE (stmt_info) = 3349 get_vectype_for_scalar_type (scalar_type); 3350 if (!STMT_VINFO_VECTYPE (stmt_info)) 3351 { 3352 if (dump_enabled_p ()) 3353 { 3354 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 3355 "not vectorized: no vectype for stmt: "); 3356 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0); 3357 dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: "); 3358 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS, 3359 scalar_type); 3360 } 3361 3362 if (bb_vinfo) 3363 { 3364 /* Mark the statement as not vectorizable. */ 3365 STMT_VINFO_VECTORIZABLE (stmt_info) = false; 3366 stop_bb_analysis = true; 3367 continue; 3368 } 3369 3370 if (gather) 3371 { 3372 STMT_VINFO_DATA_REF (stmt_info) = NULL; 3373 free_data_ref (dr); 3374 } 3375 return false; 3376 } 3377 3378 /* Adjust the minimal vectorization factor according to the 3379 vector type. */ 3380 vf = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)); 3381 if (vf > *min_vf) 3382 *min_vf = vf; 3383 3384 if (gather) 3385 { 3386 unsigned int j, k, n; 3387 struct data_reference *olddr 3388 = datarefs[i]; 3389 vec<ddr_p> ddrs = LOOP_VINFO_DDRS (loop_vinfo); 3390 struct data_dependence_relation *ddr, *newddr; 3391 bool bad = false; 3392 tree off; 3393 vec<loop_p> nest = LOOP_VINFO_LOOP_NEST (loop_vinfo); 3394 3395 gather = 0 != vect_check_gather (stmt, loop_vinfo, NULL, &off, NULL); 3396 if (gather 3397 && get_vectype_for_scalar_type (TREE_TYPE (off)) == NULL_TREE) 3398 gather = false; 3399 if (!gather) 3400 { 3401 STMT_VINFO_DATA_REF (stmt_info) = NULL; 3402 free_data_ref (dr); 3403 if (dump_enabled_p ()) 3404 { 3405 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 3406 "not vectorized: not suitable for gather " 3407 "load "); 3408 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0); 3409 } 3410 return false; 3411 } 3412 3413 n = datarefs.length () - 1; 3414 for (j = 0, k = i - 1; j < i; j++) 3415 { 3416 ddr = ddrs[k]; 3417 gcc_assert (DDR_B (ddr) == olddr); 3418 newddr = initialize_data_dependence_relation (DDR_A (ddr), dr, 3419 nest); 3420 ddrs[k] = newddr; 3421 free_dependence_relation (ddr); 3422 if (!bad 3423 && DR_IS_WRITE (DDR_A (newddr)) 3424 && DDR_ARE_DEPENDENT (newddr) != chrec_known) 3425 bad = true; 3426 k += --n; 3427 } 3428 3429 k++; 3430 n = k + datarefs.length () - i - 1; 3431 for (; k < n; k++) 3432 { 3433 ddr = ddrs[k]; 3434 gcc_assert (DDR_A (ddr) == olddr); 3435 newddr = initialize_data_dependence_relation (dr, DDR_B (ddr), 3436 nest); 3437 ddrs[k] = newddr; 3438 free_dependence_relation (ddr); 3439 if (!bad 3440 && DR_IS_WRITE (DDR_B (newddr)) 3441 && DDR_ARE_DEPENDENT (newddr) != chrec_known) 3442 bad = true; 3443 } 3444 3445 k = ddrs.length () 3446 - datarefs.length () + i; 3447 ddr = ddrs[k]; 3448 gcc_assert (DDR_A (ddr) == olddr && DDR_B (ddr) == olddr); 3449 newddr = initialize_data_dependence_relation (dr, dr, nest); 3450 ddrs[k] = newddr; 3451 free_dependence_relation (ddr); 3452 datarefs[i] = dr; 3453 3454 if (bad) 3455 { 3456 if (dump_enabled_p ()) 3457 { 3458 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 3459 "not vectorized: data dependence conflict" 3460 " prevents gather load"); 3461 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, 
TDF_SLIM, stmt, 0); 3462 } 3463 return false; 3464 } 3465 3466 STMT_VINFO_GATHER_P (stmt_info) = true; 3467 } 3468 else if (loop_vinfo 3469 && TREE_CODE (DR_STEP (dr)) != INTEGER_CST) 3470 { 3471 bool strided_load = false; 3472 if (!nested_in_vect_loop_p (loop, stmt)) 3473 strided_load = vect_check_strided_load (stmt, loop_vinfo); 3474 if (!strided_load) 3475 { 3476 if (dump_enabled_p ()) 3477 { 3478 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 3479 "not vectorized: not suitable for strided " 3480 "load "); 3481 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0); 3482 } 3483 return false; 3484 } 3485 STMT_VINFO_STRIDE_LOAD_P (stmt_info) = true; 3486 } 3487 } 3488 3489 return true; 3490 } 3491 3492 3493 /* Function vect_get_new_vect_var. 3494 3495 Returns a name for a new variable. The current naming scheme appends the 3496 prefix "vect_" or "vect_p" (depending on the value of VAR_KIND) to 3497 the name of vectorizer generated variables, and appends that to NAME if 3498 provided. */ 3499 3500 tree 3501 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name) 3502 { 3503 const char *prefix; 3504 tree new_vect_var; 3505 3506 switch (var_kind) 3507 { 3508 case vect_simple_var: 3509 prefix = "vect_"; 3510 break; 3511 case vect_scalar_var: 3512 prefix = "stmp_"; 3513 break; 3514 case vect_pointer_var: 3515 prefix = "vect_p"; 3516 break; 3517 default: 3518 gcc_unreachable (); 3519 } 3520 3521 if (name) 3522 { 3523 char* tmp = concat (prefix, name, NULL); 3524 new_vect_var = create_tmp_reg (type, tmp); 3525 free (tmp); 3526 } 3527 else 3528 new_vect_var = create_tmp_reg (type, prefix); 3529 3530 return new_vect_var; 3531 } 3532 3533 3534 /* Function vect_create_addr_base_for_vector_ref. 3535 3536 Create an expression that computes the address of the first memory location 3537 that will be accessed for a data reference. 3538 3539 Input: 3540 STMT: The statement containing the data reference. 3541 NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list. 3542 OFFSET: Optional. If supplied, it is be added to the initial address. 3543 LOOP: Specify relative to which loop-nest should the address be computed. 3544 For example, when the dataref is in an inner-loop nested in an 3545 outer-loop that is now being vectorized, LOOP can be either the 3546 outer-loop, or the inner-loop. The first memory location accessed 3547 by the following dataref ('in' points to short): 3548 3549 for (i=0; i<N; i++) 3550 for (j=0; j<M; j++) 3551 s += in[i+j] 3552 3553 is as follows: 3554 if LOOP=i_loop: &in (relative to i_loop) 3555 if LOOP=j_loop: &in+i*2B (relative to j_loop) 3556 BYTE_OFFSET: Optional, defaulted to NULL. If supplied, it is added to the 3557 initial address. Unlike OFFSET, which is number of elements to 3558 be added, BYTE_OFFSET is measured in bytes. 3559 3560 Output: 3561 1. Return an SSA_NAME whose value is the address of the memory location of 3562 the first vector of the data reference. 3563 2. If new_stmt_list is not NULL_TREE after return then the caller must insert 3564 these statement(s) which define the returned SSA_NAME. 3565 3566 FORNOW: We are only handling array accesses with step 1. 
*/ 3567 3568 tree 3569 vect_create_addr_base_for_vector_ref (gimple stmt, 3570 gimple_seq *new_stmt_list, 3571 tree offset, 3572 struct loop *loop, 3573 tree byte_offset) 3574 { 3575 stmt_vec_info stmt_info = vinfo_for_stmt (stmt); 3576 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info); 3577 tree data_ref_base = unshare_expr (DR_BASE_ADDRESS (dr)); 3578 const char *base_name; 3579 tree data_ref_base_var; 3580 tree vec_stmt; 3581 tree addr_base, addr_expr; 3582 tree dest; 3583 gimple_seq seq = NULL; 3584 tree base_offset = unshare_expr (DR_OFFSET (dr)); 3585 tree init = unshare_expr (DR_INIT (dr)); 3586 tree vect_ptr_type; 3587 tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr))); 3588 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); 3589 tree base; 3590 3591 if (loop_vinfo && loop && loop != (gimple_bb (stmt))->loop_father) 3592 { 3593 struct loop *outer_loop = LOOP_VINFO_LOOP (loop_vinfo); 3594 3595 gcc_assert (nested_in_vect_loop_p (outer_loop, stmt)); 3596 3597 data_ref_base = unshare_expr (STMT_VINFO_DR_BASE_ADDRESS (stmt_info)); 3598 base_offset = unshare_expr (STMT_VINFO_DR_OFFSET (stmt_info)); 3599 init = unshare_expr (STMT_VINFO_DR_INIT (stmt_info)); 3600 } 3601 3602 if (loop_vinfo) 3603 base_name = get_name (data_ref_base); 3604 else 3605 { 3606 base_offset = ssize_int (0); 3607 init = ssize_int (0); 3608 base_name = get_name (DR_REF (dr)); 3609 } 3610 3611 data_ref_base_var = create_tmp_var (TREE_TYPE (data_ref_base), "batmp"); 3612 data_ref_base = force_gimple_operand (data_ref_base, &seq, true, 3613 data_ref_base_var); 3614 gimple_seq_add_seq (new_stmt_list, seq); 3615 3616 /* Create base_offset */ 3617 base_offset = size_binop (PLUS_EXPR, 3618 fold_convert (sizetype, base_offset), 3619 fold_convert (sizetype, init)); 3620 dest = create_tmp_var (sizetype, "base_off"); 3621 base_offset = force_gimple_operand (base_offset, &seq, true, dest); 3622 gimple_seq_add_seq (new_stmt_list, seq); 3623 3624 if (offset) 3625 { 3626 tree tmp = create_tmp_var (sizetype, "offset"); 3627 3628 offset = fold_build2 (MULT_EXPR, sizetype, 3629 fold_convert (sizetype, offset), step); 3630 base_offset = fold_build2 (PLUS_EXPR, sizetype, 3631 base_offset, offset); 3632 base_offset = force_gimple_operand (base_offset, &seq, false, tmp); 3633 gimple_seq_add_seq (new_stmt_list, seq); 3634 } 3635 if (byte_offset) 3636 { 3637 tree tmp = create_tmp_var (sizetype, "offset"); 3638 3639 byte_offset = fold_convert (sizetype, byte_offset); 3640 base_offset = fold_build2 (PLUS_EXPR, sizetype, 3641 base_offset, byte_offset); 3642 base_offset = force_gimple_operand (base_offset, &seq, false, tmp); 3643 gimple_seq_add_seq (new_stmt_list, seq); 3644 } 3645 3646 /* base + base_offset */ 3647 if (loop_vinfo) 3648 addr_base = fold_build_pointer_plus (data_ref_base, base_offset); 3649 else 3650 { 3651 addr_base = build1 (ADDR_EXPR, 3652 build_pointer_type (TREE_TYPE (DR_REF (dr))), 3653 unshare_expr (DR_REF (dr))); 3654 } 3655 3656 vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info)); 3657 base = get_base_address (DR_REF (dr)); 3658 if (base 3659 && TREE_CODE (base) == MEM_REF) 3660 vect_ptr_type 3661 = build_qualified_type (vect_ptr_type, 3662 TYPE_QUALS (TREE_TYPE (TREE_OPERAND (base, 0)))); 3663 3664 vec_stmt = fold_convert (vect_ptr_type, addr_base); 3665 addr_expr = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, 3666 base_name); 3667 vec_stmt = force_gimple_operand (vec_stmt, &seq, false, addr_expr); 3668 gimple_seq_add_seq (new_stmt_list, seq); 3669 3670 if (DR_PTR_INFO 
(dr)
3671 && TREE_CODE (vec_stmt) == SSA_NAME)
3672 {
3673 duplicate_ssa_name_ptr_info (vec_stmt, DR_PTR_INFO (dr));
3674 if (offset)
3675 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (vec_stmt));
3676 }
3677
3678 if (dump_enabled_p ())
3679 {
3680 dump_printf_loc (MSG_NOTE, vect_location, "created ");
3681 dump_generic_expr (MSG_NOTE, TDF_SLIM, vec_stmt);
3682 }
3683
3684 return vec_stmt;
3685 }
3686
3687
3688 /* Function vect_create_data_ref_ptr.
3689
3690 Create a new pointer-to-AGGR_TYPE variable (ap) that points to the first
3691 location accessed in the loop by STMT, along with the def-use update
3692 chain to appropriately advance the pointer through the loop iterations.
3693 Also set aliasing information for the pointer. This pointer is used by
3694 the callers to this function to create a memory reference expression for
3695 vector load/store access.
3696
3697 Input:
3698 1. STMT: a stmt that references memory. Expected to be of the form
3699 GIMPLE_ASSIGN <name, data-ref> or
3700 GIMPLE_ASSIGN <data-ref, name>.
3701 2. AGGR_TYPE: the type of the reference, which should be either a vector
3702 or an array.
3703 3. AT_LOOP: the loop where the vector memref is to be created.
3704 4. OFFSET (optional): an offset to be added to the initial address accessed
3705 by the data-ref in STMT.
3706 5. BSI: location where the new stmts are to be placed if there is no loop.
3707 6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
3708 pointing to the initial address.
3709 7. BYTE_OFFSET (optional, defaults to NULL): a byte offset to be added
3710 to the initial address accessed by the data-ref in STMT. This is
3711 similar to OFFSET, but OFFSET is counted in elements, while BYTE_OFFSET
3712 in bytes.
3713
3714 Output:
3715 1. Declare a new ptr to vector_type, and have it point to the base of the
3716 data reference (initial address accessed by the data reference).
3717 For example, for vector of type V8HI, the following code is generated:
3718
3719 v8hi *ap;
3720 ap = (v8hi *)initial_address;
3721
3722 if OFFSET is not supplied:
3723 initial_address = &a[init];
3724 if OFFSET is supplied:
3725 initial_address = &a[init + OFFSET];
3726 if BYTE_OFFSET is supplied:
3727 initial_address = &a[init] + BYTE_OFFSET;
3728
3729 Return the initial_address in INITIAL_ADDRESS.
3730
3731 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also
3732 update the pointer in each iteration of the loop.
3733
3734 Return the increment stmt that updates the pointer in PTR_INCR.
3735
3736 3. Set INV_P to true if the access pattern of the data reference in the
3737 vectorized loop is invariant. Set it to false otherwise.
3738
3739 4. Return the pointer.
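   As a concrete, purely illustrative example: for a data-ref 'a[i]' with
   'short *a' and AGGR_TYPE == V8HI, the pointer starts at (v8hi *) &a[init]
   and is bumped by 16 bytes (== sizeof (v8hi)) per vector iteration;
   OFFSET == 2 would move the starting address to &a[init + 2] (OFFSET is
   in elements, so 4 bytes here), whereas BYTE_OFFSET == 2 would move it by
   exactly 2 bytes.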
*/ 3740 3741 tree 3742 vect_create_data_ref_ptr (gimple stmt, tree aggr_type, struct loop *at_loop, 3743 tree offset, tree *initial_address, 3744 gimple_stmt_iterator *gsi, gimple *ptr_incr, 3745 bool only_init, bool *inv_p, tree byte_offset) 3746 { 3747 const char *base_name; 3748 stmt_vec_info stmt_info = vinfo_for_stmt (stmt); 3749 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); 3750 struct loop *loop = NULL; 3751 bool nested_in_vect_loop = false; 3752 struct loop *containing_loop = NULL; 3753 tree aggr_ptr_type; 3754 tree aggr_ptr; 3755 tree new_temp; 3756 gimple vec_stmt; 3757 gimple_seq new_stmt_list = NULL; 3758 edge pe = NULL; 3759 basic_block new_bb; 3760 tree aggr_ptr_init; 3761 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info); 3762 tree aptr; 3763 gimple_stmt_iterator incr_gsi; 3764 bool insert_after; 3765 bool negative; 3766 tree indx_before_incr, indx_after_incr; 3767 gimple incr; 3768 tree step; 3769 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info); 3770 tree base; 3771 3772 gcc_assert (TREE_CODE (aggr_type) == ARRAY_TYPE 3773 || TREE_CODE (aggr_type) == VECTOR_TYPE); 3774 3775 if (loop_vinfo) 3776 { 3777 loop = LOOP_VINFO_LOOP (loop_vinfo); 3778 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt); 3779 containing_loop = (gimple_bb (stmt))->loop_father; 3780 pe = loop_preheader_edge (loop); 3781 } 3782 else 3783 { 3784 gcc_assert (bb_vinfo); 3785 only_init = true; 3786 *ptr_incr = NULL; 3787 } 3788 3789 /* Check the step (evolution) of the load in LOOP, and record 3790 whether it's invariant. */ 3791 if (nested_in_vect_loop) 3792 step = STMT_VINFO_DR_STEP (stmt_info); 3793 else 3794 step = DR_STEP (STMT_VINFO_DATA_REF (stmt_info)); 3795 3796 if (tree_int_cst_compare (step, size_zero_node) == 0) 3797 *inv_p = true; 3798 else 3799 *inv_p = false; 3800 negative = tree_int_cst_compare (step, size_zero_node) < 0; 3801 3802 /* Create an expression for the first address accessed by this load 3803 in LOOP. */ 3804 base_name = get_name (DR_BASE_ADDRESS (dr)); 3805 3806 if (dump_enabled_p ()) 3807 { 3808 tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr)); 3809 dump_printf_loc (MSG_NOTE, vect_location, 3810 "create %s-pointer variable to type: ", 3811 tree_code_name[(int) TREE_CODE (aggr_type)]); 3812 dump_generic_expr (MSG_NOTE, TDF_SLIM, aggr_type); 3813 if (TREE_CODE (dr_base_type) == ARRAY_TYPE) 3814 dump_printf (MSG_NOTE, " vectorizing an array ref: "); 3815 else if (TREE_CODE (dr_base_type) == RECORD_TYPE) 3816 dump_printf (MSG_NOTE, " vectorizing a record based array ref: "); 3817 else 3818 dump_printf (MSG_NOTE, " vectorizing a pointer ref: "); 3819 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_BASE_OBJECT (dr)); 3820 } 3821 3822 /* (1) Create the new aggregate-pointer variable. */ 3823 aggr_ptr_type = build_pointer_type (aggr_type); 3824 base = get_base_address (DR_REF (dr)); 3825 if (base 3826 && TREE_CODE (base) == MEM_REF) 3827 aggr_ptr_type 3828 = build_qualified_type (aggr_ptr_type, 3829 TYPE_QUALS (TREE_TYPE (TREE_OPERAND (base, 0)))); 3830 aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name); 3831 3832 /* Vector and array types inherit the alias set of their component 3833 type by default so we need to use a ref-all pointer if the data 3834 reference does not conflict with the created aggregated data 3835 reference because it is not addressable. 
*/ 3836 if (!alias_sets_conflict_p (get_deref_alias_set (aggr_ptr), 3837 get_alias_set (DR_REF (dr)))) 3838 { 3839 aggr_ptr_type 3840 = build_pointer_type_for_mode (aggr_type, 3841 TYPE_MODE (aggr_ptr_type), true); 3842 aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, 3843 base_name); 3844 } 3845 3846 /* Likewise for any of the data references in the stmt group. */ 3847 else if (STMT_VINFO_GROUP_SIZE (stmt_info) > 1) 3848 { 3849 gimple orig_stmt = STMT_VINFO_GROUP_FIRST_ELEMENT (stmt_info); 3850 do 3851 { 3852 tree lhs = gimple_assign_lhs (orig_stmt); 3853 if (!alias_sets_conflict_p (get_deref_alias_set (aggr_ptr), 3854 get_alias_set (lhs))) 3855 { 3856 aggr_ptr_type 3857 = build_pointer_type_for_mode (aggr_type, 3858 TYPE_MODE (aggr_ptr_type), true); 3859 aggr_ptr 3860 = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, 3861 base_name); 3862 break; 3863 } 3864 3865 orig_stmt = STMT_VINFO_GROUP_NEXT_ELEMENT (vinfo_for_stmt (orig_stmt)); 3866 } 3867 while (orig_stmt); 3868 } 3869 3870 /* Note: If the dataref is in an inner-loop nested in LOOP, and we are 3871 vectorizing LOOP (i.e., outer-loop vectorization), we need to create two 3872 def-use update cycles for the pointer: one relative to the outer-loop 3873 (LOOP), which is what steps (3) and (4) below do. The other is relative 3874 to the inner-loop (which is the inner-most loop containing the dataref), 3875 and this is done be step (5) below. 3876 3877 When vectorizing inner-most loops, the vectorized loop (LOOP) is also the 3878 inner-most loop, and so steps (3),(4) work the same, and step (5) is 3879 redundant. Steps (3),(4) create the following: 3880 3881 vp0 = &base_addr; 3882 LOOP: vp1 = phi(vp0,vp2) 3883 ... 3884 ... 3885 vp2 = vp1 + step 3886 goto LOOP 3887 3888 If there is an inner-loop nested in loop, then step (5) will also be 3889 applied, and an additional update in the inner-loop will be created: 3890 3891 vp0 = &base_addr; 3892 LOOP: vp1 = phi(vp0,vp2) 3893 ... 3894 inner: vp3 = phi(vp1,vp4) 3895 vp4 = vp3 + inner_step 3896 if () goto inner 3897 ... 3898 vp2 = vp1 + step 3899 if () goto LOOP */ 3900 3901 /* (2) Calculate the initial address of the aggregate-pointer, and set 3902 the aggregate-pointer to point to it before the loop. */ 3903 3904 /* Create: (&(base[init_val+offset]+byte_offset) in the loop preheader. */ 3905 3906 new_temp = vect_create_addr_base_for_vector_ref (stmt, &new_stmt_list, 3907 offset, loop, byte_offset); 3908 if (new_stmt_list) 3909 { 3910 if (pe) 3911 { 3912 new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list); 3913 gcc_assert (!new_bb); 3914 } 3915 else 3916 gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT); 3917 } 3918 3919 *initial_address = new_temp; 3920 3921 /* Create: p = (aggr_type *) initial_base */ 3922 if (TREE_CODE (new_temp) != SSA_NAME 3923 || !useless_type_conversion_p (aggr_ptr_type, TREE_TYPE (new_temp))) 3924 { 3925 vec_stmt = gimple_build_assign (aggr_ptr, 3926 fold_convert (aggr_ptr_type, new_temp)); 3927 aggr_ptr_init = make_ssa_name (aggr_ptr, vec_stmt); 3928 /* Copy the points-to information if it exists. 
*/ 3929 if (DR_PTR_INFO (dr)) 3930 duplicate_ssa_name_ptr_info (aggr_ptr_init, DR_PTR_INFO (dr)); 3931 gimple_assign_set_lhs (vec_stmt, aggr_ptr_init); 3932 if (pe) 3933 { 3934 new_bb = gsi_insert_on_edge_immediate (pe, vec_stmt); 3935 gcc_assert (!new_bb); 3936 } 3937 else 3938 gsi_insert_before (gsi, vec_stmt, GSI_SAME_STMT); 3939 } 3940 else 3941 aggr_ptr_init = new_temp; 3942 3943 /* (3) Handle the updating of the aggregate-pointer inside the loop. 3944 This is needed when ONLY_INIT is false, and also when AT_LOOP is the 3945 inner-loop nested in LOOP (during outer-loop vectorization). */ 3946 3947 /* No update in loop is required. */ 3948 if (only_init && (!loop_vinfo || at_loop == loop)) 3949 aptr = aggr_ptr_init; 3950 else 3951 { 3952 /* The step of the aggregate pointer is the type size. */ 3953 tree step = TYPE_SIZE_UNIT (aggr_type); 3954 /* One exception to the above is when the scalar step of the load in 3955 LOOP is zero. In this case the step here is also zero. */ 3956 if (*inv_p) 3957 step = size_zero_node; 3958 else if (negative) 3959 step = fold_build1 (NEGATE_EXPR, TREE_TYPE (step), step); 3960 3961 standard_iv_increment_position (loop, &incr_gsi, &insert_after); 3962 3963 create_iv (aggr_ptr_init, 3964 fold_convert (aggr_ptr_type, step), 3965 aggr_ptr, loop, &incr_gsi, insert_after, 3966 &indx_before_incr, &indx_after_incr); 3967 incr = gsi_stmt (incr_gsi); 3968 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo, NULL)); 3969 3970 /* Copy the points-to information if it exists. */ 3971 if (DR_PTR_INFO (dr)) 3972 { 3973 duplicate_ssa_name_ptr_info (indx_before_incr, DR_PTR_INFO (dr)); 3974 duplicate_ssa_name_ptr_info (indx_after_incr, DR_PTR_INFO (dr)); 3975 } 3976 if (ptr_incr) 3977 *ptr_incr = incr; 3978 3979 aptr = indx_before_incr; 3980 } 3981 3982 if (!nested_in_vect_loop || only_init) 3983 return aptr; 3984 3985 3986 /* (4) Handle the updating of the aggregate-pointer inside the inner-loop 3987 nested in LOOP, if exists. */ 3988 3989 gcc_assert (nested_in_vect_loop); 3990 if (!only_init) 3991 { 3992 standard_iv_increment_position (containing_loop, &incr_gsi, 3993 &insert_after); 3994 create_iv (aptr, fold_convert (aggr_ptr_type, DR_STEP (dr)), aggr_ptr, 3995 containing_loop, &incr_gsi, insert_after, &indx_before_incr, 3996 &indx_after_incr); 3997 incr = gsi_stmt (incr_gsi); 3998 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo, NULL)); 3999 4000 /* Copy the points-to information if it exists. */ 4001 if (DR_PTR_INFO (dr)) 4002 { 4003 duplicate_ssa_name_ptr_info (indx_before_incr, DR_PTR_INFO (dr)); 4004 duplicate_ssa_name_ptr_info (indx_after_incr, DR_PTR_INFO (dr)); 4005 } 4006 if (ptr_incr) 4007 *ptr_incr = incr; 4008 4009 return indx_before_incr; 4010 } 4011 else 4012 gcc_unreachable (); 4013 } 4014 4015 4016 /* Function bump_vector_ptr 4017 4018 Increment a pointer (to a vector type) by vector-size. If requested, 4019 i.e. if PTR-INCR is given, then also connect the new increment stmt 4020 to the existing def-use update-chain of the pointer, by modifying 4021 the PTR_INCR as illustrated below: 4022 4023 The pointer def-use update-chain before this function: 4024 DATAREF_PTR = phi (p_0, p_2) 4025 .... 4026 PTR_INCR: p_2 = DATAREF_PTR + step 4027 4028 The pointer def-use update-chain after this function: 4029 DATAREF_PTR = phi (p_0, p_2) 4030 .... 4031 NEW_DATAREF_PTR = DATAREF_PTR + BUMP 4032 .... 
4033 PTR_INCR: p_2 = NEW_DATAREF_PTR + step 4034 4035 Input: 4036 DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated 4037 in the loop. 4038 PTR_INCR - optional. The stmt that updates the pointer in each iteration of 4039 the loop. The increment amount across iterations is expected 4040 to be vector_size. 4041 BSI - location where the new update stmt is to be placed. 4042 STMT - the original scalar memory-access stmt that is being vectorized. 4043 BUMP - optional. The offset by which to bump the pointer. If not given, 4044 the offset is assumed to be vector_size. 4045 4046 Output: Return NEW_DATAREF_PTR as illustrated above. 4047 4048 */ 4049 4050 tree 4051 bump_vector_ptr (tree dataref_ptr, gimple ptr_incr, gimple_stmt_iterator *gsi, 4052 gimple stmt, tree bump) 4053 { 4054 stmt_vec_info stmt_info = vinfo_for_stmt (stmt); 4055 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info); 4056 tree vectype = STMT_VINFO_VECTYPE (stmt_info); 4057 tree update = TYPE_SIZE_UNIT (vectype); 4058 gimple incr_stmt; 4059 ssa_op_iter iter; 4060 use_operand_p use_p; 4061 tree new_dataref_ptr; 4062 4063 if (bump) 4064 update = bump; 4065 4066 new_dataref_ptr = copy_ssa_name (dataref_ptr, NULL); 4067 incr_stmt = gimple_build_assign_with_ops (POINTER_PLUS_EXPR, new_dataref_ptr, 4068 dataref_ptr, update); 4069 vect_finish_stmt_generation (stmt, incr_stmt, gsi); 4070 4071 /* Copy the points-to information if it exists. */ 4072 if (DR_PTR_INFO (dr)) 4073 { 4074 duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr)); 4075 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr)); 4076 } 4077 4078 if (!ptr_incr) 4079 return new_dataref_ptr; 4080 4081 /* Update the vector-pointer's cross-iteration increment. */ 4082 FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE) 4083 { 4084 tree use = USE_FROM_PTR (use_p); 4085 4086 if (use == dataref_ptr) 4087 SET_USE (use_p, new_dataref_ptr); 4088 else 4089 gcc_assert (tree_int_cst_compare (use, update) == 0); 4090 } 4091 4092 return new_dataref_ptr; 4093 } 4094 4095 4096 /* Function vect_create_destination_var. 4097 4098 Create a new temporary of type VECTYPE. */ 4099 4100 tree 4101 vect_create_destination_var (tree scalar_dest, tree vectype) 4102 { 4103 tree vec_dest; 4104 const char *new_name; 4105 tree type; 4106 enum vect_var_kind kind; 4107 4108 kind = vectype ? vect_simple_var : vect_scalar_var; 4109 type = vectype ? vectype : TREE_TYPE (scalar_dest); 4110 4111 gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME); 4112 4113 new_name = get_name (scalar_dest); 4114 if (!new_name) 4115 new_name = "var_"; 4116 vec_dest = vect_get_new_vect_var (type, kind, new_name); 4117 4118 return vec_dest; 4119 } 4120 4121 /* Function vect_grouped_store_supported. 4122 4123 Returns TRUE if interleave high and interleave low permutations 4124 are supported, and FALSE otherwise. */ 4125 4126 bool 4127 vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count) 4128 { 4129 enum machine_mode mode = TYPE_MODE (vectype); 4130 4131 /* vect_permute_store_chain requires the group size to be a power of two. */ 4132 if (exact_log2 (count) == -1) 4133 { 4134 if (dump_enabled_p ()) 4135 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 4136 "the size of the group of accesses" 4137 " is not a power of 2"); 4138 return false; 4139 } 4140 4141 /* Check that the permutation is supported. 
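   For instance, with nelt == 8 the two selectors tried below are
   {0, 8, 1, 9, 2, 10, 3, 11} (interleave high) and
   {4, 12, 5, 13, 6, 14, 7, 15} (interleave low).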
*/
4142 if (VECTOR_MODE_P (mode))
4143 {
4144 unsigned int i, nelt = GET_MODE_NUNITS (mode);
4145 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
4146 for (i = 0; i < nelt / 2; i++)
4147 {
4148 sel[i * 2] = i;
4149 sel[i * 2 + 1] = i + nelt;
4150 }
4151 if (can_vec_perm_p (mode, false, sel))
4152 {
4153 for (i = 0; i < nelt; i++)
4154 sel[i] += nelt / 2;
4155 if (can_vec_perm_p (mode, false, sel))
4156 return true;
4157 }
4158 }
4159
4160 if (dump_enabled_p ())
4161 dump_printf (MSG_MISSED_OPTIMIZATION,
4162 "interleave op not supported by target.");
4163 return false;
4164 }
4165
4166
4167 /* Return TRUE if vec_store_lanes is available for COUNT vectors of
4168 type VECTYPE. */
4169
4170 bool
4171 vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
4172 {
4173 return vect_lanes_optab_supported_p ("vec_store_lanes",
4174 vec_store_lanes_optab,
4175 vectype, count);
4176 }
4177
4178
4179 /* Function vect_permute_store_chain.
4180
4181 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
4182 a power of 2, generate interleave_high/low stmts to reorder the data
4183 correctly for the stores. Return the final references for stores in
4184 RESULT_CHAIN.
4185
4186 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
4187 The input is 4 vectors each containing 8 elements. We assign a number to
4188 each element, the input sequence is:
4189
4190 1st vec: 0 1 2 3 4 5 6 7
4191 2nd vec: 8 9 10 11 12 13 14 15
4192 3rd vec: 16 17 18 19 20 21 22 23
4193 4th vec: 24 25 26 27 28 29 30 31
4194
4195 The output sequence should be:
4196
4197 1st vec: 0 8 16 24 1 9 17 25
4198 2nd vec: 2 10 18 26 3 11 19 27
4199 3rd vec: 4 12 20 28 5 13 21 29
4200 4th vec: 6 14 22 30 7 15 23 31
4201
4202 i.e., we interleave the contents of the four vectors in their order.
4203
4204 We use interleave_high/low instructions to create such output. The input of
4205 each interleave_high/low operation is two vectors:
4206 1st vec 2nd vec
4207 0 1 2 3 4 5 6 7
4208 the even elements of the result vector are obtained left-to-right from the
4209 high/low elements of the first vector. The odd elements of the result are
4210 obtained left-to-right from the high/low elements of the second vector.
4211 The output of interleave_high will be: 0 4 1 5
4212 and of interleave_low: 2 6 3 7
4213
4214
4215 The permutation is done in log LENGTH stages. In each stage interleave_high
4216 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
4217 where the first argument is taken from the first half of DR_CHAIN and the
4218 second argument from its second half.
4219 In our example,
4220
4221 I1: interleave_high (1st vec, 3rd vec)
4222 I2: interleave_low (1st vec, 3rd vec)
4223 I3: interleave_high (2nd vec, 4th vec)
4224 I4: interleave_low (2nd vec, 4th vec)
4225
4226 The output for the first stage is:
4227
4228 I1: 0 16 1 17 2 18 3 19
4229 I2: 4 20 5 21 6 22 7 23
4230 I3: 8 24 9 25 10 26 11 27
4231 I4: 12 28 13 29 14 30 15 31
4232
4233 The output of the second stage, i.e., the final result, is:
4234
4235 I1: 0 8 16 24 1 9 17 25
4236 I2: 2 10 18 26 3 11 19 27
4237 I3: 4 12 20 28 5 13 21 29
4238 I4: 6 14 22 30 7 15 23 31.
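   Taken as a whole, the permutation simply reads the LENGTH x NELT input
   matrix column by column. A scalar model of the end-to-end effect, for
   illustration only ('in' and 'out' stand for the concatenated input and
   output chains):

   for (i = 0; i < length * nelt; i++)
     out[i] = in[(i % length) * nelt + i / length];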
*/ 4239 4240 void 4241 vect_permute_store_chain (vec<tree> dr_chain, 4242 unsigned int length, 4243 gimple stmt, 4244 gimple_stmt_iterator *gsi, 4245 vec<tree> *result_chain) 4246 { 4247 tree vect1, vect2, high, low; 4248 gimple perm_stmt; 4249 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt)); 4250 tree perm_mask_low, perm_mask_high; 4251 unsigned int i, n; 4252 unsigned int j, nelt = TYPE_VECTOR_SUBPARTS (vectype); 4253 unsigned char *sel = XALLOCAVEC (unsigned char, nelt); 4254 4255 result_chain->quick_grow (length); 4256 memcpy (result_chain->address (), dr_chain.address (), 4257 length * sizeof (tree)); 4258 4259 for (i = 0, n = nelt / 2; i < n; i++) 4260 { 4261 sel[i * 2] = i; 4262 sel[i * 2 + 1] = i + nelt; 4263 } 4264 perm_mask_high = vect_gen_perm_mask (vectype, sel); 4265 gcc_assert (perm_mask_high != NULL); 4266 4267 for (i = 0; i < nelt; i++) 4268 sel[i] += nelt / 2; 4269 perm_mask_low = vect_gen_perm_mask (vectype, sel); 4270 gcc_assert (perm_mask_low != NULL); 4271 4272 for (i = 0, n = exact_log2 (length); i < n; i++) 4273 { 4274 for (j = 0; j < length/2; j++) 4275 { 4276 vect1 = dr_chain[j]; 4277 vect2 = dr_chain[j+length/2]; 4278 4279 /* Create interleaving stmt: 4280 high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1, ...}> */ 4281 high = make_temp_ssa_name (vectype, NULL, "vect_inter_high"); 4282 perm_stmt 4283 = gimple_build_assign_with_ops (VEC_PERM_EXPR, high, 4284 vect1, vect2, perm_mask_high); 4285 vect_finish_stmt_generation (stmt, perm_stmt, gsi); 4286 (*result_chain)[2*j] = high; 4287 4288 /* Create interleaving stmt: 4289 low = VEC_PERM_EXPR <vect1, vect2, {nelt/2, nelt*3/2, nelt/2+1, 4290 nelt*3/2+1, ...}> */ 4291 low = make_temp_ssa_name (vectype, NULL, "vect_inter_low"); 4292 perm_stmt 4293 = gimple_build_assign_with_ops (VEC_PERM_EXPR, low, 4294 vect1, vect2, perm_mask_low); 4295 vect_finish_stmt_generation (stmt, perm_stmt, gsi); 4296 (*result_chain)[2*j+1] = low; 4297 } 4298 memcpy (dr_chain.address (), result_chain->address (), 4299 length * sizeof (tree)); 4300 } 4301 } 4302 4303 /* Function vect_setup_realignment 4304 4305 This function is called when vectorizing an unaligned load using 4306 the dr_explicit_realign[_optimized] scheme. 4307 This function generates the following code at the loop prolog: 4308 4309 p = initial_addr; 4310 x msq_init = *(floor(p)); # prolog load 4311 realignment_token = call target_builtin; 4312 loop: 4313 x msq = phi (msq_init, ---) 4314 4315 The stmts marked with x are generated only for the case of 4316 dr_explicit_realign_optimized. 4317 4318 The code above sets up a new (vector) pointer, pointing to the first 4319 location accessed by STMT, and a "floor-aligned" load using that pointer. 4320 It also generates code to compute the "realignment-token" (if the relevant 4321 target hook was defined), and creates a phi-node at the loop-header bb 4322 whose arguments are the result of the prolog-load (created by this 4323 function) and the result of a load that takes place in the loop (to be 4324 created by the caller to this function). 
4325 4326 For the case of dr_explicit_realign_optimized: 4327 The caller to this function uses the phi-result (msq) to create the 4328 realignment code inside the loop, and sets up the missing phi argument, 4329 as follows: 4330 loop: 4331 msq = phi (msq_init, lsq) 4332 lsq = *(floor(p')); # load in loop 4333 result = realign_load (msq, lsq, realignment_token); 4334 4335 For the case of dr_explicit_realign: 4336 loop: 4337 msq = *(floor(p)); # load in loop 4338 p' = p + (VS-1); 4339 lsq = *(floor(p')); # load in loop 4340 result = realign_load (msq, lsq, realignment_token); 4341 4342 Input: 4343 STMT - (scalar) load stmt to be vectorized. This load accesses 4344 a memory location that may be unaligned. 4345 BSI - place where new code is to be inserted. 4346 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes 4347 is used. 4348 4349 Output: 4350 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load 4351 target hook, if defined. 4352 Return value - the result of the loop-header phi node. */ 4353 4354 tree 4355 vect_setup_realignment (gimple stmt, gimple_stmt_iterator *gsi, 4356 tree *realignment_token, 4357 enum dr_alignment_support alignment_support_scheme, 4358 tree init_addr, 4359 struct loop **at_loop) 4360 { 4361 stmt_vec_info stmt_info = vinfo_for_stmt (stmt); 4362 tree vectype = STMT_VINFO_VECTYPE (stmt_info); 4363 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); 4364 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info); 4365 struct loop *loop = NULL; 4366 edge pe = NULL; 4367 tree scalar_dest = gimple_assign_lhs (stmt); 4368 tree vec_dest; 4369 gimple inc; 4370 tree ptr; 4371 tree data_ref; 4372 gimple new_stmt; 4373 basic_block new_bb; 4374 tree msq_init = NULL_TREE; 4375 tree new_temp; 4376 gimple phi_stmt; 4377 tree msq = NULL_TREE; 4378 gimple_seq stmts = NULL; 4379 bool inv_p; 4380 bool compute_in_loop = false; 4381 bool nested_in_vect_loop = false; 4382 struct loop *containing_loop = (gimple_bb (stmt))->loop_father; 4383 struct loop *loop_for_initial_load = NULL; 4384 4385 if (loop_vinfo) 4386 { 4387 loop = LOOP_VINFO_LOOP (loop_vinfo); 4388 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt); 4389 } 4390 4391 gcc_assert (alignment_support_scheme == dr_explicit_realign 4392 || alignment_support_scheme == dr_explicit_realign_optimized); 4393 4394 /* We need to generate three things: 4395 1. the misalignment computation 4396 2. the extra vector load (for the optimized realignment scheme). 4397 3. the phi node for the two vectors from which the realignment is 4398 done (for the optimized realignment scheme). */ 4399 4400 /* 1. Determine where to generate the misalignment computation. 4401 4402 If INIT_ADDR is NULL_TREE, this indicates that the misalignment 4403 calculation will be generated by this function, outside the loop (in the 4404 preheader). Otherwise, INIT_ADDR had already been computed for us by the 4405 caller, inside the loop. 4406 4407 Background: If the misalignment remains fixed throughout the iterations of 4408 the loop, then both realignment schemes are applicable, and also the 4409 misalignment computation can be done outside LOOP. This is because we are 4410 vectorizing LOOP, and so the memory accesses in LOOP advance in steps that 4411 are a multiple of VS (the Vector Size), and therefore the misalignment in 4412 different vectorized LOOP iterations is always the same. 
4413 The problem arises only if the memory access is in an inner-loop nested 4414 inside LOOP, which is now being vectorized using outer-loop vectorization. 4415 This is the only case when the misalignment of the memory access may not 4416 remain fixed throughout the iterations of the inner-loop (as explained in 4417 detail in vect_supportable_dr_alignment). In this case, not only is the 4418 optimized realignment scheme not applicable, but also the misalignment 4419 computation (and generation of the realignment token that is passed to 4420 REALIGN_LOAD) have to be done inside the loop. 4421 4422 In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode 4423 or not, which in turn determines if the misalignment is computed inside 4424 the inner-loop, or outside LOOP. */ 4425 4426 if (init_addr != NULL_TREE || !loop_vinfo) 4427 { 4428 compute_in_loop = true; 4429 gcc_assert (alignment_support_scheme == dr_explicit_realign); 4430 } 4431 4432 4433 /* 2. Determine where to generate the extra vector load. 4434 4435 For the optimized realignment scheme, instead of generating two vector 4436 loads in each iteration, we generate a single extra vector load in the 4437 preheader of the loop, and in each iteration reuse the result of the 4438 vector load from the previous iteration. In case the memory access is in 4439 an inner-loop nested inside LOOP, which is now being vectorized using 4440 outer-loop vectorization, we need to determine whether this initial vector 4441 load should be generated at the preheader of the inner-loop, or can be 4442 generated at the preheader of LOOP. If the memory access has no evolution 4443 in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has 4444 to be generated inside LOOP (in the preheader of the inner-loop). */ 4445 4446 if (nested_in_vect_loop) 4447 { 4448 tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info); 4449 bool invariant_in_outerloop = 4450 (tree_int_cst_compare (outerloop_step, size_zero_node) == 0); 4451 loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner); 4452 } 4453 else 4454 loop_for_initial_load = loop; 4455 if (at_loop) 4456 *at_loop = loop_for_initial_load; 4457 4458 if (loop_for_initial_load) 4459 pe = loop_preheader_edge (loop_for_initial_load); 4460 4461 /* 3. For the case of the optimized realignment, create the first vector 4462 load at the loop preheader. 
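   (The prolog load reads from the "floored" address: the pointer is masked
   down to the vector alignment, e.g. for a 16-byte-aligned vector type
   roughly msq_init = *(vectype *) ((uintptr_t) addr & -16); the constant
   16 here is only an illustration of TYPE_ALIGN_UNIT (vectype).)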
*/ 4463 4464 if (alignment_support_scheme == dr_explicit_realign_optimized) 4465 { 4466 /* Create msq_init = *(floor(p1)) in the loop preheader */ 4467 4468 gcc_assert (!compute_in_loop); 4469 vec_dest = vect_create_destination_var (scalar_dest, vectype); 4470 ptr = vect_create_data_ref_ptr (stmt, vectype, loop_for_initial_load, 4471 NULL_TREE, &init_addr, NULL, &inc, 4472 true, &inv_p); 4473 new_temp = copy_ssa_name (ptr, NULL); 4474 new_stmt = gimple_build_assign_with_ops 4475 (BIT_AND_EXPR, new_temp, ptr, 4476 build_int_cst (TREE_TYPE (ptr), 4477 -(HOST_WIDE_INT)TYPE_ALIGN_UNIT (vectype))); 4478 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt); 4479 gcc_assert (!new_bb); 4480 data_ref 4481 = build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp, 4482 build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0)); 4483 new_stmt = gimple_build_assign (vec_dest, data_ref); 4484 new_temp = make_ssa_name (vec_dest, new_stmt); 4485 gimple_assign_set_lhs (new_stmt, new_temp); 4486 if (pe) 4487 { 4488 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt); 4489 gcc_assert (!new_bb); 4490 } 4491 else 4492 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT); 4493 4494 msq_init = gimple_assign_lhs (new_stmt); 4495 } 4496 4497 /* 4. Create realignment token using a target builtin, if available. 4498 It is done either inside the containing loop, or before LOOP (as 4499 determined above). */ 4500 4501 if (targetm.vectorize.builtin_mask_for_load) 4502 { 4503 tree builtin_decl; 4504 4505 /* Compute INIT_ADDR - the initial addressed accessed by this memref. */ 4506 if (!init_addr) 4507 { 4508 /* Generate the INIT_ADDR computation outside LOOP. */ 4509 init_addr = vect_create_addr_base_for_vector_ref (stmt, &stmts, 4510 NULL_TREE, loop); 4511 if (loop) 4512 { 4513 pe = loop_preheader_edge (loop); 4514 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts); 4515 gcc_assert (!new_bb); 4516 } 4517 else 4518 gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT); 4519 } 4520 4521 builtin_decl = targetm.vectorize.builtin_mask_for_load (); 4522 new_stmt = gimple_build_call (builtin_decl, 1, init_addr); 4523 vec_dest = 4524 vect_create_destination_var (scalar_dest, 4525 gimple_call_return_type (new_stmt)); 4526 new_temp = make_ssa_name (vec_dest, new_stmt); 4527 gimple_call_set_lhs (new_stmt, new_temp); 4528 4529 if (compute_in_loop) 4530 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT); 4531 else 4532 { 4533 /* Generate the misalignment computation outside LOOP. */ 4534 pe = loop_preheader_edge (loop); 4535 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt); 4536 gcc_assert (!new_bb); 4537 } 4538 4539 *realignment_token = gimple_call_lhs (new_stmt); 4540 4541 /* The result of the CALL_EXPR to this builtin is determined from 4542 the value of the parameter and no global variables are touched 4543 which makes the builtin a "const" function. Requiring the 4544 builtin to have the "const" attribute makes it unnecessary 4545 to call mark_call_clobbered. */ 4546 gcc_assert (TREE_READONLY (builtin_decl)); 4547 } 4548 4549 if (alignment_support_scheme == dr_explicit_realign) 4550 return msq; 4551 4552 gcc_assert (!compute_in_loop); 4553 gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized); 4554 4555 4556 /* 5. 
Create msq = phi <msq_init, lsq> in loop */ 4557 4558 pe = loop_preheader_edge (containing_loop); 4559 vec_dest = vect_create_destination_var (scalar_dest, vectype); 4560 msq = make_ssa_name (vec_dest, NULL); 4561 phi_stmt = create_phi_node (msq, containing_loop->header); 4562 add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION); 4563 4564 return msq; 4565 } 4566 4567 4568 /* Function vect_grouped_load_supported. 4569 4570 Returns TRUE if even and odd permutations are supported, 4571 and FALSE otherwise. */ 4572 4573 bool 4574 vect_grouped_load_supported (tree vectype, unsigned HOST_WIDE_INT count) 4575 { 4576 enum machine_mode mode = TYPE_MODE (vectype); 4577 4578 /* vect_permute_load_chain requires the group size to be a power of two. */ 4579 if (exact_log2 (count) == -1) 4580 { 4581 if (dump_enabled_p ()) 4582 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 4583 "the size of the group of accesses" 4584 " is not a power of 2"); 4585 return false; 4586 } 4587 4588 /* Check that the permutation is supported. */ 4589 if (VECTOR_MODE_P (mode)) 4590 { 4591 unsigned int i, nelt = GET_MODE_NUNITS (mode); 4592 unsigned char *sel = XALLOCAVEC (unsigned char, nelt); 4593 4594 for (i = 0; i < nelt; i++) 4595 sel[i] = i * 2; 4596 if (can_vec_perm_p (mode, false, sel)) 4597 { 4598 for (i = 0; i < nelt; i++) 4599 sel[i] = i * 2 + 1; 4600 if (can_vec_perm_p (mode, false, sel)) 4601 return true; 4602 } 4603 } 4604 4605 if (dump_enabled_p ()) 4606 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 4607 "extract even/odd not supported by target"); 4608 return false; 4609 } 4610 4611 /* Return TRUE if vec_load_lanes is available for COUNT vectors of 4612 type VECTYPE. */ 4613 4614 bool 4615 vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count) 4616 { 4617 return vect_lanes_optab_supported_p ("vec_load_lanes", 4618 vec_load_lanes_optab, 4619 vectype, count); 4620 } 4621 4622 /* Function vect_permute_load_chain. 4623 4624 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be 4625 a power of 2, generate extract_even/odd stmts to reorder the input data 4626 correctly. Return the final references for loads in RESULT_CHAIN. 4627 4628 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8. 4629 The input is 4 vectors each containing 8 elements. We assign a number to each 4630 element, the input sequence is: 4631 4632 1st vec: 0 1 2 3 4 5 6 7 4633 2nd vec: 8 9 10 11 12 13 14 15 4634 3rd vec: 16 17 18 19 20 21 22 23 4635 4th vec: 24 25 26 27 28 29 30 31 4636 4637 The output sequence should be: 4638 4639 1st vec: 0 4 8 12 16 20 24 28 4640 2nd vec: 1 5 9 13 17 21 25 29 4641 3rd vec: 2 6 10 14 18 22 26 30 4642 4th vec: 3 7 11 15 19 23 27 31 4643 4644 i.e., the first output vector should contain the first elements of each 4645 interleaving group, etc. 4646 4647 We use extract_even/odd instructions to create such output. The input of 4648 each extract_even/odd operation is two vectors 4649 1st vec 2nd vec 4650 0 1 2 3 4 5 6 7 4651 4652 and the output is the vector of extracted even/odd elements. The output of 4653 extract_even will be: 0 2 4 6 4654 and of extract_odd: 1 3 5 7 4655 4656 4657 The permutation is done in log LENGTH stages. In each stage extract_even 4658 and extract_odd stmts are created for each pair of vectors in DR_CHAIN in 4659 their order. 
In our example, 4660 4661 E1: extract_even (1st vec, 2nd vec) 4662 E2: extract_odd (1st vec, 2nd vec) 4663 E3: extract_even (3rd vec, 4th vec) 4664 E4: extract_odd (3rd vec, 4th vec) 4665 4666 The output for the first stage will be: 4667 4668 E1: 0 2 4 6 8 10 12 14 4669 E2: 1 3 5 7 9 11 13 15 4670 E3: 16 18 20 22 24 26 28 30 4671 E4: 17 19 21 23 25 27 29 31 4672 4673 In order to proceed and create the correct sequence for the next stage (or 4674 for the correct output, if the second stage is the last one, as in our 4675 example), we first put the output of extract_even operation and then the 4676 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN). 4677 The input for the second stage is: 4678 4679 1st vec (E1): 0 2 4 6 8 10 12 14 4680 2nd vec (E3): 16 18 20 22 24 26 28 30 4681 3rd vec (E2): 1 3 5 7 9 11 13 15 4682 4th vec (E4): 17 19 21 23 25 27 29 31 4683 4684 The output of the second stage: 4685 4686 E1: 0 4 8 12 16 20 24 28 4687 E2: 2 6 10 14 18 22 26 30 4688 E3: 1 5 9 13 17 21 25 29 4689 E4: 3 7 11 15 19 23 27 31 4690 4691 And RESULT_CHAIN after reordering: 4692 4693 1st vec (E1): 0 4 8 12 16 20 24 28 4694 2nd vec (E3): 1 5 9 13 17 21 25 29 4695 3rd vec (E2): 2 6 10 14 18 22 26 30 4696 4th vec (E4): 3 7 11 15 19 23 27 31. */ 4697 4698 static void 4699 vect_permute_load_chain (vec<tree> dr_chain, 4700 unsigned int length, 4701 gimple stmt, 4702 gimple_stmt_iterator *gsi, 4703 vec<tree> *result_chain) 4704 { 4705 tree data_ref, first_vect, second_vect; 4706 tree perm_mask_even, perm_mask_odd; 4707 gimple perm_stmt; 4708 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt)); 4709 unsigned int i, j, log_length = exact_log2 (length); 4710 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype); 4711 unsigned char *sel = XALLOCAVEC (unsigned char, nelt); 4712 4713 result_chain->quick_grow (length); 4714 memcpy (result_chain->address (), dr_chain.address (), 4715 length * sizeof (tree)); 4716 4717 for (i = 0; i < nelt; ++i) 4718 sel[i] = i * 2; 4719 perm_mask_even = vect_gen_perm_mask (vectype, sel); 4720 gcc_assert (perm_mask_even != NULL); 4721 4722 for (i = 0; i < nelt; ++i) 4723 sel[i] = i * 2 + 1; 4724 perm_mask_odd = vect_gen_perm_mask (vectype, sel); 4725 gcc_assert (perm_mask_odd != NULL); 4726 4727 for (i = 0; i < log_length; i++) 4728 { 4729 for (j = 0; j < length; j += 2) 4730 { 4731 first_vect = dr_chain[j]; 4732 second_vect = dr_chain[j+1]; 4733 4734 /* data_ref = permute_even (first_data_ref, second_data_ref); */ 4735 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even"); 4736 perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref, 4737 first_vect, second_vect, 4738 perm_mask_even); 4739 vect_finish_stmt_generation (stmt, perm_stmt, gsi); 4740 (*result_chain)[j/2] = data_ref; 4741 4742 /* data_ref = permute_odd (first_data_ref, second_data_ref); */ 4743 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd"); 4744 perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref, 4745 first_vect, second_vect, 4746 perm_mask_odd); 4747 vect_finish_stmt_generation (stmt, perm_stmt, gsi); 4748 (*result_chain)[j/2+length/2] = data_ref; 4749 } 4750 memcpy (dr_chain.address (), result_chain->address (), 4751 length * sizeof (tree)); 4752 } 4753 } 4754 4755 4756 /* Function vect_transform_grouped_load. 4757 4758 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements 4759 to perform their permutation and ascribe the result vectorized statements to 4760 the scalar statements. 
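   The net permutation applied here (via vect_permute_load_chain) is the
   inverse of the store interleaving; as a scalar model, for illustration
   only ('in' and 'out' stand for the concatenated input and output chains):

   for (i = 0; i < size * nelt; i++)
     out[i] = in[(i % nelt) * size + i / nelt];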
4761 */
4762
4763 void
4764 vect_transform_grouped_load (gimple stmt, vec<tree> dr_chain, int size,
4765 gimple_stmt_iterator *gsi)
4766 {
4767 vec<tree> result_chain = vNULL;
4768
4769 /* DR_CHAIN contains input data-refs that are a part of the interleaving.
4770 RESULT_CHAIN is the output of vect_permute_load_chain; it contains permuted
4771 vectors that are ready for vector computation. */
4772 result_chain.create (size);
4773 vect_permute_load_chain (dr_chain, size, stmt, gsi, &result_chain);
4774 vect_record_grouped_load_vectors (stmt, result_chain);
4775 result_chain.release ();
4776 }
4777
4778 /* RESULT_CHAIN contains the output of a group of grouped loads that were
4779 generated as part of the vectorization of STMT. Assign the statement
4780 for each vector to the associated scalar statement. */
4781
4782 void
4783 vect_record_grouped_load_vectors (gimple stmt, vec<tree> result_chain)
4784 {
4785 gimple first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt));
4786 gimple next_stmt, new_stmt;
4787 unsigned int i, gap_count;
4788 tree tmp_data_ref;
4789
4790 /* Put a permuted data-ref in the VECTORIZED_STMT field.
4791 Since we scan the chain starting from its first node, their order
4792 corresponds to the order of data-refs in RESULT_CHAIN. */
4793 next_stmt = first_stmt;
4794 gap_count = 1;
4795 FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
4796 {
4797 if (!next_stmt)
4798 break;
4799
4800 /* Skip the gaps. Loads created for the gaps will be removed by dead
4801 code elimination pass later. No need to check for the first stmt in
4802 the group, since it always exists.
4803 GROUP_GAP is the number of steps in elements from the previous
4804 access (if there is no gap GROUP_GAP is 1). We skip loads that
4805 correspond to the gaps. */
4806 if (next_stmt != first_stmt
4807 && gap_count < GROUP_GAP (vinfo_for_stmt (next_stmt)))
4808 {
4809 gap_count++;
4810 continue;
4811 }
4812
4813 while (next_stmt)
4814 {
4815 new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
4816 /* We assume that if VEC_STMT is not NULL, this is a case of multiple
4817 copies, and we put the new vector statement in the first available
4818 RELATED_STMT. */
4819 if (!STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)))
4820 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)) = new_stmt;
4821 else
4822 {
4823 if (!GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
4824 {
4825 gimple prev_stmt =
4826 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt));
4827 gimple rel_stmt =
4828 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt));
4829 while (rel_stmt)
4830 {
4831 prev_stmt = rel_stmt;
4832 rel_stmt =
4833 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (rel_stmt));
4834 }
4835
4836 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt)) =
4837 new_stmt;
4838 }
4839 }
4840
4841 next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
4842 gap_count = 1;
4843 /* If NEXT_STMT accesses the same DR as the previous statement,
4844 put the same TMP_DATA_REF as its vectorized statement; otherwise
4845 get the next data-ref from RESULT_CHAIN. */
4846 if (!next_stmt || !GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
4847 break;
4848 }
4849 }
4850 }
4851
4852 /* Function vect_can_force_dr_alignment_p.
4853
4854 Returns whether the alignment of a DECL can be forced to be aligned
4855 on an ALIGNMENT-bit boundary.
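   For example, a file-local definition such as 'static short a[256];' can
   usually be promoted to the vector alignment (bounded by
   MAX_OFILE_ALIGNMENT below), whereas an 'extern' or common symbol cannot,
   because another translation unit may provide a definition with weaker
   alignment.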
*/
4856
4857 bool
4858 vect_can_force_dr_alignment_p (const_tree decl, unsigned int alignment)
4859 {
4860 if (TREE_CODE (decl) != VAR_DECL)
4861 return false;
4862
4863 /* We cannot change alignment of common or external symbols as another
4864 translation unit may contain a definition with lower alignment.
4865 The rules of common symbol linking mean that the definition
4866 will override the common symbol. The same is true for constant
4867 pool entries which may be shared and are not properly merged
4868 by LTO. */
4869 if (DECL_EXTERNAL (decl)
4870 || DECL_COMMON (decl)
4871 || DECL_IN_CONSTANT_POOL (decl))
4872 return false;
4873
4874 if (TREE_ASM_WRITTEN (decl))
4875 return false;
4876
4877 /* Do not override the alignment as specified by the ABI when the used
4878 attribute is set. */
4879 if (DECL_PRESERVE_P (decl))
4880 return false;
4881
4882 /* Do not override explicit alignment set by the user when an explicit
4883 section name is also used. This is a common idiom used by many
4884 software projects. */
4885 if (DECL_SECTION_NAME (decl) != NULL_TREE
4886 && !DECL_HAS_IMPLICIT_SECTION_NAME_P (decl))
4887 return false;
4888
4889 if (TREE_STATIC (decl))
4890 return (alignment <= MAX_OFILE_ALIGNMENT);
4891 else
4892 return (alignment <= MAX_STACK_ALIGNMENT);
4893 }
4894
4895
4896 /* Return whether the data reference DR is supported with respect to its
4897 alignment.
4898 If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even
4899 if it is aligned, i.e., check if it is possible to vectorize it with different
4900 alignment. */
4901
4902 enum dr_alignment_support
4903 vect_supportable_dr_alignment (struct data_reference *dr,
4904 bool check_aligned_accesses)
4905 {
4906 gimple stmt = DR_STMT (dr);
4907 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4908 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4909 enum machine_mode mode = TYPE_MODE (vectype);
4910 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4911 struct loop *vect_loop = NULL;
4912 bool nested_in_vect_loop = false;
4913
4914 if (aligned_access_p (dr) && !check_aligned_accesses)
4915 return dr_aligned;
4916
4917 if (loop_vinfo)
4918 {
4919 vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
4920 nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt);
4921 }
4922
4923 /* Possibly unaligned access. */
4924
4925 /* We can choose between using the implicit realignment scheme (generating
4926 a misaligned_move stmt) and the explicit realignment scheme (generating
4927 aligned loads with a REALIGN_LOAD). There are two variants of the
4928 explicit realignment scheme: optimized and unoptimized.
4929 We can optimize the realignment only if the step between consecutive
4930 vector loads is equal to the vector size. Since the vector memory
4931 accesses advance in steps of VS (Vector Size) in the vectorized loop, it
4932 is guaranteed that the misalignment amount remains the same throughout the
4933 execution of the vectorized loop. Therefore, we can create the
4934 "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
4935 at the loop preheader.
4936
4937 However, in the case of outer-loop vectorization, when vectorizing a
4938 memory access in the inner-loop nested within the LOOP that is now being
4939 vectorized, while it is guaranteed that the misalignment of the
4940 vectorized memory access will remain the same in different outer-loop
4941 iterations, it is *not* guaranteed that it will remain the same throughout
4942 the execution of the inner-loop.
This is because the inner-loop advances 4943 with the original scalar step (and not in steps of VS). If the inner-loop 4944 step happens to be a multiple of VS, then the misalignment remains fixed 4945 and we can use the optimized realignment scheme. For example: 4946 4947 for (i=0; i<N; i++) 4948 for (j=0; j<M; j++) 4949 s += a[i+j]; 4950 4951 When vectorizing the i-loop in the above example, the step between 4952 consecutive vector loads is 1, and so the misalignment does not remain 4953 fixed across the execution of the inner-loop, and the realignment cannot 4954 be optimized (as illustrated in the following pseudo vectorized loop): 4955 4956 for (i=0; i<N; i+=4) 4957 for (j=0; j<M; j++){ 4958 vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...} 4959 // when j is {0,1,2,3,4,5,6,7,...} respectively. 4960 // (assuming that we start from an aligned address). 4961 } 4962 4963 We therefore have to use the unoptimized realignment scheme: 4964 4965 for (i=0; i<N; i+=4) 4966 for (j=k; j<M; j+=4) 4967 vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming 4968 // that the misalignment of the initial address is 4969 // 0). 4970 4971 The loop can then be vectorized as follows: 4972 4973 for (k=0; k<4; k++){ 4974 rt = get_realignment_token (&vp[k]); 4975 for (i=0; i<N; i+=4){ 4976 v1 = vp[i+k]; 4977 for (j=k; j<M; j+=4){ 4978 v2 = vp[i+j+VS-1]; 4979 va = REALIGN_LOAD <v1,v2,rt>; 4980 vs += va; 4981 v1 = v2; 4982 } 4983 } 4984 } */ 4985 4986 if (DR_IS_READ (dr)) 4987 { 4988 bool is_packed = false; 4989 tree type = (TREE_TYPE (DR_REF (dr))); 4990 4991 if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing 4992 && (!targetm.vectorize.builtin_mask_for_load 4993 || targetm.vectorize.builtin_mask_for_load ())) 4994 { 4995 tree vectype = STMT_VINFO_VECTYPE (stmt_info); 4996 if ((nested_in_vect_loop 4997 && (TREE_INT_CST_LOW (DR_STEP (dr)) 4998 != GET_MODE_SIZE (TYPE_MODE (vectype)))) 4999 || !loop_vinfo) 5000 return dr_explicit_realign; 5001 else 5002 return dr_explicit_realign_optimized; 5003 } 5004 if (!known_alignment_for_access_p (dr)) 5005 is_packed = not_size_aligned (DR_REF (dr)); 5006 5007 if (targetm.vectorize. 5008 support_vector_misalignment (mode, type, 5009 DR_MISALIGNMENT (dr), is_packed)) 5010 /* Can't software pipeline the loads, but can at least do them. */ 5011 return dr_unaligned_supported; 5012 } 5013 else 5014 { 5015 bool is_packed = false; 5016 tree type = (TREE_TYPE (DR_REF (dr))); 5017 5018 if (!known_alignment_for_access_p (dr)) 5019 is_packed = not_size_aligned (DR_REF (dr)); 5020 5021 if (targetm.vectorize. 5022 support_vector_misalignment (mode, type, 5023 DR_MISALIGNMENT (dr), is_packed)) 5024 return dr_unaligned_supported; 5025 } 5026 5027 /* Unsupported. */ 5028 return dr_unaligned_unsupported; 5029 } 5030