1 /* Loop autoparallelization. 2 Copyright (C) 2006-2013 Free Software Foundation, Inc. 3 Contributed by Sebastian Pop <pop@cri.ensmp.fr> 4 Zdenek Dvorak <dvorakz@suse.cz> and Razya Ladelsky <razya@il.ibm.com>. 5 6 This file is part of GCC. 7 8 GCC is free software; you can redistribute it and/or modify it under 9 the terms of the GNU General Public License as published by the Free 10 Software Foundation; either version 3, or (at your option) any later 11 version. 12 13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY 14 WARRANTY; without even the implied warranty of MERCHANTABILITY or 15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 16 for more details. 17 18 You should have received a copy of the GNU General Public License 19 along with GCC; see the file COPYING3. If not see 20 <http://www.gnu.org/licenses/>. */ 21 22 #include "config.h" 23 #include "system.h" 24 #include "coretypes.h" 25 #include "tree-flow.h" 26 #include "cfgloop.h" 27 #include "tree-data-ref.h" 28 #include "tree-scalar-evolution.h" 29 #include "gimple-pretty-print.h" 30 #include "tree-pass.h" 31 #include "langhooks.h" 32 #include "tree-vectorizer.h" 33 34 /* This pass tries to distribute iterations of loops into several threads. 35 The implementation is straightforward -- for each loop we test whether its 36 iterations are independent, and if it is the case (and some additional 37 conditions regarding profitability and correctness are satisfied), we 38 add GIMPLE_OMP_PARALLEL and GIMPLE_OMP_FOR codes and let omp expansion 39 machinery do its job. 40 41 The most of the complexity is in bringing the code into shape expected 42 by the omp expanders: 43 -- for GIMPLE_OMP_FOR, ensuring that the loop has only one induction 44 variable and that the exit test is at the start of the loop body 45 -- for GIMPLE_OMP_PARALLEL, replacing the references to local addressable 46 variables by accesses through pointers, and breaking up ssa chains 47 by storing the values incoming to the parallelized loop to a structure 48 passed to the new function as an argument (something similar is done 49 in omp gimplification, unfortunately only a small part of the code 50 can be shared). 51 52 TODO: 53 -- if there are several parallelizable loops in a function, it may be 54 possible to generate the threads just once (using synchronization to 55 ensure that cross-loop dependences are obeyed). 56 -- handling of common reduction patterns for outer loops. 57 58 More info can also be found at http://gcc.gnu.org/wiki/AutoParInGCC */ 59 /* 60 Reduction handling: 61 currently we use vect_force_simple_reduction() to detect reduction patterns. 62 The code transformation will be introduced by an example. 63 64 65 parloop 66 { 67 int sum=1; 68 69 for (i = 0; i < N; i++) 70 { 71 x[i] = i + 3; 72 sum+=x[i]; 73 } 74 } 75 76 gimple-like code: 77 header_bb: 78 79 # sum_29 = PHI <sum_11(5), 1(3)> 80 # i_28 = PHI <i_12(5), 0(3)> 81 D.1795_8 = i_28 + 3; 82 x[i_28] = D.1795_8; 83 sum_11 = D.1795_8 + sum_29; 84 i_12 = i_28 + 1; 85 if (N_6(D) > i_12) 86 goto header_bb; 87 88 89 exit_bb: 90 91 # sum_21 = PHI <sum_11(4)> 92 printf (&"%d"[0], sum_21); 93 94 95 after reduction transformation (only relevant parts): 96 97 parloop 98 { 99 100 .... 101 102 103 # Storing the initial value given by the user. # 104 105 .paral_data_store.32.sum.27 = 1; 106 107 #pragma omp parallel num_threads(4) 108 109 #pragma omp for schedule(static) 110 111 # The neutral element corresponding to the particular 112 reduction's operation, e.g. 0 for PLUS_EXPR, 113 1 for MULT_EXPR, etc. replaces the user's initial value. # 114 115 # sum.27_29 = PHI <sum.27_11, 0> 116 117 sum.27_11 = D.1827_8 + sum.27_29; 118 119 GIMPLE_OMP_CONTINUE 120 121 # Adding this reduction phi is done at create_phi_for_local_result() # 122 # sum.27_56 = PHI <sum.27_11, 0> 123 GIMPLE_OMP_RETURN 124 125 # Creating the atomic operation is done at 126 create_call_for_reduction_1() # 127 128 #pragma omp atomic_load 129 D.1839_59 = *&.paral_data_load.33_51->reduction.23; 130 D.1840_60 = sum.27_56 + D.1839_59; 131 #pragma omp atomic_store (D.1840_60); 132 133 GIMPLE_OMP_RETURN 134 135 # collecting the result after the join of the threads is done at 136 create_loads_for_reductions(). 137 The value computed by the threads is loaded from the 138 shared struct. # 139 140 141 .paral_data_load.33_52 = &.paral_data_store.32; 142 sum_37 = .paral_data_load.33_52->sum.27; 143 sum_43 = D.1795_41 + sum_37; 144 145 exit bb: 146 # sum_21 = PHI <sum_43, sum_26> 147 printf (&"%d"[0], sum_21); 148 149 ... 150 151 } 152 153 */ 154 155 /* Minimal number of iterations of a loop that should be executed in each 156 thread. */ 157 #define MIN_PER_THREAD 100 158 159 /* Element of the hashtable, representing a 160 reduction in the current loop. */ 161 struct reduction_info 162 { 163 gimple reduc_stmt; /* reduction statement. */ 164 gimple reduc_phi; /* The phi node defining the reduction. */ 165 enum tree_code reduction_code;/* code for the reduction operation. */ 166 unsigned reduc_version; /* SSA_NAME_VERSION of original reduc_phi 167 result. */ 168 gimple keep_res; /* The PHI_RESULT of this phi is the resulting value 169 of the reduction variable when existing the loop. */ 170 tree initial_value; /* The initial value of the reduction var before entering the loop. */ 171 tree field; /* the name of the field in the parloop data structure intended for reduction. */ 172 tree init; /* reduction initialization value. */ 173 gimple new_phi; /* (helper field) Newly created phi node whose result 174 will be passed to the atomic operation. Represents 175 the local result each thread computed for the reduction 176 operation. */ 177 }; 178 179 /* Equality and hash functions for hashtab code. */ 180 181 static int 182 reduction_info_eq (const void *aa, const void *bb) 183 { 184 const struct reduction_info *a = (const struct reduction_info *) aa; 185 const struct reduction_info *b = (const struct reduction_info *) bb; 186 187 return (a->reduc_phi == b->reduc_phi); 188 } 189 190 static hashval_t 191 reduction_info_hash (const void *aa) 192 { 193 const struct reduction_info *a = (const struct reduction_info *) aa; 194 195 return a->reduc_version; 196 } 197 198 static struct reduction_info * 199 reduction_phi (htab_t reduction_list, gimple phi) 200 { 201 struct reduction_info tmpred, *red; 202 203 if (htab_elements (reduction_list) == 0 || phi == NULL) 204 return NULL; 205 206 tmpred.reduc_phi = phi; 207 tmpred.reduc_version = gimple_uid (phi); 208 red = (struct reduction_info *) htab_find (reduction_list, &tmpred); 209 210 return red; 211 } 212 213 /* Element of hashtable of names to copy. */ 214 215 struct name_to_copy_elt 216 { 217 unsigned version; /* The version of the name to copy. */ 218 tree new_name; /* The new name used in the copy. */ 219 tree field; /* The field of the structure used to pass the 220 value. */ 221 }; 222 223 /* Equality and hash functions for hashtab code. */ 224 225 static int 226 name_to_copy_elt_eq (const void *aa, const void *bb) 227 { 228 const struct name_to_copy_elt *a = (const struct name_to_copy_elt *) aa; 229 const struct name_to_copy_elt *b = (const struct name_to_copy_elt *) bb; 230 231 return a->version == b->version; 232 } 233 234 static hashval_t 235 name_to_copy_elt_hash (const void *aa) 236 { 237 const struct name_to_copy_elt *a = (const struct name_to_copy_elt *) aa; 238 239 return (hashval_t) a->version; 240 } 241 242 /* A transformation matrix, which is a self-contained ROWSIZE x COLSIZE 243 matrix. Rather than use floats, we simply keep a single DENOMINATOR that 244 represents the denominator for every element in the matrix. */ 245 typedef struct lambda_trans_matrix_s 246 { 247 lambda_matrix matrix; 248 int rowsize; 249 int colsize; 250 int denominator; 251 } *lambda_trans_matrix; 252 #define LTM_MATRIX(T) ((T)->matrix) 253 #define LTM_ROWSIZE(T) ((T)->rowsize) 254 #define LTM_COLSIZE(T) ((T)->colsize) 255 #define LTM_DENOMINATOR(T) ((T)->denominator) 256 257 /* Allocate a new transformation matrix. */ 258 259 static lambda_trans_matrix 260 lambda_trans_matrix_new (int colsize, int rowsize, 261 struct obstack * lambda_obstack) 262 { 263 lambda_trans_matrix ret; 264 265 ret = (lambda_trans_matrix) 266 obstack_alloc (lambda_obstack, sizeof (struct lambda_trans_matrix_s)); 267 LTM_MATRIX (ret) = lambda_matrix_new (rowsize, colsize, lambda_obstack); 268 LTM_ROWSIZE (ret) = rowsize; 269 LTM_COLSIZE (ret) = colsize; 270 LTM_DENOMINATOR (ret) = 1; 271 return ret; 272 } 273 274 /* Multiply a vector VEC by a matrix MAT. 275 MAT is an M*N matrix, and VEC is a vector with length N. The result 276 is stored in DEST which must be a vector of length M. */ 277 278 static void 279 lambda_matrix_vector_mult (lambda_matrix matrix, int m, int n, 280 lambda_vector vec, lambda_vector dest) 281 { 282 int i, j; 283 284 lambda_vector_clear (dest, m); 285 for (i = 0; i < m; i++) 286 for (j = 0; j < n; j++) 287 dest[i] += matrix[i][j] * vec[j]; 288 } 289 290 /* Return true if TRANS is a legal transformation matrix that respects 291 the dependence vectors in DISTS and DIRS. The conservative answer 292 is false. 293 294 "Wolfe proves that a unimodular transformation represented by the 295 matrix T is legal when applied to a loop nest with a set of 296 lexicographically non-negative distance vectors RDG if and only if 297 for each vector d in RDG, (T.d >= 0) is lexicographically positive. 298 i.e.: if and only if it transforms the lexicographically positive 299 distance vectors to lexicographically positive vectors. Note that 300 a unimodular matrix must transform the zero vector (and only it) to 301 the zero vector." S.Muchnick. */ 302 303 static bool 304 lambda_transform_legal_p (lambda_trans_matrix trans, 305 int nb_loops, 306 vec<ddr_p> dependence_relations) 307 { 308 unsigned int i, j; 309 lambda_vector distres; 310 struct data_dependence_relation *ddr; 311 312 gcc_assert (LTM_COLSIZE (trans) == nb_loops 313 && LTM_ROWSIZE (trans) == nb_loops); 314 315 /* When there are no dependences, the transformation is correct. */ 316 if (dependence_relations.length () == 0) 317 return true; 318 319 ddr = dependence_relations[0]; 320 if (ddr == NULL) 321 return true; 322 323 /* When there is an unknown relation in the dependence_relations, we 324 know that it is no worth looking at this loop nest: give up. */ 325 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know) 326 return false; 327 328 distres = lambda_vector_new (nb_loops); 329 330 /* For each distance vector in the dependence graph. */ 331 FOR_EACH_VEC_ELT (dependence_relations, i, ddr) 332 { 333 /* Don't care about relations for which we know that there is no 334 dependence, nor about read-read (aka. output-dependences): 335 these data accesses can happen in any order. */ 336 if (DDR_ARE_DEPENDENT (ddr) == chrec_known 337 || (DR_IS_READ (DDR_A (ddr)) && DR_IS_READ (DDR_B (ddr)))) 338 continue; 339 340 /* Conservatively answer: "this transformation is not valid". */ 341 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know) 342 return false; 343 344 /* If the dependence could not be captured by a distance vector, 345 conservatively answer that the transform is not valid. */ 346 if (DDR_NUM_DIST_VECTS (ddr) == 0) 347 return false; 348 349 /* Compute trans.dist_vect */ 350 for (j = 0; j < DDR_NUM_DIST_VECTS (ddr); j++) 351 { 352 lambda_matrix_vector_mult (LTM_MATRIX (trans), nb_loops, nb_loops, 353 DDR_DIST_VECT (ddr, j), distres); 354 355 if (!lambda_vector_lexico_pos (distres, nb_loops)) 356 return false; 357 } 358 } 359 return true; 360 } 361 362 /* Data dependency analysis. Returns true if the iterations of LOOP 363 are independent on each other (that is, if we can execute them 364 in parallel). */ 365 366 static bool 367 loop_parallel_p (struct loop *loop, struct obstack * parloop_obstack) 368 { 369 vec<loop_p> loop_nest; 370 vec<ddr_p> dependence_relations; 371 vec<data_reference_p> datarefs; 372 lambda_trans_matrix trans; 373 bool ret = false; 374 375 if (dump_file && (dump_flags & TDF_DETAILS)) 376 { 377 fprintf (dump_file, "Considering loop %d\n", loop->num); 378 if (!loop->inner) 379 fprintf (dump_file, "loop is innermost\n"); 380 else 381 fprintf (dump_file, "loop NOT innermost\n"); 382 } 383 384 /* Check for problems with dependences. If the loop can be reversed, 385 the iterations are independent. */ 386 datarefs.create (10); 387 dependence_relations.create (10 * 10); 388 loop_nest.create (3); 389 if (! compute_data_dependences_for_loop (loop, true, &loop_nest, &datarefs, 390 &dependence_relations)) 391 { 392 if (dump_file && (dump_flags & TDF_DETAILS)) 393 fprintf (dump_file, " FAILED: cannot analyze data dependencies\n"); 394 ret = false; 395 goto end; 396 } 397 if (dump_file && (dump_flags & TDF_DETAILS)) 398 dump_data_dependence_relations (dump_file, dependence_relations); 399 400 trans = lambda_trans_matrix_new (1, 1, parloop_obstack); 401 LTM_MATRIX (trans)[0][0] = -1; 402 403 if (lambda_transform_legal_p (trans, 1, dependence_relations)) 404 { 405 ret = true; 406 if (dump_file && (dump_flags & TDF_DETAILS)) 407 fprintf (dump_file, " SUCCESS: may be parallelized\n"); 408 } 409 else if (dump_file && (dump_flags & TDF_DETAILS)) 410 fprintf (dump_file, 411 " FAILED: data dependencies exist across iterations\n"); 412 413 end: 414 loop_nest.release (); 415 free_dependence_relations (dependence_relations); 416 free_data_refs (datarefs); 417 418 return ret; 419 } 420 421 /* Return true when LOOP contains basic blocks marked with the 422 BB_IRREDUCIBLE_LOOP flag. */ 423 424 static inline bool 425 loop_has_blocks_with_irreducible_flag (struct loop *loop) 426 { 427 unsigned i; 428 basic_block *bbs = get_loop_body_in_dom_order (loop); 429 bool res = true; 430 431 for (i = 0; i < loop->num_nodes; i++) 432 if (bbs[i]->flags & BB_IRREDUCIBLE_LOOP) 433 goto end; 434 435 res = false; 436 end: 437 free (bbs); 438 return res; 439 } 440 441 /* Assigns the address of OBJ in TYPE to an ssa name, and returns this name. 442 The assignment statement is placed on edge ENTRY. DECL_ADDRESS maps decls 443 to their addresses that can be reused. The address of OBJ is known to 444 be invariant in the whole function. Other needed statements are placed 445 right before GSI. */ 446 447 static tree 448 take_address_of (tree obj, tree type, edge entry, htab_t decl_address, 449 gimple_stmt_iterator *gsi) 450 { 451 int uid; 452 void **dslot; 453 struct int_tree_map ielt, *nielt; 454 tree *var_p, name, addr; 455 gimple stmt; 456 gimple_seq stmts; 457 458 /* Since the address of OBJ is invariant, the trees may be shared. 459 Avoid rewriting unrelated parts of the code. */ 460 obj = unshare_expr (obj); 461 for (var_p = &obj; 462 handled_component_p (*var_p); 463 var_p = &TREE_OPERAND (*var_p, 0)) 464 continue; 465 466 /* Canonicalize the access to base on a MEM_REF. */ 467 if (DECL_P (*var_p)) 468 *var_p = build_simple_mem_ref (build_fold_addr_expr (*var_p)); 469 470 /* Assign a canonical SSA name to the address of the base decl used 471 in the address and share it for all accesses and addresses based 472 on it. */ 473 uid = DECL_UID (TREE_OPERAND (TREE_OPERAND (*var_p, 0), 0)); 474 ielt.uid = uid; 475 dslot = htab_find_slot_with_hash (decl_address, &ielt, uid, INSERT); 476 if (!*dslot) 477 { 478 if (gsi == NULL) 479 return NULL; 480 addr = TREE_OPERAND (*var_p, 0); 481 const char *obj_name 482 = get_name (TREE_OPERAND (TREE_OPERAND (*var_p, 0), 0)); 483 if (obj_name) 484 name = make_temp_ssa_name (TREE_TYPE (addr), NULL, obj_name); 485 else 486 name = make_ssa_name (TREE_TYPE (addr), NULL); 487 stmt = gimple_build_assign (name, addr); 488 gsi_insert_on_edge_immediate (entry, stmt); 489 490 nielt = XNEW (struct int_tree_map); 491 nielt->uid = uid; 492 nielt->to = name; 493 *dslot = nielt; 494 } 495 else 496 name = ((struct int_tree_map *) *dslot)->to; 497 498 /* Express the address in terms of the canonical SSA name. */ 499 TREE_OPERAND (*var_p, 0) = name; 500 if (gsi == NULL) 501 return build_fold_addr_expr_with_type (obj, type); 502 503 name = force_gimple_operand (build_addr (obj, current_function_decl), 504 &stmts, true, NULL_TREE); 505 if (!gimple_seq_empty_p (stmts)) 506 gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT); 507 508 if (!useless_type_conversion_p (type, TREE_TYPE (name))) 509 { 510 name = force_gimple_operand (fold_convert (type, name), &stmts, true, 511 NULL_TREE); 512 if (!gimple_seq_empty_p (stmts)) 513 gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT); 514 } 515 516 return name; 517 } 518 519 /* Callback for htab_traverse. Create the initialization statement 520 for reduction described in SLOT, and place it at the preheader of 521 the loop described in DATA. */ 522 523 static int 524 initialize_reductions (void **slot, void *data) 525 { 526 tree init, c; 527 tree bvar, type, arg; 528 edge e; 529 530 struct reduction_info *const reduc = (struct reduction_info *) *slot; 531 struct loop *loop = (struct loop *) data; 532 533 /* Create initialization in preheader: 534 reduction_variable = initialization value of reduction. */ 535 536 /* In the phi node at the header, replace the argument coming 537 from the preheader with the reduction initialization value. */ 538 539 /* Create a new variable to initialize the reduction. */ 540 type = TREE_TYPE (PHI_RESULT (reduc->reduc_phi)); 541 bvar = create_tmp_var (type, "reduction"); 542 543 c = build_omp_clause (gimple_location (reduc->reduc_stmt), 544 OMP_CLAUSE_REDUCTION); 545 OMP_CLAUSE_REDUCTION_CODE (c) = reduc->reduction_code; 546 OMP_CLAUSE_DECL (c) = SSA_NAME_VAR (gimple_assign_lhs (reduc->reduc_stmt)); 547 548 init = omp_reduction_init (c, TREE_TYPE (bvar)); 549 reduc->init = init; 550 551 /* Replace the argument representing the initialization value 552 with the initialization value for the reduction (neutral 553 element for the particular operation, e.g. 0 for PLUS_EXPR, 554 1 for MULT_EXPR, etc). 555 Keep the old value in a new variable "reduction_initial", 556 that will be taken in consideration after the parallel 557 computing is done. */ 558 559 e = loop_preheader_edge (loop); 560 arg = PHI_ARG_DEF_FROM_EDGE (reduc->reduc_phi, e); 561 /* Create new variable to hold the initial value. */ 562 563 SET_USE (PHI_ARG_DEF_PTR_FROM_EDGE 564 (reduc->reduc_phi, loop_preheader_edge (loop)), init); 565 reduc->initial_value = arg; 566 return 1; 567 } 568 569 struct elv_data 570 { 571 struct walk_stmt_info info; 572 edge entry; 573 htab_t decl_address; 574 gimple_stmt_iterator *gsi; 575 bool changed; 576 bool reset; 577 }; 578 579 /* Eliminates references to local variables in *TP out of the single 580 entry single exit region starting at DTA->ENTRY. 581 DECL_ADDRESS contains addresses of the references that had their 582 address taken already. If the expression is changed, CHANGED is 583 set to true. Callback for walk_tree. */ 584 585 static tree 586 eliminate_local_variables_1 (tree *tp, int *walk_subtrees, void *data) 587 { 588 struct elv_data *const dta = (struct elv_data *) data; 589 tree t = *tp, var, addr, addr_type, type, obj; 590 591 if (DECL_P (t)) 592 { 593 *walk_subtrees = 0; 594 595 if (!SSA_VAR_P (t) || DECL_EXTERNAL (t)) 596 return NULL_TREE; 597 598 type = TREE_TYPE (t); 599 addr_type = build_pointer_type (type); 600 addr = take_address_of (t, addr_type, dta->entry, dta->decl_address, 601 dta->gsi); 602 if (dta->gsi == NULL && addr == NULL_TREE) 603 { 604 dta->reset = true; 605 return NULL_TREE; 606 } 607 608 *tp = build_simple_mem_ref (addr); 609 610 dta->changed = true; 611 return NULL_TREE; 612 } 613 614 if (TREE_CODE (t) == ADDR_EXPR) 615 { 616 /* ADDR_EXPR may appear in two contexts: 617 -- as a gimple operand, when the address taken is a function invariant 618 -- as gimple rhs, when the resulting address in not a function 619 invariant 620 We do not need to do anything special in the latter case (the base of 621 the memory reference whose address is taken may be replaced in the 622 DECL_P case). The former case is more complicated, as we need to 623 ensure that the new address is still a gimple operand. Thus, it 624 is not sufficient to replace just the base of the memory reference -- 625 we need to move the whole computation of the address out of the 626 loop. */ 627 if (!is_gimple_val (t)) 628 return NULL_TREE; 629 630 *walk_subtrees = 0; 631 obj = TREE_OPERAND (t, 0); 632 var = get_base_address (obj); 633 if (!var || !SSA_VAR_P (var) || DECL_EXTERNAL (var)) 634 return NULL_TREE; 635 636 addr_type = TREE_TYPE (t); 637 addr = take_address_of (obj, addr_type, dta->entry, dta->decl_address, 638 dta->gsi); 639 if (dta->gsi == NULL && addr == NULL_TREE) 640 { 641 dta->reset = true; 642 return NULL_TREE; 643 } 644 *tp = addr; 645 646 dta->changed = true; 647 return NULL_TREE; 648 } 649 650 if (!EXPR_P (t)) 651 *walk_subtrees = 0; 652 653 return NULL_TREE; 654 } 655 656 /* Moves the references to local variables in STMT at *GSI out of the single 657 entry single exit region starting at ENTRY. DECL_ADDRESS contains 658 addresses of the references that had their address taken 659 already. */ 660 661 static void 662 eliminate_local_variables_stmt (edge entry, gimple_stmt_iterator *gsi, 663 htab_t decl_address) 664 { 665 struct elv_data dta; 666 gimple stmt = gsi_stmt (*gsi); 667 668 memset (&dta.info, '\0', sizeof (dta.info)); 669 dta.entry = entry; 670 dta.decl_address = decl_address; 671 dta.changed = false; 672 dta.reset = false; 673 674 if (gimple_debug_bind_p (stmt)) 675 { 676 dta.gsi = NULL; 677 walk_tree (gimple_debug_bind_get_value_ptr (stmt), 678 eliminate_local_variables_1, &dta.info, NULL); 679 if (dta.reset) 680 { 681 gimple_debug_bind_reset_value (stmt); 682 dta.changed = true; 683 } 684 } 685 else if (gimple_clobber_p (stmt)) 686 { 687 stmt = gimple_build_nop (); 688 gsi_replace (gsi, stmt, false); 689 dta.changed = true; 690 } 691 else 692 { 693 dta.gsi = gsi; 694 walk_gimple_op (stmt, eliminate_local_variables_1, &dta.info); 695 } 696 697 if (dta.changed) 698 update_stmt (stmt); 699 } 700 701 /* Eliminates the references to local variables from the single entry 702 single exit region between the ENTRY and EXIT edges. 703 704 This includes: 705 1) Taking address of a local variable -- these are moved out of the 706 region (and temporary variable is created to hold the address if 707 necessary). 708 709 2) Dereferencing a local variable -- these are replaced with indirect 710 references. */ 711 712 static void 713 eliminate_local_variables (edge entry, edge exit) 714 { 715 basic_block bb; 716 vec<basic_block> body; 717 body.create (3); 718 unsigned i; 719 gimple_stmt_iterator gsi; 720 bool has_debug_stmt = false; 721 htab_t decl_address = htab_create (10, int_tree_map_hash, int_tree_map_eq, 722 free); 723 basic_block entry_bb = entry->src; 724 basic_block exit_bb = exit->dest; 725 726 gather_blocks_in_sese_region (entry_bb, exit_bb, &body); 727 728 FOR_EACH_VEC_ELT (body, i, bb) 729 if (bb != entry_bb && bb != exit_bb) 730 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi)) 731 if (is_gimple_debug (gsi_stmt (gsi))) 732 { 733 if (gimple_debug_bind_p (gsi_stmt (gsi))) 734 has_debug_stmt = true; 735 } 736 else 737 eliminate_local_variables_stmt (entry, &gsi, decl_address); 738 739 if (has_debug_stmt) 740 FOR_EACH_VEC_ELT (body, i, bb) 741 if (bb != entry_bb && bb != exit_bb) 742 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi)) 743 if (gimple_debug_bind_p (gsi_stmt (gsi))) 744 eliminate_local_variables_stmt (entry, &gsi, decl_address); 745 746 htab_delete (decl_address); 747 body.release (); 748 } 749 750 /* Returns true if expression EXPR is not defined between ENTRY and 751 EXIT, i.e. if all its operands are defined outside of the region. */ 752 753 static bool 754 expr_invariant_in_region_p (edge entry, edge exit, tree expr) 755 { 756 basic_block entry_bb = entry->src; 757 basic_block exit_bb = exit->dest; 758 basic_block def_bb; 759 760 if (is_gimple_min_invariant (expr)) 761 return true; 762 763 if (TREE_CODE (expr) == SSA_NAME) 764 { 765 def_bb = gimple_bb (SSA_NAME_DEF_STMT (expr)); 766 if (def_bb 767 && dominated_by_p (CDI_DOMINATORS, def_bb, entry_bb) 768 && !dominated_by_p (CDI_DOMINATORS, def_bb, exit_bb)) 769 return false; 770 771 return true; 772 } 773 774 return false; 775 } 776 777 /* If COPY_NAME_P is true, creates and returns a duplicate of NAME. 778 The copies are stored to NAME_COPIES, if NAME was already duplicated, 779 its duplicate stored in NAME_COPIES is returned. 780 781 Regardless of COPY_NAME_P, the decl used as a base of the ssa name is also 782 duplicated, storing the copies in DECL_COPIES. */ 783 784 static tree 785 separate_decls_in_region_name (tree name, 786 htab_t name_copies, htab_t decl_copies, 787 bool copy_name_p) 788 { 789 tree copy, var, var_copy; 790 unsigned idx, uid, nuid; 791 struct int_tree_map ielt, *nielt; 792 struct name_to_copy_elt elt, *nelt; 793 void **slot, **dslot; 794 795 if (TREE_CODE (name) != SSA_NAME) 796 return name; 797 798 idx = SSA_NAME_VERSION (name); 799 elt.version = idx; 800 slot = htab_find_slot_with_hash (name_copies, &elt, idx, 801 copy_name_p ? INSERT : NO_INSERT); 802 if (slot && *slot) 803 return ((struct name_to_copy_elt *) *slot)->new_name; 804 805 if (copy_name_p) 806 { 807 copy = duplicate_ssa_name (name, NULL); 808 nelt = XNEW (struct name_to_copy_elt); 809 nelt->version = idx; 810 nelt->new_name = copy; 811 nelt->field = NULL_TREE; 812 *slot = nelt; 813 } 814 else 815 { 816 gcc_assert (!slot); 817 copy = name; 818 } 819 820 var = SSA_NAME_VAR (name); 821 if (!var) 822 return copy; 823 824 uid = DECL_UID (var); 825 ielt.uid = uid; 826 dslot = htab_find_slot_with_hash (decl_copies, &ielt, uid, INSERT); 827 if (!*dslot) 828 { 829 var_copy = create_tmp_var (TREE_TYPE (var), get_name (var)); 830 DECL_GIMPLE_REG_P (var_copy) = DECL_GIMPLE_REG_P (var); 831 nielt = XNEW (struct int_tree_map); 832 nielt->uid = uid; 833 nielt->to = var_copy; 834 *dslot = nielt; 835 836 /* Ensure that when we meet this decl next time, we won't duplicate 837 it again. */ 838 nuid = DECL_UID (var_copy); 839 ielt.uid = nuid; 840 dslot = htab_find_slot_with_hash (decl_copies, &ielt, nuid, INSERT); 841 gcc_assert (!*dslot); 842 nielt = XNEW (struct int_tree_map); 843 nielt->uid = nuid; 844 nielt->to = var_copy; 845 *dslot = nielt; 846 } 847 else 848 var_copy = ((struct int_tree_map *) *dslot)->to; 849 850 replace_ssa_name_symbol (copy, var_copy); 851 return copy; 852 } 853 854 /* Finds the ssa names used in STMT that are defined outside the 855 region between ENTRY and EXIT and replaces such ssa names with 856 their duplicates. The duplicates are stored to NAME_COPIES. Base 857 decls of all ssa names used in STMT (including those defined in 858 LOOP) are replaced with the new temporary variables; the 859 replacement decls are stored in DECL_COPIES. */ 860 861 static void 862 separate_decls_in_region_stmt (edge entry, edge exit, gimple stmt, 863 htab_t name_copies, htab_t decl_copies) 864 { 865 use_operand_p use; 866 def_operand_p def; 867 ssa_op_iter oi; 868 tree name, copy; 869 bool copy_name_p; 870 871 FOR_EACH_PHI_OR_STMT_DEF (def, stmt, oi, SSA_OP_DEF) 872 { 873 name = DEF_FROM_PTR (def); 874 gcc_assert (TREE_CODE (name) == SSA_NAME); 875 copy = separate_decls_in_region_name (name, name_copies, decl_copies, 876 false); 877 gcc_assert (copy == name); 878 } 879 880 FOR_EACH_PHI_OR_STMT_USE (use, stmt, oi, SSA_OP_USE) 881 { 882 name = USE_FROM_PTR (use); 883 if (TREE_CODE (name) != SSA_NAME) 884 continue; 885 886 copy_name_p = expr_invariant_in_region_p (entry, exit, name); 887 copy = separate_decls_in_region_name (name, name_copies, decl_copies, 888 copy_name_p); 889 SET_USE (use, copy); 890 } 891 } 892 893 /* Finds the ssa names used in STMT that are defined outside the 894 region between ENTRY and EXIT and replaces such ssa names with 895 their duplicates. The duplicates are stored to NAME_COPIES. Base 896 decls of all ssa names used in STMT (including those defined in 897 LOOP) are replaced with the new temporary variables; the 898 replacement decls are stored in DECL_COPIES. */ 899 900 static bool 901 separate_decls_in_region_debug (gimple stmt, htab_t name_copies, 902 htab_t decl_copies) 903 { 904 use_operand_p use; 905 ssa_op_iter oi; 906 tree var, name; 907 struct int_tree_map ielt; 908 struct name_to_copy_elt elt; 909 void **slot, **dslot; 910 911 if (gimple_debug_bind_p (stmt)) 912 var = gimple_debug_bind_get_var (stmt); 913 else if (gimple_debug_source_bind_p (stmt)) 914 var = gimple_debug_source_bind_get_var (stmt); 915 else 916 return true; 917 if (TREE_CODE (var) == DEBUG_EXPR_DECL || TREE_CODE (var) == LABEL_DECL) 918 return true; 919 gcc_assert (DECL_P (var) && SSA_VAR_P (var)); 920 ielt.uid = DECL_UID (var); 921 dslot = htab_find_slot_with_hash (decl_copies, &ielt, ielt.uid, NO_INSERT); 922 if (!dslot) 923 return true; 924 if (gimple_debug_bind_p (stmt)) 925 gimple_debug_bind_set_var (stmt, ((struct int_tree_map *) *dslot)->to); 926 else if (gimple_debug_source_bind_p (stmt)) 927 gimple_debug_source_bind_set_var (stmt, ((struct int_tree_map *) *dslot)->to); 928 929 FOR_EACH_PHI_OR_STMT_USE (use, stmt, oi, SSA_OP_USE) 930 { 931 name = USE_FROM_PTR (use); 932 if (TREE_CODE (name) != SSA_NAME) 933 continue; 934 935 elt.version = SSA_NAME_VERSION (name); 936 slot = htab_find_slot_with_hash (name_copies, &elt, elt.version, NO_INSERT); 937 if (!slot) 938 { 939 gimple_debug_bind_reset_value (stmt); 940 update_stmt (stmt); 941 break; 942 } 943 944 SET_USE (use, ((struct name_to_copy_elt *) *slot)->new_name); 945 } 946 947 return false; 948 } 949 950 /* Callback for htab_traverse. Adds a field corresponding to the reduction 951 specified in SLOT. The type is passed in DATA. */ 952 953 static int 954 add_field_for_reduction (void **slot, void *data) 955 { 956 957 struct reduction_info *const red = (struct reduction_info *) *slot; 958 tree const type = (tree) data; 959 tree var = gimple_assign_lhs (red->reduc_stmt); 960 tree field = build_decl (gimple_location (red->reduc_stmt), FIELD_DECL, 961 SSA_NAME_IDENTIFIER (var), TREE_TYPE (var)); 962 963 insert_field_into_struct (type, field); 964 965 red->field = field; 966 967 return 1; 968 } 969 970 /* Callback for htab_traverse. Adds a field corresponding to a ssa name 971 described in SLOT. The type is passed in DATA. */ 972 973 static int 974 add_field_for_name (void **slot, void *data) 975 { 976 struct name_to_copy_elt *const elt = (struct name_to_copy_elt *) *slot; 977 tree type = (tree) data; 978 tree name = ssa_name (elt->version); 979 tree field = build_decl (UNKNOWN_LOCATION, 980 FIELD_DECL, SSA_NAME_IDENTIFIER (name), 981 TREE_TYPE (name)); 982 983 insert_field_into_struct (type, field); 984 elt->field = field; 985 986 return 1; 987 } 988 989 /* Callback for htab_traverse. A local result is the intermediate result 990 computed by a single 991 thread, or the initial value in case no iteration was executed. 992 This function creates a phi node reflecting these values. 993 The phi's result will be stored in NEW_PHI field of the 994 reduction's data structure. */ 995 996 static int 997 create_phi_for_local_result (void **slot, void *data) 998 { 999 struct reduction_info *const reduc = (struct reduction_info *) *slot; 1000 const struct loop *const loop = (const struct loop *) data; 1001 edge e; 1002 gimple new_phi; 1003 basic_block store_bb; 1004 tree local_res; 1005 source_location locus; 1006 1007 /* STORE_BB is the block where the phi 1008 should be stored. It is the destination of the loop exit. 1009 (Find the fallthru edge from GIMPLE_OMP_CONTINUE). */ 1010 store_bb = FALLTHRU_EDGE (loop->latch)->dest; 1011 1012 /* STORE_BB has two predecessors. One coming from the loop 1013 (the reduction's result is computed at the loop), 1014 and another coming from a block preceding the loop, 1015 when no iterations 1016 are executed (the initial value should be taken). */ 1017 if (EDGE_PRED (store_bb, 0) == FALLTHRU_EDGE (loop->latch)) 1018 e = EDGE_PRED (store_bb, 1); 1019 else 1020 e = EDGE_PRED (store_bb, 0); 1021 local_res = copy_ssa_name (gimple_assign_lhs (reduc->reduc_stmt), NULL); 1022 locus = gimple_location (reduc->reduc_stmt); 1023 new_phi = create_phi_node (local_res, store_bb); 1024 add_phi_arg (new_phi, reduc->init, e, locus); 1025 add_phi_arg (new_phi, gimple_assign_lhs (reduc->reduc_stmt), 1026 FALLTHRU_EDGE (loop->latch), locus); 1027 reduc->new_phi = new_phi; 1028 1029 return 1; 1030 } 1031 1032 struct clsn_data 1033 { 1034 tree store; 1035 tree load; 1036 1037 basic_block store_bb; 1038 basic_block load_bb; 1039 }; 1040 1041 /* Callback for htab_traverse. Create an atomic instruction for the 1042 reduction described in SLOT. 1043 DATA annotates the place in memory the atomic operation relates to, 1044 and the basic block it needs to be generated in. */ 1045 1046 static int 1047 create_call_for_reduction_1 (void **slot, void *data) 1048 { 1049 struct reduction_info *const reduc = (struct reduction_info *) *slot; 1050 struct clsn_data *const clsn_data = (struct clsn_data *) data; 1051 gimple_stmt_iterator gsi; 1052 tree type = TREE_TYPE (PHI_RESULT (reduc->reduc_phi)); 1053 tree load_struct; 1054 basic_block bb; 1055 basic_block new_bb; 1056 edge e; 1057 tree t, addr, ref, x; 1058 tree tmp_load, name; 1059 gimple load; 1060 1061 load_struct = build_simple_mem_ref (clsn_data->load); 1062 t = build3 (COMPONENT_REF, type, load_struct, reduc->field, NULL_TREE); 1063 1064 addr = build_addr (t, current_function_decl); 1065 1066 /* Create phi node. */ 1067 bb = clsn_data->load_bb; 1068 1069 e = split_block (bb, t); 1070 new_bb = e->dest; 1071 1072 tmp_load = create_tmp_var (TREE_TYPE (TREE_TYPE (addr)), NULL); 1073 tmp_load = make_ssa_name (tmp_load, NULL); 1074 load = gimple_build_omp_atomic_load (tmp_load, addr); 1075 SSA_NAME_DEF_STMT (tmp_load) = load; 1076 gsi = gsi_start_bb (new_bb); 1077 gsi_insert_after (&gsi, load, GSI_NEW_STMT); 1078 1079 e = split_block (new_bb, load); 1080 new_bb = e->dest; 1081 gsi = gsi_start_bb (new_bb); 1082 ref = tmp_load; 1083 x = fold_build2 (reduc->reduction_code, 1084 TREE_TYPE (PHI_RESULT (reduc->new_phi)), ref, 1085 PHI_RESULT (reduc->new_phi)); 1086 1087 name = force_gimple_operand_gsi (&gsi, x, true, NULL_TREE, true, 1088 GSI_CONTINUE_LINKING); 1089 1090 gsi_insert_after (&gsi, gimple_build_omp_atomic_store (name), GSI_NEW_STMT); 1091 return 1; 1092 } 1093 1094 /* Create the atomic operation at the join point of the threads. 1095 REDUCTION_LIST describes the reductions in the LOOP. 1096 LD_ST_DATA describes the shared data structure where 1097 shared data is stored in and loaded from. */ 1098 static void 1099 create_call_for_reduction (struct loop *loop, htab_t reduction_list, 1100 struct clsn_data *ld_st_data) 1101 { 1102 htab_traverse (reduction_list, create_phi_for_local_result, loop); 1103 /* Find the fallthru edge from GIMPLE_OMP_CONTINUE. */ 1104 ld_st_data->load_bb = FALLTHRU_EDGE (loop->latch)->dest; 1105 htab_traverse (reduction_list, create_call_for_reduction_1, ld_st_data); 1106 } 1107 1108 /* Callback for htab_traverse. Loads the final reduction value at the 1109 join point of all threads, and inserts it in the right place. */ 1110 1111 static int 1112 create_loads_for_reductions (void **slot, void *data) 1113 { 1114 struct reduction_info *const red = (struct reduction_info *) *slot; 1115 struct clsn_data *const clsn_data = (struct clsn_data *) data; 1116 gimple stmt; 1117 gimple_stmt_iterator gsi; 1118 tree type = TREE_TYPE (gimple_assign_lhs (red->reduc_stmt)); 1119 tree load_struct; 1120 tree name; 1121 tree x; 1122 1123 gsi = gsi_after_labels (clsn_data->load_bb); 1124 load_struct = build_simple_mem_ref (clsn_data->load); 1125 load_struct = build3 (COMPONENT_REF, type, load_struct, red->field, 1126 NULL_TREE); 1127 1128 x = load_struct; 1129 name = PHI_RESULT (red->keep_res); 1130 stmt = gimple_build_assign (name, x); 1131 SSA_NAME_DEF_STMT (name) = stmt; 1132 1133 gsi_insert_after (&gsi, stmt, GSI_NEW_STMT); 1134 1135 for (gsi = gsi_start_phis (gimple_bb (red->keep_res)); 1136 !gsi_end_p (gsi); gsi_next (&gsi)) 1137 if (gsi_stmt (gsi) == red->keep_res) 1138 { 1139 remove_phi_node (&gsi, false); 1140 return 1; 1141 } 1142 gcc_unreachable (); 1143 } 1144 1145 /* Load the reduction result that was stored in LD_ST_DATA. 1146 REDUCTION_LIST describes the list of reductions that the 1147 loads should be generated for. */ 1148 static void 1149 create_final_loads_for_reduction (htab_t reduction_list, 1150 struct clsn_data *ld_st_data) 1151 { 1152 gimple_stmt_iterator gsi; 1153 tree t; 1154 gimple stmt; 1155 1156 gsi = gsi_after_labels (ld_st_data->load_bb); 1157 t = build_fold_addr_expr (ld_st_data->store); 1158 stmt = gimple_build_assign (ld_st_data->load, t); 1159 1160 gsi_insert_before (&gsi, stmt, GSI_NEW_STMT); 1161 SSA_NAME_DEF_STMT (ld_st_data->load) = stmt; 1162 1163 htab_traverse (reduction_list, create_loads_for_reductions, ld_st_data); 1164 1165 } 1166 1167 /* Callback for htab_traverse. Store the neutral value for the 1168 particular reduction's operation, e.g. 0 for PLUS_EXPR, 1169 1 for MULT_EXPR, etc. into the reduction field. 1170 The reduction is specified in SLOT. The store information is 1171 passed in DATA. */ 1172 1173 static int 1174 create_stores_for_reduction (void **slot, void *data) 1175 { 1176 struct reduction_info *const red = (struct reduction_info *) *slot; 1177 struct clsn_data *const clsn_data = (struct clsn_data *) data; 1178 tree t; 1179 gimple stmt; 1180 gimple_stmt_iterator gsi; 1181 tree type = TREE_TYPE (gimple_assign_lhs (red->reduc_stmt)); 1182 1183 gsi = gsi_last_bb (clsn_data->store_bb); 1184 t = build3 (COMPONENT_REF, type, clsn_data->store, red->field, NULL_TREE); 1185 stmt = gimple_build_assign (t, red->initial_value); 1186 gsi_insert_after (&gsi, stmt, GSI_NEW_STMT); 1187 1188 return 1; 1189 } 1190 1191 /* Callback for htab_traverse. Creates loads to a field of LOAD in LOAD_BB and 1192 store to a field of STORE in STORE_BB for the ssa name and its duplicate 1193 specified in SLOT. */ 1194 1195 static int 1196 create_loads_and_stores_for_name (void **slot, void *data) 1197 { 1198 struct name_to_copy_elt *const elt = (struct name_to_copy_elt *) *slot; 1199 struct clsn_data *const clsn_data = (struct clsn_data *) data; 1200 tree t; 1201 gimple stmt; 1202 gimple_stmt_iterator gsi; 1203 tree type = TREE_TYPE (elt->new_name); 1204 tree load_struct; 1205 1206 gsi = gsi_last_bb (clsn_data->store_bb); 1207 t = build3 (COMPONENT_REF, type, clsn_data->store, elt->field, NULL_TREE); 1208 stmt = gimple_build_assign (t, ssa_name (elt->version)); 1209 gsi_insert_after (&gsi, stmt, GSI_NEW_STMT); 1210 1211 gsi = gsi_last_bb (clsn_data->load_bb); 1212 load_struct = build_simple_mem_ref (clsn_data->load); 1213 t = build3 (COMPONENT_REF, type, load_struct, elt->field, NULL_TREE); 1214 stmt = gimple_build_assign (elt->new_name, t); 1215 SSA_NAME_DEF_STMT (elt->new_name) = stmt; 1216 gsi_insert_after (&gsi, stmt, GSI_NEW_STMT); 1217 1218 return 1; 1219 } 1220 1221 /* Moves all the variables used in LOOP and defined outside of it (including 1222 the initial values of loop phi nodes, and *PER_THREAD if it is a ssa 1223 name) to a structure created for this purpose. The code 1224 1225 while (1) 1226 { 1227 use (a); 1228 use (b); 1229 } 1230 1231 is transformed this way: 1232 1233 bb0: 1234 old.a = a; 1235 old.b = b; 1236 1237 bb1: 1238 a' = new->a; 1239 b' = new->b; 1240 while (1) 1241 { 1242 use (a'); 1243 use (b'); 1244 } 1245 1246 `old' is stored to *ARG_STRUCT and `new' is stored to NEW_ARG_STRUCT. The 1247 pointer `new' is intentionally not initialized (the loop will be split to a 1248 separate function later, and `new' will be initialized from its arguments). 1249 LD_ST_DATA holds information about the shared data structure used to pass 1250 information among the threads. It is initialized here, and 1251 gen_parallel_loop will pass it to create_call_for_reduction that 1252 needs this information. REDUCTION_LIST describes the reductions 1253 in LOOP. */ 1254 1255 static void 1256 separate_decls_in_region (edge entry, edge exit, htab_t reduction_list, 1257 tree *arg_struct, tree *new_arg_struct, 1258 struct clsn_data *ld_st_data) 1259 1260 { 1261 basic_block bb1 = split_edge (entry); 1262 basic_block bb0 = single_pred (bb1); 1263 htab_t name_copies = htab_create (10, name_to_copy_elt_hash, 1264 name_to_copy_elt_eq, free); 1265 htab_t decl_copies = htab_create (10, int_tree_map_hash, int_tree_map_eq, 1266 free); 1267 unsigned i; 1268 tree type, type_name, nvar; 1269 gimple_stmt_iterator gsi; 1270 struct clsn_data clsn_data; 1271 vec<basic_block> body; 1272 body.create (3); 1273 basic_block bb; 1274 basic_block entry_bb = bb1; 1275 basic_block exit_bb = exit->dest; 1276 bool has_debug_stmt = false; 1277 1278 entry = single_succ_edge (entry_bb); 1279 gather_blocks_in_sese_region (entry_bb, exit_bb, &body); 1280 1281 FOR_EACH_VEC_ELT (body, i, bb) 1282 { 1283 if (bb != entry_bb && bb != exit_bb) 1284 { 1285 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi)) 1286 separate_decls_in_region_stmt (entry, exit, gsi_stmt (gsi), 1287 name_copies, decl_copies); 1288 1289 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi)) 1290 { 1291 gimple stmt = gsi_stmt (gsi); 1292 1293 if (is_gimple_debug (stmt)) 1294 has_debug_stmt = true; 1295 else 1296 separate_decls_in_region_stmt (entry, exit, stmt, 1297 name_copies, decl_copies); 1298 } 1299 } 1300 } 1301 1302 /* Now process debug bind stmts. We must not create decls while 1303 processing debug stmts, so we defer their processing so as to 1304 make sure we will have debug info for as many variables as 1305 possible (all of those that were dealt with in the loop above), 1306 and discard those for which we know there's nothing we can 1307 do. */ 1308 if (has_debug_stmt) 1309 FOR_EACH_VEC_ELT (body, i, bb) 1310 if (bb != entry_bb && bb != exit_bb) 1311 { 1312 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);) 1313 { 1314 gimple stmt = gsi_stmt (gsi); 1315 1316 if (is_gimple_debug (stmt)) 1317 { 1318 if (separate_decls_in_region_debug (stmt, name_copies, 1319 decl_copies)) 1320 { 1321 gsi_remove (&gsi, true); 1322 continue; 1323 } 1324 } 1325 1326 gsi_next (&gsi); 1327 } 1328 } 1329 1330 body.release (); 1331 1332 if (htab_elements (name_copies) == 0 && htab_elements (reduction_list) == 0) 1333 { 1334 /* It may happen that there is nothing to copy (if there are only 1335 loop carried and external variables in the loop). */ 1336 *arg_struct = NULL; 1337 *new_arg_struct = NULL; 1338 } 1339 else 1340 { 1341 /* Create the type for the structure to store the ssa names to. */ 1342 type = lang_hooks.types.make_type (RECORD_TYPE); 1343 type_name = build_decl (UNKNOWN_LOCATION, 1344 TYPE_DECL, create_tmp_var_name (".paral_data"), 1345 type); 1346 TYPE_NAME (type) = type_name; 1347 1348 htab_traverse (name_copies, add_field_for_name, type); 1349 if (reduction_list && htab_elements (reduction_list) > 0) 1350 { 1351 /* Create the fields for reductions. */ 1352 htab_traverse (reduction_list, add_field_for_reduction, 1353 type); 1354 } 1355 layout_type (type); 1356 1357 /* Create the loads and stores. */ 1358 *arg_struct = create_tmp_var (type, ".paral_data_store"); 1359 nvar = create_tmp_var (build_pointer_type (type), ".paral_data_load"); 1360 *new_arg_struct = make_ssa_name (nvar, NULL); 1361 1362 ld_st_data->store = *arg_struct; 1363 ld_st_data->load = *new_arg_struct; 1364 ld_st_data->store_bb = bb0; 1365 ld_st_data->load_bb = bb1; 1366 1367 htab_traverse (name_copies, create_loads_and_stores_for_name, 1368 ld_st_data); 1369 1370 /* Load the calculation from memory (after the join of the threads). */ 1371 1372 if (reduction_list && htab_elements (reduction_list) > 0) 1373 { 1374 htab_traverse (reduction_list, create_stores_for_reduction, 1375 ld_st_data); 1376 clsn_data.load = make_ssa_name (nvar, NULL); 1377 clsn_data.load_bb = exit->dest; 1378 clsn_data.store = ld_st_data->store; 1379 create_final_loads_for_reduction (reduction_list, &clsn_data); 1380 } 1381 } 1382 1383 htab_delete (decl_copies); 1384 htab_delete (name_copies); 1385 } 1386 1387 /* Bitmap containing uids of functions created by parallelization. We cannot 1388 allocate it from the default obstack, as it must live across compilation 1389 of several functions; we make it gc allocated instead. */ 1390 1391 static GTY(()) bitmap parallelized_functions; 1392 1393 /* Returns true if FN was created by create_loop_fn. */ 1394 1395 bool 1396 parallelized_function_p (tree fn) 1397 { 1398 if (!parallelized_functions || !DECL_ARTIFICIAL (fn)) 1399 return false; 1400 1401 return bitmap_bit_p (parallelized_functions, DECL_UID (fn)); 1402 } 1403 1404 /* Creates and returns an empty function that will receive the body of 1405 a parallelized loop. */ 1406 1407 static tree 1408 create_loop_fn (location_t loc) 1409 { 1410 char buf[100]; 1411 char *tname; 1412 tree decl, type, name, t; 1413 struct function *act_cfun = cfun; 1414 static unsigned loopfn_num; 1415 1416 loc = LOCATION_LOCUS (loc); 1417 snprintf (buf, 100, "%s.$loopfn", current_function_name ()); 1418 ASM_FORMAT_PRIVATE_NAME (tname, buf, loopfn_num++); 1419 clean_symbol_name (tname); 1420 name = get_identifier (tname); 1421 type = build_function_type_list (void_type_node, ptr_type_node, NULL_TREE); 1422 1423 decl = build_decl (loc, FUNCTION_DECL, name, type); 1424 if (!parallelized_functions) 1425 parallelized_functions = BITMAP_GGC_ALLOC (); 1426 bitmap_set_bit (parallelized_functions, DECL_UID (decl)); 1427 1428 TREE_STATIC (decl) = 1; 1429 TREE_USED (decl) = 1; 1430 DECL_ARTIFICIAL (decl) = 1; 1431 DECL_IGNORED_P (decl) = 0; 1432 TREE_PUBLIC (decl) = 0; 1433 DECL_UNINLINABLE (decl) = 1; 1434 DECL_EXTERNAL (decl) = 0; 1435 DECL_CONTEXT (decl) = NULL_TREE; 1436 DECL_INITIAL (decl) = make_node (BLOCK); 1437 1438 t = build_decl (loc, RESULT_DECL, NULL_TREE, void_type_node); 1439 DECL_ARTIFICIAL (t) = 1; 1440 DECL_IGNORED_P (t) = 1; 1441 DECL_RESULT (decl) = t; 1442 1443 t = build_decl (loc, PARM_DECL, get_identifier (".paral_data_param"), 1444 ptr_type_node); 1445 DECL_ARTIFICIAL (t) = 1; 1446 DECL_ARG_TYPE (t) = ptr_type_node; 1447 DECL_CONTEXT (t) = decl; 1448 TREE_USED (t) = 1; 1449 DECL_ARGUMENTS (decl) = t; 1450 1451 allocate_struct_function (decl, false); 1452 1453 /* The call to allocate_struct_function clobbers CFUN, so we need to restore 1454 it. */ 1455 set_cfun (act_cfun); 1456 1457 return decl; 1458 } 1459 1460 /* Moves the exit condition of LOOP to the beginning of its header, and 1461 duplicates the part of the last iteration that gets disabled to the 1462 exit of the loop. NIT is the number of iterations of the loop 1463 (used to initialize the variables in the duplicated part). 1464 1465 TODO: the common case is that latch of the loop is empty and immediately 1466 follows the loop exit. In this case, it would be better not to copy the 1467 body of the loop, but only move the entry of the loop directly before the 1468 exit check and increase the number of iterations of the loop by one. 1469 This may need some additional preconditioning in case NIT = ~0. 1470 REDUCTION_LIST describes the reductions in LOOP. */ 1471 1472 static void 1473 transform_to_exit_first_loop (struct loop *loop, htab_t reduction_list, tree nit) 1474 { 1475 basic_block *bbs, *nbbs, ex_bb, orig_header; 1476 unsigned n; 1477 bool ok; 1478 edge exit = single_dom_exit (loop), hpred; 1479 tree control, control_name, res, t; 1480 gimple phi, nphi, cond_stmt, stmt, cond_nit; 1481 gimple_stmt_iterator gsi; 1482 tree nit_1; 1483 1484 split_block_after_labels (loop->header); 1485 orig_header = single_succ (loop->header); 1486 hpred = single_succ_edge (loop->header); 1487 1488 cond_stmt = last_stmt (exit->src); 1489 control = gimple_cond_lhs (cond_stmt); 1490 gcc_assert (gimple_cond_rhs (cond_stmt) == nit); 1491 1492 /* Make sure that we have phi nodes on exit for all loop header phis 1493 (create_parallel_loop requires that). */ 1494 for (gsi = gsi_start_phis (loop->header); !gsi_end_p (gsi); gsi_next (&gsi)) 1495 { 1496 phi = gsi_stmt (gsi); 1497 res = PHI_RESULT (phi); 1498 t = copy_ssa_name (res, phi); 1499 SET_PHI_RESULT (phi, t); 1500 nphi = create_phi_node (res, orig_header); 1501 add_phi_arg (nphi, t, hpred, UNKNOWN_LOCATION); 1502 1503 if (res == control) 1504 { 1505 gimple_cond_set_lhs (cond_stmt, t); 1506 update_stmt (cond_stmt); 1507 control = t; 1508 } 1509 } 1510 1511 bbs = get_loop_body_in_dom_order (loop); 1512 1513 for (n = 0; bbs[n] != exit->src; n++) 1514 continue; 1515 nbbs = XNEWVEC (basic_block, n); 1516 ok = gimple_duplicate_sese_tail (single_succ_edge (loop->header), exit, 1517 bbs + 1, n, nbbs); 1518 gcc_assert (ok); 1519 free (bbs); 1520 ex_bb = nbbs[0]; 1521 free (nbbs); 1522 1523 /* Other than reductions, the only gimple reg that should be copied 1524 out of the loop is the control variable. */ 1525 exit = single_dom_exit (loop); 1526 control_name = NULL_TREE; 1527 for (gsi = gsi_start_phis (ex_bb); !gsi_end_p (gsi); ) 1528 { 1529 phi = gsi_stmt (gsi); 1530 res = PHI_RESULT (phi); 1531 if (virtual_operand_p (res)) 1532 { 1533 gsi_next (&gsi); 1534 continue; 1535 } 1536 1537 /* Check if it is a part of reduction. If it is, 1538 keep the phi at the reduction's keep_res field. The 1539 PHI_RESULT of this phi is the resulting value of the reduction 1540 variable when exiting the loop. */ 1541 1542 if (htab_elements (reduction_list) > 0) 1543 { 1544 struct reduction_info *red; 1545 1546 tree val = PHI_ARG_DEF_FROM_EDGE (phi, exit); 1547 red = reduction_phi (reduction_list, SSA_NAME_DEF_STMT (val)); 1548 if (red) 1549 { 1550 red->keep_res = phi; 1551 gsi_next (&gsi); 1552 continue; 1553 } 1554 } 1555 gcc_assert (control_name == NULL_TREE 1556 && SSA_NAME_VAR (res) == SSA_NAME_VAR (control)); 1557 control_name = res; 1558 remove_phi_node (&gsi, false); 1559 } 1560 gcc_assert (control_name != NULL_TREE); 1561 1562 /* Initialize the control variable to number of iterations 1563 according to the rhs of the exit condition. */ 1564 gsi = gsi_after_labels (ex_bb); 1565 cond_nit = last_stmt (exit->src); 1566 nit_1 = gimple_cond_rhs (cond_nit); 1567 nit_1 = force_gimple_operand_gsi (&gsi, 1568 fold_convert (TREE_TYPE (control_name), nit_1), 1569 false, NULL_TREE, false, GSI_SAME_STMT); 1570 stmt = gimple_build_assign (control_name, nit_1); 1571 gsi_insert_before (&gsi, stmt, GSI_NEW_STMT); 1572 SSA_NAME_DEF_STMT (control_name) = stmt; 1573 } 1574 1575 /* Create the parallel constructs for LOOP as described in gen_parallel_loop. 1576 LOOP_FN and DATA are the arguments of GIMPLE_OMP_PARALLEL. 1577 NEW_DATA is the variable that should be initialized from the argument 1578 of LOOP_FN. N_THREADS is the requested number of threads. Returns the 1579 basic block containing GIMPLE_OMP_PARALLEL tree. */ 1580 1581 static basic_block 1582 create_parallel_loop (struct loop *loop, tree loop_fn, tree data, 1583 tree new_data, unsigned n_threads, location_t loc) 1584 { 1585 gimple_stmt_iterator gsi; 1586 basic_block bb, paral_bb, for_bb, ex_bb; 1587 tree t, param; 1588 gimple stmt, for_stmt, phi, cond_stmt; 1589 tree cvar, cvar_init, initvar, cvar_next, cvar_base, type; 1590 edge exit, nexit, guard, end, e; 1591 1592 /* Prepare the GIMPLE_OMP_PARALLEL statement. */ 1593 bb = loop_preheader_edge (loop)->src; 1594 paral_bb = single_pred (bb); 1595 gsi = gsi_last_bb (paral_bb); 1596 1597 t = build_omp_clause (loc, OMP_CLAUSE_NUM_THREADS); 1598 OMP_CLAUSE_NUM_THREADS_EXPR (t) 1599 = build_int_cst (integer_type_node, n_threads); 1600 stmt = gimple_build_omp_parallel (NULL, t, loop_fn, data); 1601 gimple_set_location (stmt, loc); 1602 1603 gsi_insert_after (&gsi, stmt, GSI_NEW_STMT); 1604 1605 /* Initialize NEW_DATA. */ 1606 if (data) 1607 { 1608 gsi = gsi_after_labels (bb); 1609 1610 param = make_ssa_name (DECL_ARGUMENTS (loop_fn), NULL); 1611 stmt = gimple_build_assign (param, build_fold_addr_expr (data)); 1612 gsi_insert_before (&gsi, stmt, GSI_SAME_STMT); 1613 SSA_NAME_DEF_STMT (param) = stmt; 1614 1615 stmt = gimple_build_assign (new_data, 1616 fold_convert (TREE_TYPE (new_data), param)); 1617 gsi_insert_before (&gsi, stmt, GSI_SAME_STMT); 1618 SSA_NAME_DEF_STMT (new_data) = stmt; 1619 } 1620 1621 /* Emit GIMPLE_OMP_RETURN for GIMPLE_OMP_PARALLEL. */ 1622 bb = split_loop_exit_edge (single_dom_exit (loop)); 1623 gsi = gsi_last_bb (bb); 1624 stmt = gimple_build_omp_return (false); 1625 gimple_set_location (stmt, loc); 1626 gsi_insert_after (&gsi, stmt, GSI_NEW_STMT); 1627 1628 /* Extract data for GIMPLE_OMP_FOR. */ 1629 gcc_assert (loop->header == single_dom_exit (loop)->src); 1630 cond_stmt = last_stmt (loop->header); 1631 1632 cvar = gimple_cond_lhs (cond_stmt); 1633 cvar_base = SSA_NAME_VAR (cvar); 1634 phi = SSA_NAME_DEF_STMT (cvar); 1635 cvar_init = PHI_ARG_DEF_FROM_EDGE (phi, loop_preheader_edge (loop)); 1636 initvar = copy_ssa_name (cvar, NULL); 1637 SET_USE (PHI_ARG_DEF_PTR_FROM_EDGE (phi, loop_preheader_edge (loop)), 1638 initvar); 1639 cvar_next = PHI_ARG_DEF_FROM_EDGE (phi, loop_latch_edge (loop)); 1640 1641 gsi = gsi_last_nondebug_bb (loop->latch); 1642 gcc_assert (gsi_stmt (gsi) == SSA_NAME_DEF_STMT (cvar_next)); 1643 gsi_remove (&gsi, true); 1644 1645 /* Prepare cfg. */ 1646 for_bb = split_edge (loop_preheader_edge (loop)); 1647 ex_bb = split_loop_exit_edge (single_dom_exit (loop)); 1648 extract_true_false_edges_from_block (loop->header, &nexit, &exit); 1649 gcc_assert (exit == single_dom_exit (loop)); 1650 1651 guard = make_edge (for_bb, ex_bb, 0); 1652 single_succ_edge (loop->latch)->flags = 0; 1653 end = make_edge (loop->latch, ex_bb, EDGE_FALLTHRU); 1654 for (gsi = gsi_start_phis (ex_bb); !gsi_end_p (gsi); gsi_next (&gsi)) 1655 { 1656 source_location locus; 1657 tree def; 1658 phi = gsi_stmt (gsi); 1659 stmt = SSA_NAME_DEF_STMT (PHI_ARG_DEF_FROM_EDGE (phi, exit)); 1660 1661 def = PHI_ARG_DEF_FROM_EDGE (stmt, loop_preheader_edge (loop)); 1662 locus = gimple_phi_arg_location_from_edge (stmt, 1663 loop_preheader_edge (loop)); 1664 add_phi_arg (phi, def, guard, locus); 1665 1666 def = PHI_ARG_DEF_FROM_EDGE (stmt, loop_latch_edge (loop)); 1667 locus = gimple_phi_arg_location_from_edge (stmt, loop_latch_edge (loop)); 1668 add_phi_arg (phi, def, end, locus); 1669 } 1670 e = redirect_edge_and_branch (exit, nexit->dest); 1671 PENDING_STMT (e) = NULL; 1672 1673 /* Emit GIMPLE_OMP_FOR. */ 1674 gimple_cond_set_lhs (cond_stmt, cvar_base); 1675 type = TREE_TYPE (cvar); 1676 t = build_omp_clause (loc, OMP_CLAUSE_SCHEDULE); 1677 OMP_CLAUSE_SCHEDULE_KIND (t) = OMP_CLAUSE_SCHEDULE_STATIC; 1678 1679 for_stmt = gimple_build_omp_for (NULL, t, 1, NULL); 1680 gimple_set_location (for_stmt, loc); 1681 gimple_omp_for_set_index (for_stmt, 0, initvar); 1682 gimple_omp_for_set_initial (for_stmt, 0, cvar_init); 1683 gimple_omp_for_set_final (for_stmt, 0, gimple_cond_rhs (cond_stmt)); 1684 gimple_omp_for_set_cond (for_stmt, 0, gimple_cond_code (cond_stmt)); 1685 gimple_omp_for_set_incr (for_stmt, 0, build2 (PLUS_EXPR, type, 1686 cvar_base, 1687 build_int_cst (type, 1))); 1688 1689 gsi = gsi_last_bb (for_bb); 1690 gsi_insert_after (&gsi, for_stmt, GSI_NEW_STMT); 1691 SSA_NAME_DEF_STMT (initvar) = for_stmt; 1692 1693 /* Emit GIMPLE_OMP_CONTINUE. */ 1694 gsi = gsi_last_bb (loop->latch); 1695 stmt = gimple_build_omp_continue (cvar_next, cvar); 1696 gimple_set_location (stmt, loc); 1697 gsi_insert_after (&gsi, stmt, GSI_NEW_STMT); 1698 SSA_NAME_DEF_STMT (cvar_next) = stmt; 1699 1700 /* Emit GIMPLE_OMP_RETURN for GIMPLE_OMP_FOR. */ 1701 gsi = gsi_last_bb (ex_bb); 1702 stmt = gimple_build_omp_return (true); 1703 gimple_set_location (stmt, loc); 1704 gsi_insert_after (&gsi, stmt, GSI_NEW_STMT); 1705 1706 /* After the above dom info is hosed. Re-compute it. */ 1707 free_dominance_info (CDI_DOMINATORS); 1708 calculate_dominance_info (CDI_DOMINATORS); 1709 1710 return paral_bb; 1711 } 1712 1713 /* Generates code to execute the iterations of LOOP in N_THREADS 1714 threads in parallel. 1715 1716 NITER describes number of iterations of LOOP. 1717 REDUCTION_LIST describes the reductions existent in the LOOP. */ 1718 1719 static void 1720 gen_parallel_loop (struct loop *loop, htab_t reduction_list, 1721 unsigned n_threads, struct tree_niter_desc *niter) 1722 { 1723 loop_iterator li; 1724 tree many_iterations_cond, type, nit; 1725 tree arg_struct, new_arg_struct; 1726 gimple_seq stmts; 1727 basic_block parallel_head; 1728 edge entry, exit; 1729 struct clsn_data clsn_data; 1730 unsigned prob; 1731 location_t loc; 1732 gimple cond_stmt; 1733 unsigned int m_p_thread=2; 1734 1735 /* From 1736 1737 --------------------------------------------------------------------- 1738 loop 1739 { 1740 IV = phi (INIT, IV + STEP) 1741 BODY1; 1742 if (COND) 1743 break; 1744 BODY2; 1745 } 1746 --------------------------------------------------------------------- 1747 1748 with # of iterations NITER (possibly with MAY_BE_ZERO assumption), 1749 we generate the following code: 1750 1751 --------------------------------------------------------------------- 1752 1753 if (MAY_BE_ZERO 1754 || NITER < MIN_PER_THREAD * N_THREADS) 1755 goto original; 1756 1757 BODY1; 1758 store all local loop-invariant variables used in body of the loop to DATA. 1759 GIMPLE_OMP_PARALLEL (OMP_CLAUSE_NUM_THREADS (N_THREADS), LOOPFN, DATA); 1760 load the variables from DATA. 1761 GIMPLE_OMP_FOR (IV = INIT; COND; IV += STEP) (OMP_CLAUSE_SCHEDULE (static)) 1762 BODY2; 1763 BODY1; 1764 GIMPLE_OMP_CONTINUE; 1765 GIMPLE_OMP_RETURN -- GIMPLE_OMP_FOR 1766 GIMPLE_OMP_RETURN -- GIMPLE_OMP_PARALLEL 1767 goto end; 1768 1769 original: 1770 loop 1771 { 1772 IV = phi (INIT, IV + STEP) 1773 BODY1; 1774 if (COND) 1775 break; 1776 BODY2; 1777 } 1778 1779 end: 1780 1781 */ 1782 1783 /* Create two versions of the loop -- in the old one, we know that the 1784 number of iterations is large enough, and we will transform it into the 1785 loop that will be split to loop_fn, the new one will be used for the 1786 remaining iterations. */ 1787 1788 /* We should compute a better number-of-iterations value for outer loops. 1789 That is, if we have 1790 1791 for (i = 0; i < n; ++i) 1792 for (j = 0; j < m; ++j) 1793 ... 1794 1795 we should compute nit = n * m, not nit = n. 1796 Also may_be_zero handling would need to be adjusted. */ 1797 1798 type = TREE_TYPE (niter->niter); 1799 nit = force_gimple_operand (unshare_expr (niter->niter), &stmts, true, 1800 NULL_TREE); 1801 if (stmts) 1802 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts); 1803 1804 if (loop->inner) 1805 m_p_thread=2; 1806 else 1807 m_p_thread=MIN_PER_THREAD; 1808 1809 many_iterations_cond = 1810 fold_build2 (GE_EXPR, boolean_type_node, 1811 nit, build_int_cst (type, m_p_thread * n_threads)); 1812 1813 many_iterations_cond 1814 = fold_build2 (TRUTH_AND_EXPR, boolean_type_node, 1815 invert_truthvalue (unshare_expr (niter->may_be_zero)), 1816 many_iterations_cond); 1817 many_iterations_cond 1818 = force_gimple_operand (many_iterations_cond, &stmts, false, NULL_TREE); 1819 if (stmts) 1820 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts); 1821 if (!is_gimple_condexpr (many_iterations_cond)) 1822 { 1823 many_iterations_cond 1824 = force_gimple_operand (many_iterations_cond, &stmts, 1825 true, NULL_TREE); 1826 if (stmts) 1827 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts); 1828 } 1829 1830 initialize_original_copy_tables (); 1831 1832 /* We assume that the loop usually iterates a lot. */ 1833 prob = 4 * REG_BR_PROB_BASE / 5; 1834 loop_version (loop, many_iterations_cond, NULL, 1835 prob, prob, REG_BR_PROB_BASE - prob, true); 1836 update_ssa (TODO_update_ssa); 1837 free_original_copy_tables (); 1838 1839 /* Base all the induction variables in LOOP on a single control one. */ 1840 canonicalize_loop_ivs (loop, &nit, true); 1841 1842 /* Ensure that the exit condition is the first statement in the loop. */ 1843 transform_to_exit_first_loop (loop, reduction_list, nit); 1844 1845 /* Generate initializations for reductions. */ 1846 if (htab_elements (reduction_list) > 0) 1847 htab_traverse (reduction_list, initialize_reductions, loop); 1848 1849 /* Eliminate the references to local variables from the loop. */ 1850 gcc_assert (single_exit (loop)); 1851 entry = loop_preheader_edge (loop); 1852 exit = single_dom_exit (loop); 1853 1854 eliminate_local_variables (entry, exit); 1855 /* In the old loop, move all variables non-local to the loop to a structure 1856 and back, and create separate decls for the variables used in loop. */ 1857 separate_decls_in_region (entry, exit, reduction_list, &arg_struct, 1858 &new_arg_struct, &clsn_data); 1859 1860 /* Create the parallel constructs. */ 1861 loc = UNKNOWN_LOCATION; 1862 cond_stmt = last_stmt (loop->header); 1863 if (cond_stmt) 1864 loc = gimple_location (cond_stmt); 1865 parallel_head = create_parallel_loop (loop, create_loop_fn (loc), arg_struct, 1866 new_arg_struct, n_threads, loc); 1867 if (htab_elements (reduction_list) > 0) 1868 create_call_for_reduction (loop, reduction_list, &clsn_data); 1869 1870 scev_reset (); 1871 1872 /* Cancel the loop (it is simpler to do it here rather than to teach the 1873 expander to do it). */ 1874 cancel_loop_tree (loop); 1875 1876 /* Free loop bound estimations that could contain references to 1877 removed statements. */ 1878 FOR_EACH_LOOP (li, loop, 0) 1879 free_numbers_of_iterations_estimates_loop (loop); 1880 1881 /* Expand the parallel constructs. We do it directly here instead of running 1882 a separate expand_omp pass, since it is more efficient, and less likely to 1883 cause troubles with further analyses not being able to deal with the 1884 OMP trees. */ 1885 1886 omp_expand_local (parallel_head); 1887 } 1888 1889 /* Returns true when LOOP contains vector phi nodes. */ 1890 1891 static bool 1892 loop_has_vector_phi_nodes (struct loop *loop ATTRIBUTE_UNUSED) 1893 { 1894 unsigned i; 1895 basic_block *bbs = get_loop_body_in_dom_order (loop); 1896 gimple_stmt_iterator gsi; 1897 bool res = true; 1898 1899 for (i = 0; i < loop->num_nodes; i++) 1900 for (gsi = gsi_start_phis (bbs[i]); !gsi_end_p (gsi); gsi_next (&gsi)) 1901 if (TREE_CODE (TREE_TYPE (PHI_RESULT (gsi_stmt (gsi)))) == VECTOR_TYPE) 1902 goto end; 1903 1904 res = false; 1905 end: 1906 free (bbs); 1907 return res; 1908 } 1909 1910 /* Create a reduction_info struct, initialize it with REDUC_STMT 1911 and PHI, insert it to the REDUCTION_LIST. */ 1912 1913 static void 1914 build_new_reduction (htab_t reduction_list, gimple reduc_stmt, gimple phi) 1915 { 1916 PTR *slot; 1917 struct reduction_info *new_reduction; 1918 1919 gcc_assert (reduc_stmt); 1920 1921 if (dump_file && (dump_flags & TDF_DETAILS)) 1922 { 1923 fprintf (dump_file, 1924 "Detected reduction. reduction stmt is: \n"); 1925 print_gimple_stmt (dump_file, reduc_stmt, 0, 0); 1926 fprintf (dump_file, "\n"); 1927 } 1928 1929 new_reduction = XCNEW (struct reduction_info); 1930 1931 new_reduction->reduc_stmt = reduc_stmt; 1932 new_reduction->reduc_phi = phi; 1933 new_reduction->reduc_version = SSA_NAME_VERSION (gimple_phi_result (phi)); 1934 new_reduction->reduction_code = gimple_assign_rhs_code (reduc_stmt); 1935 slot = htab_find_slot (reduction_list, new_reduction, INSERT); 1936 *slot = new_reduction; 1937 } 1938 1939 /* Callback for htab_traverse. Sets gimple_uid of reduc_phi stmts. */ 1940 1941 static int 1942 set_reduc_phi_uids (void **slot, void *data ATTRIBUTE_UNUSED) 1943 { 1944 struct reduction_info *const red = (struct reduction_info *) *slot; 1945 gimple_set_uid (red->reduc_phi, red->reduc_version); 1946 return 1; 1947 } 1948 1949 /* Detect all reductions in the LOOP, insert them into REDUCTION_LIST. */ 1950 1951 static void 1952 gather_scalar_reductions (loop_p loop, htab_t reduction_list) 1953 { 1954 gimple_stmt_iterator gsi; 1955 loop_vec_info simple_loop_info; 1956 1957 simple_loop_info = vect_analyze_loop_form (loop); 1958 1959 for (gsi = gsi_start_phis (loop->header); !gsi_end_p (gsi); gsi_next (&gsi)) 1960 { 1961 gimple phi = gsi_stmt (gsi); 1962 affine_iv iv; 1963 tree res = PHI_RESULT (phi); 1964 bool double_reduc; 1965 1966 if (virtual_operand_p (res)) 1967 continue; 1968 1969 if (!simple_iv (loop, loop, res, &iv, true) 1970 && simple_loop_info) 1971 { 1972 gimple reduc_stmt = vect_force_simple_reduction (simple_loop_info, 1973 phi, true, 1974 &double_reduc); 1975 if (reduc_stmt && !double_reduc) 1976 build_new_reduction (reduction_list, reduc_stmt, phi); 1977 } 1978 } 1979 destroy_loop_vec_info (simple_loop_info, true); 1980 1981 /* As gimple_uid is used by the vectorizer in between vect_analyze_loop_form 1982 and destroy_loop_vec_info, we can set gimple_uid of reduc_phi stmts 1983 only now. */ 1984 htab_traverse (reduction_list, set_reduc_phi_uids, NULL); 1985 } 1986 1987 /* Try to initialize NITER for code generation part. */ 1988 1989 static bool 1990 try_get_loop_niter (loop_p loop, struct tree_niter_desc *niter) 1991 { 1992 edge exit = single_dom_exit (loop); 1993 1994 gcc_assert (exit); 1995 1996 /* We need to know # of iterations, and there should be no uses of values 1997 defined inside loop outside of it, unless the values are invariants of 1998 the loop. */ 1999 if (!number_of_iterations_exit (loop, exit, niter, false)) 2000 { 2001 if (dump_file && (dump_flags & TDF_DETAILS)) 2002 fprintf (dump_file, " FAILED: number of iterations not known\n"); 2003 return false; 2004 } 2005 2006 return true; 2007 } 2008 2009 /* Try to initialize REDUCTION_LIST for code generation part. 2010 REDUCTION_LIST describes the reductions. */ 2011 2012 static bool 2013 try_create_reduction_list (loop_p loop, htab_t reduction_list) 2014 { 2015 edge exit = single_dom_exit (loop); 2016 gimple_stmt_iterator gsi; 2017 2018 gcc_assert (exit); 2019 2020 gather_scalar_reductions (loop, reduction_list); 2021 2022 2023 for (gsi = gsi_start_phis (exit->dest); !gsi_end_p (gsi); gsi_next (&gsi)) 2024 { 2025 gimple phi = gsi_stmt (gsi); 2026 struct reduction_info *red; 2027 imm_use_iterator imm_iter; 2028 use_operand_p use_p; 2029 gimple reduc_phi; 2030 tree val = PHI_ARG_DEF_FROM_EDGE (phi, exit); 2031 2032 if (!virtual_operand_p (val)) 2033 { 2034 if (dump_file && (dump_flags & TDF_DETAILS)) 2035 { 2036 fprintf (dump_file, "phi is "); 2037 print_gimple_stmt (dump_file, phi, 0, 0); 2038 fprintf (dump_file, "arg of phi to exit: value "); 2039 print_generic_expr (dump_file, val, 0); 2040 fprintf (dump_file, " used outside loop\n"); 2041 fprintf (dump_file, 2042 " checking if it a part of reduction pattern: \n"); 2043 } 2044 if (htab_elements (reduction_list) == 0) 2045 { 2046 if (dump_file && (dump_flags & TDF_DETAILS)) 2047 fprintf (dump_file, 2048 " FAILED: it is not a part of reduction.\n"); 2049 return false; 2050 } 2051 reduc_phi = NULL; 2052 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, val) 2053 { 2054 if (!gimple_debug_bind_p (USE_STMT (use_p)) 2055 && flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p)))) 2056 { 2057 reduc_phi = USE_STMT (use_p); 2058 break; 2059 } 2060 } 2061 red = reduction_phi (reduction_list, reduc_phi); 2062 if (red == NULL) 2063 { 2064 if (dump_file && (dump_flags & TDF_DETAILS)) 2065 fprintf (dump_file, 2066 " FAILED: it is not a part of reduction.\n"); 2067 return false; 2068 } 2069 if (dump_file && (dump_flags & TDF_DETAILS)) 2070 { 2071 fprintf (dump_file, "reduction phi is "); 2072 print_gimple_stmt (dump_file, red->reduc_phi, 0, 0); 2073 fprintf (dump_file, "reduction stmt is "); 2074 print_gimple_stmt (dump_file, red->reduc_stmt, 0, 0); 2075 } 2076 } 2077 } 2078 2079 /* The iterations of the loop may communicate only through bivs whose 2080 iteration space can be distributed efficiently. */ 2081 for (gsi = gsi_start_phis (loop->header); !gsi_end_p (gsi); gsi_next (&gsi)) 2082 { 2083 gimple phi = gsi_stmt (gsi); 2084 tree def = PHI_RESULT (phi); 2085 affine_iv iv; 2086 2087 if (!virtual_operand_p (def) && !simple_iv (loop, loop, def, &iv, true)) 2088 { 2089 struct reduction_info *red; 2090 2091 red = reduction_phi (reduction_list, phi); 2092 if (red == NULL) 2093 { 2094 if (dump_file && (dump_flags & TDF_DETAILS)) 2095 fprintf (dump_file, 2096 " FAILED: scalar dependency between iterations\n"); 2097 return false; 2098 } 2099 } 2100 } 2101 2102 2103 return true; 2104 } 2105 2106 /* Detect parallel loops and generate parallel code using libgomp 2107 primitives. Returns true if some loop was parallelized, false 2108 otherwise. */ 2109 2110 bool 2111 parallelize_loops (void) 2112 { 2113 unsigned n_threads = flag_tree_parallelize_loops; 2114 bool changed = false; 2115 struct loop *loop; 2116 struct tree_niter_desc niter_desc; 2117 loop_iterator li; 2118 htab_t reduction_list; 2119 struct obstack parloop_obstack; 2120 HOST_WIDE_INT estimated; 2121 LOC loop_loc; 2122 2123 /* Do not parallelize loops in the functions created by parallelization. */ 2124 if (parallelized_function_p (cfun->decl)) 2125 return false; 2126 if (cfun->has_nonlocal_label) 2127 return false; 2128 2129 gcc_obstack_init (&parloop_obstack); 2130 reduction_list = htab_create (10, reduction_info_hash, 2131 reduction_info_eq, free); 2132 init_stmt_vec_info_vec (); 2133 2134 FOR_EACH_LOOP (li, loop, 0) 2135 { 2136 htab_empty (reduction_list); 2137 if (dump_file && (dump_flags & TDF_DETAILS)) 2138 { 2139 fprintf (dump_file, "Trying loop %d as candidate\n",loop->num); 2140 if (loop->inner) 2141 fprintf (dump_file, "loop %d is not innermost\n",loop->num); 2142 else 2143 fprintf (dump_file, "loop %d is innermost\n",loop->num); 2144 } 2145 2146 /* If we use autopar in graphite pass, we use its marked dependency 2147 checking results. */ 2148 if (flag_loop_parallelize_all && !loop->can_be_parallel) 2149 { 2150 if (dump_file && (dump_flags & TDF_DETAILS)) 2151 fprintf (dump_file, "loop is not parallel according to graphite\n"); 2152 continue; 2153 } 2154 2155 if (!single_dom_exit (loop)) 2156 { 2157 2158 if (dump_file && (dump_flags & TDF_DETAILS)) 2159 fprintf (dump_file, "loop is !single_dom_exit\n"); 2160 2161 continue; 2162 } 2163 2164 if (/* And of course, the loop must be parallelizable. */ 2165 !can_duplicate_loop_p (loop) 2166 || loop_has_blocks_with_irreducible_flag (loop) 2167 || (loop_preheader_edge (loop)->src->flags & BB_IRREDUCIBLE_LOOP) 2168 /* FIXME: the check for vector phi nodes could be removed. */ 2169 || loop_has_vector_phi_nodes (loop)) 2170 continue; 2171 2172 estimated = estimated_stmt_executions_int (loop); 2173 if (estimated == -1) 2174 estimated = max_stmt_executions_int (loop); 2175 /* FIXME: Bypass this check as graphite doesn't update the 2176 count and frequency correctly now. */ 2177 if (!flag_loop_parallelize_all 2178 && ((estimated != -1 2179 && estimated <= (HOST_WIDE_INT) n_threads * MIN_PER_THREAD) 2180 /* Do not bother with loops in cold areas. */ 2181 || optimize_loop_nest_for_size_p (loop))) 2182 continue; 2183 2184 if (!try_get_loop_niter (loop, &niter_desc)) 2185 continue; 2186 2187 if (!try_create_reduction_list (loop, reduction_list)) 2188 continue; 2189 2190 if (!flag_loop_parallelize_all 2191 && !loop_parallel_p (loop, &parloop_obstack)) 2192 continue; 2193 2194 changed = true; 2195 if (dump_file && (dump_flags & TDF_DETAILS)) 2196 { 2197 if (loop->inner) 2198 fprintf (dump_file, "parallelizing outer loop %d\n",loop->header->index); 2199 else 2200 fprintf (dump_file, "parallelizing inner loop %d\n",loop->header->index); 2201 loop_loc = find_loop_location (loop); 2202 if (loop_loc != UNKNOWN_LOC) 2203 fprintf (dump_file, "\nloop at %s:%d: ", 2204 LOC_FILE (loop_loc), LOC_LINE (loop_loc)); 2205 } 2206 gen_parallel_loop (loop, reduction_list, 2207 n_threads, &niter_desc); 2208 #ifdef ENABLE_CHECKING 2209 verify_flow_info (); 2210 verify_loop_structure (); 2211 verify_loop_closed_ssa (true); 2212 #endif 2213 } 2214 2215 free_stmt_vec_info_vec (); 2216 htab_delete (reduction_list); 2217 obstack_free (&parloop_obstack, NULL); 2218 2219 /* Parallelization will cause new function calls to be inserted through 2220 which local variables will escape. Reset the points-to solution 2221 for ESCAPED. */ 2222 if (changed) 2223 pt_solution_reset (&cfun->gimple_df->escaped); 2224 2225 return changed; 2226 } 2227 2228 #include "gt-tree-parloops.h" 2229