1 /* Loop unroll-and-jam. 2 Copyright (C) 2017-2020 Free Software Foundation, Inc. 3 4 This file is part of GCC. 5 6 GCC is free software; you can redistribute it and/or modify it 7 under the terms of the GNU General Public License as published by the 8 Free Software Foundation; either version 3, or (at your option) any 9 later version. 10 11 GCC is distributed in the hope that it will be useful, but WITHOUT 12 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 13 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 14 for more details. 15 16 You should have received a copy of the GNU General Public License 17 along with GCC; see the file COPYING3. If not see 18 <http://www.gnu.org/licenses/>. */ 19 20 #include "config.h" 21 #include "system.h" 22 #include "coretypes.h" 23 #include "tree-pass.h" 24 #include "backend.h" 25 #include "tree.h" 26 #include "gimple.h" 27 #include "ssa.h" 28 #include "fold-const.h" 29 #include "tree-cfg.h" 30 #include "tree-ssa.h" 31 #include "tree-ssa-loop-niter.h" 32 #include "tree-ssa-loop.h" 33 #include "tree-ssa-loop-manip.h" 34 #include "cfgloop.h" 35 #include "tree-scalar-evolution.h" 36 #include "gimple-iterator.h" 37 #include "cfghooks.h" 38 #include "tree-data-ref.h" 39 #include "tree-ssa-loop-ivopts.h" 40 #include "tree-vectorizer.h" 41 42 /* Unroll and Jam transformation 43 44 This is a combination of two transformations, where the second 45 is not always valid. It's applicable if a loop nest has redundancies 46 over the iterations of an outer loop while not having that with 47 an inner loop. 48 49 Given this nest: 50 for (i) { 51 for (j) { 52 B(i,j) 53 } 54 } 55 56 first unroll: 57 for (i by 2) { 58 for (j) { 59 B(i,j) 60 } 61 for (j) { 62 B(i+1,j) 63 } 64 } 65 66 then fuse the two adjacent inner loops resulting from that: 67 for (i by 2) { 68 for (j) { 69 B(i,j) 70 B(i+1,j) 71 } 72 } 73 74 As the order of evaluations of the body B changes this is valid 75 only in certain situations: all distance vectors need to be forward. 76 Additionally if there are multiple induction variables than just 77 a counting control IV (j above) we can also deal with some situations. 78 79 The validity is checked by unroll_jam_possible_p, and the data-dep 80 testing below. 81 82 A trivial example where the fusion is wrong would be when 83 B(i,j) == x[j-1] = x[j]; 84 for (i by 2) { 85 for (j) { 86 x[j-1] = x[j]; 87 } 88 for (j) { 89 x[j-1] = x[j]; 90 } 91 } effect: move content to front by two elements 92 --> 93 for (i by 2) { 94 for (j) { 95 x[j-1] = x[j]; 96 x[j-1] = x[j]; 97 } 98 } effect: move content to front by one element 99 */ 100 101 /* Modify the loop tree for the fact that all code once belonging 102 to the OLD loop or the outer loop of OLD now is inside LOOP. */ 103 104 static void 105 merge_loop_tree (class loop *loop, class loop *old) 106 { 107 basic_block *bbs; 108 int i, n; 109 class loop *subloop; 110 edge e; 111 edge_iterator ei; 112 113 /* Find its nodes. */ 114 bbs = XNEWVEC (basic_block, n_basic_blocks_for_fn (cfun)); 115 n = get_loop_body_with_size (loop, bbs, n_basic_blocks_for_fn (cfun)); 116 117 for (i = 0; i < n; i++) 118 { 119 /* If the block was direct child of OLD loop it's now part 120 of LOOP. If it was outside OLD, then it moved into LOOP 121 as well. This avoids changing the loop father for BBs 122 in inner loops of OLD. */ 123 if (bbs[i]->loop_father == old 124 || loop_depth (bbs[i]->loop_father) < loop_depth (old)) 125 { 126 remove_bb_from_loops (bbs[i]); 127 add_bb_to_loop (bbs[i], loop); 128 continue; 129 } 130 131 /* If we find a direct subloop of OLD, move it to LOOP. */ 132 subloop = bbs[i]->loop_father; 133 if (loop_outer (subloop) == old && subloop->header == bbs[i]) 134 { 135 flow_loop_tree_node_remove (subloop); 136 flow_loop_tree_node_add (loop, subloop); 137 } 138 } 139 140 /* Update the information about loop exit edges. */ 141 for (i = 0; i < n; i++) 142 { 143 FOR_EACH_EDGE (e, ei, bbs[i]->succs) 144 { 145 rescan_loop_exit (e, false, false); 146 } 147 } 148 149 loop->num_nodes = n; 150 151 free (bbs); 152 } 153 154 /* BB is part of the outer loop of an unroll-and-jam situation. 155 Check if any statements therein would prevent the transformation. */ 156 157 static bool 158 bb_prevents_fusion_p (basic_block bb) 159 { 160 gimple_stmt_iterator gsi; 161 /* BB is duplicated by outer unrolling and then all N-1 first copies 162 move into the body of the fused inner loop. If BB exits the outer loop 163 the last copy still does so, and the first N-1 copies are cancelled 164 by loop unrolling, so also after fusion it's the exit block. 165 But there might be other reasons that prevent fusion: 166 * stores or unknown side-effects prevent fusion 167 * loads don't 168 * computations into SSA names: these aren't problematic. Their 169 result will be unused on the exit edges of the first N-1 copies 170 (those aren't taken after unrolling). If they are used on the 171 other edge (the one leading to the outer latch block) they are 172 loop-carried (on the outer loop) and the Nth copy of BB will 173 compute them again (i.e. the first N-1 copies will be dead). */ 174 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi)) 175 { 176 gimple *g = gsi_stmt (gsi); 177 if (gimple_vdef (g) || gimple_has_side_effects (g)) 178 return true; 179 } 180 return false; 181 } 182 183 /* Given an inner loop LOOP (of some OUTER loop) determine if 184 we can safely fuse copies of it (generated by outer unrolling). 185 If so return true, otherwise return false. */ 186 187 static bool 188 unroll_jam_possible_p (class loop *outer, class loop *loop) 189 { 190 basic_block *bbs; 191 int i, n; 192 class tree_niter_desc niter; 193 194 /* When fusing the loops we skip the latch block 195 of the first one, so it mustn't have any effects to 196 preserve. */ 197 if (!empty_block_p (loop->latch)) 198 return false; 199 200 if (!single_exit (loop)) 201 return false; 202 203 /* We need a perfect nest. Quick check for adjacent inner loops. */ 204 if (outer->inner != loop || loop->next) 205 return false; 206 207 /* Prevent head-controlled inner loops, that we usually have. 208 The guard block would need to be accepted 209 (invariant condition either entering or skipping the loop), 210 without also accepting arbitrary control flow. When unswitching 211 ran before us (as with -O3) this won't be a problem because its 212 outer loop unswitching will have moved out the invariant condition. 213 214 If we do that we need to extend fuse_loops() to cope with this 215 by threading through the (still invariant) copied condition 216 between the two loop copies. */ 217 if (!dominated_by_p (CDI_DOMINATORS, outer->latch, loop->header)) 218 return false; 219 220 /* The number of iterations of the inner loop must be loop invariant 221 with respect to the outer loop. */ 222 if (!number_of_iterations_exit (loop, single_exit (loop), &niter, 223 false, true) 224 || niter.cmp == ERROR_MARK 225 || !integer_zerop (niter.may_be_zero) 226 || !expr_invariant_in_loop_p (outer, niter.niter)) 227 return false; 228 229 /* If the inner loop produces any values that are used inside the 230 outer loop (except the virtual op) then it can flow 231 back (perhaps indirectly) into the inner loop. This prevents 232 fusion: without fusion the value at the last iteration is used, 233 with fusion the value after the initial iteration is used. 234 235 If all uses are outside the outer loop this doesn't prevent fusion; 236 the value of the last iteration is still used (and the values from 237 all intermediate iterations are dead). */ 238 gphi_iterator psi; 239 for (psi = gsi_start_phis (single_exit (loop)->dest); 240 !gsi_end_p (psi); gsi_next (&psi)) 241 { 242 imm_use_iterator imm_iter; 243 use_operand_p use_p; 244 tree op = gimple_phi_result (psi.phi ()); 245 if (virtual_operand_p (op)) 246 continue; 247 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, op) 248 { 249 gimple *use_stmt = USE_STMT (use_p); 250 if (!is_gimple_debug (use_stmt) 251 && flow_bb_inside_loop_p (outer, gimple_bb (use_stmt))) 252 return false; 253 } 254 } 255 256 /* And check blocks belonging to just outer loop. */ 257 bbs = XNEWVEC (basic_block, n_basic_blocks_for_fn (cfun)); 258 n = get_loop_body_with_size (outer, bbs, n_basic_blocks_for_fn (cfun)); 259 260 for (i = 0; i < n; i++) 261 if (bbs[i]->loop_father == outer && bb_prevents_fusion_p (bbs[i])) 262 break; 263 free (bbs); 264 if (i != n) 265 return false; 266 267 /* For now we can safely fuse copies of LOOP only if all 268 loop carried variables are inductions (or the virtual op). 269 270 We could handle reductions as well (the initial value in the second 271 body would be the after-iter value of the first body) if it's over 272 an associative and commutative operation. We wouldn't 273 be able to handle unknown cycles. */ 274 for (psi = gsi_start_phis (loop->header); !gsi_end_p (psi); gsi_next (&psi)) 275 { 276 affine_iv iv; 277 tree op = gimple_phi_result (psi.phi ()); 278 279 if (virtual_operand_p (op)) 280 continue; 281 if (!simple_iv (loop, loop, op, &iv, true)) 282 return false; 283 /* The inductions must be regular, loop invariant step and initial 284 value. */ 285 if (!expr_invariant_in_loop_p (outer, iv.step) 286 || !expr_invariant_in_loop_p (outer, iv.base)) 287 return false; 288 /* XXX With more effort we could also be able to deal with inductions 289 where the initial value is loop variant but a simple IV in the 290 outer loop. The initial value for the second body would be 291 the original initial value plus iv.base.step. The next value 292 for the fused loop would be the original next value of the first 293 copy, _not_ the next value of the second body. */ 294 } 295 296 return true; 297 } 298 299 /* Fuse LOOP with all further neighbors. The loops are expected to 300 be in appropriate form. */ 301 302 static void 303 fuse_loops (class loop *loop) 304 { 305 class loop *next = loop->next; 306 307 while (next) 308 { 309 edge e; 310 311 remove_branch (single_pred_edge (loop->latch)); 312 /* Make delete_basic_block not fiddle with the loop structure. */ 313 basic_block oldlatch = loop->latch; 314 loop->latch = NULL; 315 delete_basic_block (oldlatch); 316 e = redirect_edge_and_branch (loop_latch_edge (next), 317 loop->header); 318 loop->latch = e->src; 319 flush_pending_stmts (e); 320 321 gcc_assert (EDGE_COUNT (next->header->preds) == 1); 322 323 /* The PHI nodes of the second body (single-argument now) 324 need adjustments to use the right values: either directly 325 the value of the corresponding PHI in the first copy or 326 the one leaving the first body which unrolling did for us. 327 328 See also unroll_jam_possible_p() for further possibilities. */ 329 gphi_iterator psi_first, psi_second; 330 e = single_pred_edge (next->header); 331 for (psi_first = gsi_start_phis (loop->header), 332 psi_second = gsi_start_phis (next->header); 333 !gsi_end_p (psi_first); 334 gsi_next (&psi_first), gsi_next (&psi_second)) 335 { 336 gphi *phi_first = psi_first.phi (); 337 gphi *phi_second = psi_second.phi (); 338 tree firstop = gimple_phi_result (phi_first); 339 /* The virtual operand is correct already as it's 340 always live at exit, hence has a LCSSA node and outer 341 loop unrolling updated SSA form. */ 342 if (virtual_operand_p (firstop)) 343 continue; 344 345 /* Due to unroll_jam_possible_p() we know that this is 346 an induction. The second body goes over the same 347 iteration space. */ 348 add_phi_arg (phi_second, firstop, e, 349 gimple_location (phi_first)); 350 } 351 gcc_assert (gsi_end_p (psi_second)); 352 353 merge_loop_tree (loop, next); 354 gcc_assert (!next->num_nodes); 355 class loop *ln = next->next; 356 delete_loop (next); 357 next = ln; 358 } 359 rewrite_into_loop_closed_ssa_1 (NULL, 0, SSA_OP_USE, loop); 360 } 361 362 /* Return true if any of the access functions for dataref A 363 isn't invariant with respect to loop LOOP_NEST. */ 364 static bool 365 any_access_function_variant_p (const struct data_reference *a, 366 const class loop *loop_nest) 367 { 368 unsigned int i; 369 vec<tree> fns = DR_ACCESS_FNS (a); 370 tree t; 371 372 FOR_EACH_VEC_ELT (fns, i, t) 373 if (!evolution_function_is_invariant_p (t, loop_nest->num)) 374 return true; 375 376 return false; 377 } 378 379 /* Returns true if the distance in DDR can be determined and adjusts 380 the unroll factor in *UNROLL to make unrolling valid for that distance. 381 Otherwise return false. DDR is with respect to the outer loop of INNER. 382 383 If this data dep can lead to a removed memory reference, increment 384 *REMOVED and adjust *PROFIT_UNROLL to be the necessary unroll factor 385 for this to happen. */ 386 387 static bool 388 adjust_unroll_factor (class loop *inner, struct data_dependence_relation *ddr, 389 unsigned *unroll, unsigned *profit_unroll, 390 unsigned *removed) 391 { 392 bool ret = false; 393 if (DDR_ARE_DEPENDENT (ddr) != chrec_known) 394 { 395 if (DDR_NUM_DIST_VECTS (ddr) == 0) 396 return false; 397 unsigned i; 398 lambda_vector dist_v; 399 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v) 400 { 401 /* A distance (a,b) is at worst transformed into (a/N,b) by the 402 unrolling (factor N), so the transformation is valid if 403 a >= N, or b > 0, or b is zero and a > 0. Otherwise the unroll 404 factor needs to be limited so that the first condition holds. 405 That may limit the factor down to zero in the worst case. */ 406 lambda_int dist = dist_v[0]; 407 if (dist < 0) 408 gcc_unreachable (); 409 else if (dist >= (lambda_int)*unroll) 410 ; 411 else if (lambda_vector_zerop (dist_v + 1, DDR_NB_LOOPS (ddr) - 1)) 412 { 413 /* We have (a,0) with a < N, so this will be transformed into 414 (0,0) after unrolling by N. This might potentially be a 415 problem, if it's not a read-read dependency. */ 416 if (DR_IS_READ (DDR_A (ddr)) && DR_IS_READ (DDR_B (ddr))) 417 ; 418 else 419 { 420 /* So, at least one is a write, and we might reduce the 421 distance vector to (0,0). This is still no problem 422 if both data-refs are affine with respect to the inner 423 loops. But if one of them is invariant with respect 424 to an inner loop our reordering implicit in loop fusion 425 corrupts the program, as our data dependences don't 426 capture this. E.g. for: 427 for (0 <= i < n) 428 for (0 <= j < m) 429 a[i][0] = a[i+1][0] + 2; // (1) 430 b[i][j] = b[i+1][j] + 2; // (2) 431 the distance vector for both statements is (-1,0), 432 but exchanging the order for (2) is okay, while 433 for (1) it is not. To see this, write out the original 434 accesses (assume m is 2): 435 a i j original 436 0 0 0 r a[1][0] b[1][0] 437 1 0 0 w a[0][0] b[0][0] 438 2 0 1 r a[1][0] b[1][1] 439 3 0 1 w a[0][0] b[0][1] 440 4 1 0 r a[2][0] b[2][0] 441 5 1 0 w a[1][0] b[1][0] 442 after unroll-by-2 and fusion the accesses are done in 443 this order (from column a): 0,1, 4,5, 2,3, i.e. this: 444 a i j transformed 445 0 0 0 r a[1][0] b[1][0] 446 1 0 0 w a[0][0] b[0][0] 447 4 1 0 r a[2][0] b[2][0] 448 5 1 0 w a[1][0] b[1][0] 449 2 0 1 r a[1][0] b[1][1] 450 3 0 1 w a[0][0] b[0][1] 451 Note how access 2 accesses the same element as access 5 452 for array 'a' but not for array 'b'. */ 453 if (any_access_function_variant_p (DDR_A (ddr), inner) 454 && any_access_function_variant_p (DDR_B (ddr), inner)) 455 ; 456 else 457 /* And if any dataref of this pair is invariant with 458 respect to the inner loop, we have no chance than 459 to reduce the unroll factor. */ 460 *unroll = dist; 461 } 462 } 463 else if (lambda_vector_lexico_pos (dist_v + 1, DDR_NB_LOOPS (ddr) - 1)) 464 ; 465 else 466 *unroll = dist; 467 468 /* With a distance (a,0) it's always profitable to unroll-and-jam 469 (by a+1), because one memory reference will go away. With 470 (a,b) and b != 0 that's less clear. We will increase the 471 number of streams without lowering the number of mem refs. 472 So for now only handle the first situation. */ 473 if (lambda_vector_zerop (dist_v + 1, DDR_NB_LOOPS (ddr) - 1)) 474 { 475 *profit_unroll = MAX (*profit_unroll, (unsigned)dist + 1); 476 (*removed)++; 477 } 478 479 ret = true; 480 } 481 } 482 return ret; 483 } 484 485 /* Main entry point for the unroll-and-jam transformation 486 described above. */ 487 488 static unsigned int 489 tree_loop_unroll_and_jam (void) 490 { 491 class loop *loop; 492 bool changed = false; 493 494 gcc_assert (scev_initialized_p ()); 495 496 /* Go through all innermost loops. */ 497 FOR_EACH_LOOP (loop, LI_ONLY_INNERMOST) 498 { 499 class loop *outer = loop_outer (loop); 500 501 if (loop_depth (loop) < 2 502 || optimize_loop_nest_for_size_p (outer)) 503 continue; 504 505 if (!unroll_jam_possible_p (outer, loop)) 506 continue; 507 508 vec<data_reference_p> datarefs; 509 vec<ddr_p> dependences; 510 unsigned unroll_factor, profit_unroll, removed; 511 class tree_niter_desc desc; 512 bool unroll = false; 513 514 auto_vec<loop_p, 3> loop_nest; 515 dependences.create (10); 516 datarefs.create (10); 517 if (!compute_data_dependences_for_loop (outer, true, &loop_nest, 518 &datarefs, &dependences)) 519 { 520 if (dump_file && (dump_flags & TDF_DETAILS)) 521 fprintf (dump_file, "Cannot analyze data dependencies\n"); 522 free_data_refs (datarefs); 523 free_dependence_relations (dependences); 524 continue; 525 } 526 if (!datarefs.length ()) 527 continue; 528 529 if (dump_file && (dump_flags & TDF_DETAILS)) 530 dump_data_dependence_relations (dump_file, dependences); 531 532 unroll_factor = (unsigned)-1; 533 profit_unroll = 1; 534 removed = 0; 535 536 /* Check all dependencies. */ 537 unsigned i; 538 struct data_dependence_relation *ddr; 539 FOR_EACH_VEC_ELT (dependences, i, ddr) 540 { 541 struct data_reference *dra, *drb; 542 543 /* If the refs are independend there's nothing to do. */ 544 if (DDR_ARE_DEPENDENT (ddr) == chrec_known) 545 continue; 546 dra = DDR_A (ddr); 547 drb = DDR_B (ddr); 548 /* Nothing interesting for the self dependencies. */ 549 if (dra == drb) 550 continue; 551 552 /* Now check the distance vector, for determining a sensible 553 outer unroll factor, and for validity of merging the inner 554 loop copies. */ 555 if (!adjust_unroll_factor (loop, ddr, &unroll_factor, &profit_unroll, 556 &removed)) 557 { 558 /* Couldn't get the distance vector. For two reads that's 559 harmless (we assume we should unroll). For at least 560 one write this means we can't check the dependence direction 561 and hence can't determine safety. */ 562 563 if (DR_IS_WRITE (dra) || DR_IS_WRITE (drb)) 564 { 565 unroll_factor = 0; 566 break; 567 } 568 } 569 } 570 571 /* We regard a user-specified minimum percentage of zero as a request 572 to ignore all profitability concerns and apply the transformation 573 always. */ 574 if (!param_unroll_jam_min_percent) 575 profit_unroll = MAX(2, profit_unroll); 576 else if (removed * 100 / datarefs.length () 577 < (unsigned)param_unroll_jam_min_percent) 578 profit_unroll = 1; 579 if (unroll_factor > profit_unroll) 580 unroll_factor = profit_unroll; 581 if (unroll_factor > (unsigned)param_unroll_jam_max_unroll) 582 unroll_factor = param_unroll_jam_max_unroll; 583 unroll = (unroll_factor > 1 584 && can_unroll_loop_p (outer, unroll_factor, &desc)); 585 586 if (unroll) 587 { 588 if (dump_enabled_p ()) 589 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS | TDF_DETAILS, 590 find_loop_location (outer), 591 "applying unroll and jam with factor %d\n", 592 unroll_factor); 593 initialize_original_copy_tables (); 594 tree_unroll_loop (outer, unroll_factor, single_dom_exit (outer), 595 &desc); 596 free_original_copy_tables (); 597 fuse_loops (outer->inner); 598 changed = true; 599 } 600 601 loop_nest.release (); 602 free_dependence_relations (dependences); 603 free_data_refs (datarefs); 604 } 605 606 if (changed) 607 { 608 scev_reset (); 609 free_dominance_info (CDI_DOMINATORS); 610 return TODO_cleanup_cfg; 611 } 612 return 0; 613 } 614 615 /* Pass boilerplate */ 616 617 namespace { 618 619 const pass_data pass_data_loop_jam = 620 { 621 GIMPLE_PASS, /* type */ 622 "unrolljam", /* name */ 623 OPTGROUP_LOOP, /* optinfo_flags */ 624 TV_LOOP_JAM, /* tv_id */ 625 PROP_cfg, /* properties_required */ 626 0, /* properties_provided */ 627 0, /* properties_destroyed */ 628 0, /* todo_flags_start */ 629 0, /* todo_flags_finish */ 630 }; 631 632 class pass_loop_jam : public gimple_opt_pass 633 { 634 public: 635 pass_loop_jam (gcc::context *ctxt) 636 : gimple_opt_pass (pass_data_loop_jam, ctxt) 637 {} 638 639 /* opt_pass methods: */ 640 virtual bool gate (function *) { return flag_unroll_jam != 0; } 641 virtual unsigned int execute (function *); 642 643 }; 644 645 unsigned int 646 pass_loop_jam::execute (function *fun) 647 { 648 if (number_of_loops (fun) <= 1) 649 return 0; 650 651 return tree_loop_unroll_and_jam (); 652 } 653 654 } 655 656 gimple_opt_pass * 657 make_pass_loop_jam (gcc::context *ctxt) 658 { 659 return new pass_loop_jam (ctxt); 660 } 661