/* Swing Modulo Scheduling implementation.
   Copyright (C) 2004, 2005, 2006, 2007, 2008
   Free Software Foundation, Inc.
   Contributed by Ayal Zaks and Mustafa Hagog <zaks,mustafa@il.ibm.com>

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3, or (at your option) any later
version.

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */


#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "tm.h"
#include "toplev.h"
#include "rtl.h"
#include "tm_p.h"
#include "hard-reg-set.h"
#include "regs.h"
#include "function.h"
#include "flags.h"
#include "insn-config.h"
#include "insn-attr.h"
#include "except.h"
#include "toplev.h"
#include "recog.h"
#include "sched-int.h"
#include "target.h"
#include "cfglayout.h"
#include "cfgloop.h"
#include "cfghooks.h"
#include "expr.h"
#include "params.h"
#include "gcov-io.h"
#include "ddg.h"
#include "timevar.h"
#include "tree-pass.h"
#include "dbgcnt.h"

#ifdef INSN_SCHEDULING

/* This file contains the implementation of the Swing Modulo Scheduler,
   described in the following references:
   [1] J. Llosa, A. Gonzalez, E. Ayguade, M. Valero., and J. Eckhardt.
       Lifetime--sensitive modulo scheduling in a production environment.
       IEEE Trans. on Comps., 50(3), March 2001
   [2] J. Llosa, A. Gonzalez, E. Ayguade, and M. Valero.
       Swing Modulo Scheduling: A Lifetime Sensitive Approach.
       PACT '96 , pages 80-87, October 1996 (Boston - Massachusetts - USA).

 The basic structure is:
 1. Build a data-dependence graph (DDG) for each loop.
 2. Use the DDG to order the insns of a loop (not in topological order
    necessarily, but rather) trying to place each insn after all its
    predecessors _or_ after all its successors.
 3. Compute MII: a lower bound on the number of cycles to schedule the loop.
 4. Use the ordering to perform list-scheduling of the loop:
    1. Set II = MII.  We will try to schedule the loop within II cycles.
    2. Try to schedule the insns one by one according to the ordering.
       For each insn compute an interval of cycles by considering already-
       scheduled preds and succs (and associated latencies); try to place
       the insn in the cycles of this window checking for potential
       resource conflicts (using the DFA interface).
       Note: this is different from the cycle-scheduling of schedule_insns;
       here the insns are not scheduled monotonically top-down (nor bottom-
       up).
    3. If failed in scheduling all insns - bump II++ and try again, unless
       II reaches an upper bound MaxII, in which case report failure.
 5. If we succeeded in scheduling the loop within II cycles, we now
    generate prolog and epilog, decrease the counter of the loop, and
    perform modulo variable expansion for live ranges that span more than
    II cycles (i.e. use register copies to prevent a def from overwriting
    itself before reaching the use).

 SMS works with countable loops (1) whose control part can be easily
 decoupled from the rest of the loop and (2) whose loop count can
 be easily adjusted.  This is because we peel a constant number of
 iterations into a prologue and epilogue for which we want to avoid
 emitting the control part, and a kernel which is to iterate that
 constant number of iterations less than the original loop.
So the 93 control part should be a set of insns clearly identified and having 94 its own iv, not otherwise used in the loop (at-least for now), which 95 initializes a register before the loop to the number of iterations. 96 Currently SMS relies on the do-loop pattern to recognize such loops, 97 where (1) the control part comprises of all insns defining and/or 98 using a certain 'count' register and (2) the loop count can be 99 adjusted by modifying this register prior to the loop. 100 TODO: Rely on cfgloop analysis instead. */ 101 102 /* This page defines partial-schedule structures and functions for 103 modulo scheduling. */ 104 105 typedef struct partial_schedule *partial_schedule_ptr; 106 typedef struct ps_insn *ps_insn_ptr; 107 108 /* The minimum (absolute) cycle that a node of ps was scheduled in. */ 109 #define PS_MIN_CYCLE(ps) (((partial_schedule_ptr)(ps))->min_cycle) 110 111 /* The maximum (absolute) cycle that a node of ps was scheduled in. */ 112 #define PS_MAX_CYCLE(ps) (((partial_schedule_ptr)(ps))->max_cycle) 113 114 /* Perform signed modulo, always returning a non-negative value. */ 115 #define SMODULO(x,y) ((x) % (y) < 0 ? ((x) % (y) + (y)) : (x) % (y)) 116 117 /* The number of different iterations the nodes in ps span, assuming 118 the stage boundaries are placed efficiently. */ 119 #define PS_STAGE_COUNT(ps) ((PS_MAX_CYCLE (ps) - PS_MIN_CYCLE (ps) \ 120 + 1 + (ps)->ii - 1) / (ps)->ii) 121 122 /* A single instruction in the partial schedule. */ 123 struct ps_insn 124 { 125 /* The corresponding DDG_NODE. */ 126 ddg_node_ptr node; 127 128 /* The (absolute) cycle in which the PS instruction is scheduled. 129 Same as SCHED_TIME (node). */ 130 int cycle; 131 132 /* The next/prev PS_INSN in the same row. */ 133 ps_insn_ptr next_in_row, 134 prev_in_row; 135 136 /* The number of nodes in the same row that come after this node. */ 137 int row_rest_count; 138 }; 139 140 /* Holds the partial schedule as an array of II rows. 
Each entry of the 141 array points to a linked list of PS_INSNs, which represents the 142 instructions that are scheduled for that row. */ 143 struct partial_schedule 144 { 145 int ii; /* Number of rows in the partial schedule. */ 146 int history; /* Threshold for conflict checking using DFA. */ 147 148 /* rows[i] points to linked list of insns scheduled in row i (0<=i<ii). */ 149 ps_insn_ptr *rows; 150 151 /* The earliest absolute cycle of an insn in the partial schedule. */ 152 int min_cycle; 153 154 /* The latest absolute cycle of an insn in the partial schedule. */ 155 int max_cycle; 156 157 ddg_ptr g; /* The DDG of the insns in the partial schedule. */ 158 }; 159 160 /* We use this to record all the register replacements we do in 161 the kernel so we can undo SMS if it is not profitable. */ 162 struct undo_replace_buff_elem 163 { 164 rtx insn; 165 rtx orig_reg; 166 rtx new_reg; 167 struct undo_replace_buff_elem *next; 168 }; 169 170 171 172 static partial_schedule_ptr create_partial_schedule (int ii, ddg_ptr, int history); 173 static void free_partial_schedule (partial_schedule_ptr); 174 static void reset_partial_schedule (partial_schedule_ptr, int new_ii); 175 void print_partial_schedule (partial_schedule_ptr, FILE *); 176 static void verify_partial_schedule (partial_schedule_ptr, sbitmap); 177 static ps_insn_ptr ps_add_node_check_conflicts (partial_schedule_ptr, 178 ddg_node_ptr node, int cycle, 179 sbitmap must_precede, 180 sbitmap must_follow); 181 static void rotate_partial_schedule (partial_schedule_ptr, int); 182 void set_row_column_for_ps (partial_schedule_ptr); 183 static void ps_insert_empty_row (partial_schedule_ptr, int, sbitmap); 184 static int compute_split_row (sbitmap, int, int, int, ddg_node_ptr); 185 186 187 /* This page defines constants and structures for the modulo scheduling 188 driver. 
*/ 189 190 static int sms_order_nodes (ddg_ptr, int, int *, int *); 191 static void set_node_sched_params (ddg_ptr); 192 static partial_schedule_ptr sms_schedule_by_order (ddg_ptr, int, int, int *); 193 static void permute_partial_schedule (partial_schedule_ptr, rtx); 194 static void generate_prolog_epilog (partial_schedule_ptr, struct loop *, 195 rtx, rtx); 196 static void duplicate_insns_of_cycles (partial_schedule_ptr, 197 int, int, int, rtx); 198 199 #define SCHED_ASAP(x) (((node_sched_params_ptr)(x)->aux.info)->asap) 200 #define SCHED_TIME(x) (((node_sched_params_ptr)(x)->aux.info)->time) 201 #define SCHED_FIRST_REG_MOVE(x) \ 202 (((node_sched_params_ptr)(x)->aux.info)->first_reg_move) 203 #define SCHED_NREG_MOVES(x) \ 204 (((node_sched_params_ptr)(x)->aux.info)->nreg_moves) 205 #define SCHED_ROW(x) (((node_sched_params_ptr)(x)->aux.info)->row) 206 #define SCHED_STAGE(x) (((node_sched_params_ptr)(x)->aux.info)->stage) 207 #define SCHED_COLUMN(x) (((node_sched_params_ptr)(x)->aux.info)->column) 208 209 /* The scheduling parameters held for each node. */ 210 typedef struct node_sched_params 211 { 212 int asap; /* A lower-bound on the absolute scheduling cycle. */ 213 int time; /* The absolute scheduling cycle (time >= asap). */ 214 215 /* The following field (first_reg_move) is a pointer to the first 216 register-move instruction added to handle the modulo-variable-expansion 217 of the register defined by this node. This register-move copies the 218 original register defined by the node. */ 219 rtx first_reg_move; 220 221 /* The number of register-move instructions added, immediately preceding 222 first_reg_move. */ 223 int nreg_moves; 224 225 int row; /* Holds time % ii. */ 226 int stage; /* Holds time / ii. */ 227 228 /* The column of a node inside the ps. If nodes u, v are on the same row, 229 u will precede v if column (u) < column (v). 
*/ 230 int column; 231 } *node_sched_params_ptr; 232 233 234 /* The following three functions are copied from the current scheduler 235 code in order to use sched_analyze() for computing the dependencies. 236 They are used when initializing the sched_info structure. */ 237 static const char * 238 sms_print_insn (const_rtx insn, int aligned ATTRIBUTE_UNUSED) 239 { 240 static char tmp[80]; 241 242 sprintf (tmp, "i%4d", INSN_UID (insn)); 243 return tmp; 244 } 245 246 static void 247 compute_jump_reg_dependencies (rtx insn ATTRIBUTE_UNUSED, 248 regset cond_exec ATTRIBUTE_UNUSED, 249 regset used ATTRIBUTE_UNUSED, 250 regset set ATTRIBUTE_UNUSED) 251 { 252 } 253 254 static struct common_sched_info_def sms_common_sched_info; 255 256 static struct sched_deps_info_def sms_sched_deps_info = 257 { 258 compute_jump_reg_dependencies, 259 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, 260 NULL, 261 0, 0, 0 262 }; 263 264 static struct haifa_sched_info sms_sched_info = 265 { 266 NULL, 267 NULL, 268 NULL, 269 NULL, 270 NULL, 271 sms_print_insn, 272 NULL, 273 NULL, /* insn_finishes_block_p */ 274 NULL, NULL, 275 NULL, NULL, 276 0, 0, 277 278 NULL, NULL, NULL, 279 0 280 }; 281 282 /* Given HEAD and TAIL which are the first and last insns in a loop; 283 return the register which controls the loop. Return zero if it has 284 more than one occurrence in the loop besides the control part or the 285 do-loop pattern is not of the form we expect. */ 286 static rtx 287 doloop_register_get (rtx head ATTRIBUTE_UNUSED, rtx tail ATTRIBUTE_UNUSED) 288 { 289 #ifdef HAVE_doloop_end 290 rtx reg, condition, insn, first_insn_not_to_check; 291 292 if (!JUMP_P (tail)) 293 return NULL_RTX; 294 295 /* TODO: Free SMS's dependence on doloop_condition_get. */ 296 condition = doloop_condition_get (tail); 297 if (! 
condition) 298 return NULL_RTX; 299 300 if (REG_P (XEXP (condition, 0))) 301 reg = XEXP (condition, 0); 302 else if (GET_CODE (XEXP (condition, 0)) == PLUS 303 && REG_P (XEXP (XEXP (condition, 0), 0))) 304 reg = XEXP (XEXP (condition, 0), 0); 305 else 306 gcc_unreachable (); 307 308 /* Check that the COUNT_REG has no other occurrences in the loop 309 until the decrement. We assume the control part consists of 310 either a single (parallel) branch-on-count or a (non-parallel) 311 branch immediately preceded by a single (decrement) insn. */ 312 first_insn_not_to_check = (GET_CODE (PATTERN (tail)) == PARALLEL ? tail 313 : PREV_INSN (tail)); 314 315 for (insn = head; insn != first_insn_not_to_check; insn = NEXT_INSN (insn)) 316 if (reg_mentioned_p (reg, insn)) 317 { 318 if (dump_file) 319 { 320 fprintf (dump_file, "SMS count_reg found "); 321 print_rtl_single (dump_file, reg); 322 fprintf (dump_file, " outside control in insn:\n"); 323 print_rtl_single (dump_file, insn); 324 } 325 326 return NULL_RTX; 327 } 328 329 return reg; 330 #else 331 return NULL_RTX; 332 #endif 333 } 334 335 /* Check if COUNT_REG is set to a constant in the PRE_HEADER block, so 336 that the number of iterations is a compile-time constant. If so, 337 return the rtx that sets COUNT_REG to a constant, and set COUNT to 338 this constant. Otherwise return 0. */ 339 static rtx 340 const_iteration_count (rtx count_reg, basic_block pre_header, 341 HOST_WIDEST_INT * count) 342 { 343 rtx insn; 344 rtx head, tail; 345 346 if (! 
pre_header) 347 return NULL_RTX; 348 349 get_ebb_head_tail (pre_header, pre_header, &head, &tail); 350 351 for (insn = tail; insn != PREV_INSN (head); insn = PREV_INSN (insn)) 352 if (NONDEBUG_INSN_P (insn) && single_set (insn) && 353 rtx_equal_p (count_reg, SET_DEST (single_set (insn)))) 354 { 355 rtx pat = single_set (insn); 356 357 if (CONST_INT_P (SET_SRC (pat))) 358 { 359 *count = INTVAL (SET_SRC (pat)); 360 return insn; 361 } 362 363 return NULL_RTX; 364 } 365 366 return NULL_RTX; 367 } 368 369 /* A very simple resource-based lower bound on the initiation interval. 370 ??? Improve the accuracy of this bound by considering the 371 utilization of various units. */ 372 static int 373 res_MII (ddg_ptr g) 374 { 375 if (targetm.sched.sms_res_mii) 376 return targetm.sched.sms_res_mii (g); 377 378 return ((g->num_nodes - g->num_debug) / issue_rate); 379 } 380 381 382 /* Points to the array that contains the sched data for each node. */ 383 static node_sched_params_ptr node_sched_params; 384 385 /* Allocate sched_params for each node and initialize it. Assumes that 386 the aux field of each node contain the asap bound (computed earlier), 387 and copies it into the sched_params field. */ 388 static void 389 set_node_sched_params (ddg_ptr g) 390 { 391 int i; 392 393 /* Allocate for each node in the DDG a place to hold the "sched_data". */ 394 /* Initialize ASAP/ALAP/HIGHT to zero. */ 395 node_sched_params = (node_sched_params_ptr) 396 xcalloc (g->num_nodes, 397 sizeof (struct node_sched_params)); 398 399 /* Set the pointer of the general data of the node to point to the 400 appropriate sched_params structure. */ 401 for (i = 0; i < g->num_nodes; i++) 402 { 403 /* Watch out for aliasing problems? */ 404 node_sched_params[i].asap = g->nodes[i].aux.count; 405 g->nodes[i].aux.info = &node_sched_params[i]; 406 } 407 } 408 409 static void 410 print_node_sched_params (FILE *file, int num_nodes, ddg_ptr g) 411 { 412 int i; 413 414 if (! 
file) 415 return; 416 for (i = 0; i < num_nodes; i++) 417 { 418 node_sched_params_ptr nsp = &node_sched_params[i]; 419 rtx reg_move = nsp->first_reg_move; 420 int j; 421 422 fprintf (file, "Node = %d; INSN = %d\n", i, 423 (INSN_UID (g->nodes[i].insn))); 424 fprintf (file, " asap = %d:\n", nsp->asap); 425 fprintf (file, " time = %d:\n", nsp->time); 426 fprintf (file, " nreg_moves = %d:\n", nsp->nreg_moves); 427 for (j = 0; j < nsp->nreg_moves; j++) 428 { 429 fprintf (file, " reg_move = "); 430 print_rtl_single (file, reg_move); 431 reg_move = PREV_INSN (reg_move); 432 } 433 } 434 } 435 436 /* 437 Breaking intra-loop register anti-dependences: 438 Each intra-loop register anti-dependence implies a cross-iteration true 439 dependence of distance 1. Therefore, we can remove such false dependencies 440 and figure out if the partial schedule broke them by checking if (for a 441 true-dependence of distance 1): SCHED_TIME (def) < SCHED_TIME (use) and 442 if so generate a register move. The number of such moves is equal to: 443 SCHED_TIME (use) - SCHED_TIME (def) { 0 broken 444 nreg_moves = ----------------------------------- + 1 - { dependence. 445 ii { 1 if not. 446 */ 447 static struct undo_replace_buff_elem * 448 generate_reg_moves (partial_schedule_ptr ps, bool rescan) 449 { 450 ddg_ptr g = ps->g; 451 int ii = ps->ii; 452 int i; 453 struct undo_replace_buff_elem *reg_move_replaces = NULL; 454 455 for (i = 0; i < g->num_nodes; i++) 456 { 457 ddg_node_ptr u = &g->nodes[i]; 458 ddg_edge_ptr e; 459 int nreg_moves = 0, i_reg_move; 460 sbitmap *uses_of_defs; 461 rtx last_reg_move; 462 rtx prev_reg, old_reg; 463 464 /* Compute the number of reg_moves needed for u, by looking at life 465 ranges started at u (excluding self-loops). 
*/ 466 for (e = u->out; e; e = e->next_out) 467 if (e->type == TRUE_DEP && e->dest != e->src) 468 { 469 int nreg_moves4e = (SCHED_TIME (e->dest) - SCHED_TIME (e->src)) / ii; 470 471 if (e->distance == 1) 472 nreg_moves4e = (SCHED_TIME (e->dest) - SCHED_TIME (e->src) + ii) / ii; 473 474 /* If dest precedes src in the schedule of the kernel, then dest 475 will read before src writes and we can save one reg_copy. */ 476 if (SCHED_ROW (e->dest) == SCHED_ROW (e->src) 477 && SCHED_COLUMN (e->dest) < SCHED_COLUMN (e->src)) 478 nreg_moves4e--; 479 480 nreg_moves = MAX (nreg_moves, nreg_moves4e); 481 } 482 483 if (nreg_moves == 0) 484 continue; 485 486 /* Every use of the register defined by node may require a different 487 copy of this register, depending on the time the use is scheduled. 488 Set a bitmap vector, telling which nodes use each copy of this 489 register. */ 490 uses_of_defs = sbitmap_vector_alloc (nreg_moves, g->num_nodes); 491 sbitmap_vector_zero (uses_of_defs, nreg_moves); 492 for (e = u->out; e; e = e->next_out) 493 if (e->type == TRUE_DEP && e->dest != e->src) 494 { 495 int dest_copy = (SCHED_TIME (e->dest) - SCHED_TIME (e->src)) / ii; 496 497 if (e->distance == 1) 498 dest_copy = (SCHED_TIME (e->dest) - SCHED_TIME (e->src) + ii) / ii; 499 500 if (SCHED_ROW (e->dest) == SCHED_ROW (e->src) 501 && SCHED_COLUMN (e->dest) < SCHED_COLUMN (e->src)) 502 dest_copy--; 503 504 if (dest_copy) 505 SET_BIT (uses_of_defs[dest_copy - 1], e->dest->cuid); 506 } 507 508 /* Now generate the reg_moves, attaching relevant uses to them. */ 509 SCHED_NREG_MOVES (u) = nreg_moves; 510 old_reg = prev_reg = copy_rtx (SET_DEST (single_set (u->insn))); 511 /* Insert the reg-moves right before the notes which precede 512 the insn they relates to. 
*/ 513 last_reg_move = u->first_note; 514 515 for (i_reg_move = 0; i_reg_move < nreg_moves; i_reg_move++) 516 { 517 unsigned int i_use = 0; 518 rtx new_reg = gen_reg_rtx (GET_MODE (prev_reg)); 519 rtx reg_move = gen_move_insn (new_reg, prev_reg); 520 sbitmap_iterator sbi; 521 522 add_insn_before (reg_move, last_reg_move, NULL); 523 last_reg_move = reg_move; 524 525 if (!SCHED_FIRST_REG_MOVE (u)) 526 SCHED_FIRST_REG_MOVE (u) = reg_move; 527 528 EXECUTE_IF_SET_IN_SBITMAP (uses_of_defs[i_reg_move], 0, i_use, sbi) 529 { 530 struct undo_replace_buff_elem *rep; 531 532 rep = (struct undo_replace_buff_elem *) 533 xcalloc (1, sizeof (struct undo_replace_buff_elem)); 534 rep->insn = g->nodes[i_use].insn; 535 rep->orig_reg = old_reg; 536 rep->new_reg = new_reg; 537 538 if (! reg_move_replaces) 539 reg_move_replaces = rep; 540 else 541 { 542 rep->next = reg_move_replaces; 543 reg_move_replaces = rep; 544 } 545 546 replace_rtx (g->nodes[i_use].insn, old_reg, new_reg); 547 if (rescan) 548 df_insn_rescan (g->nodes[i_use].insn); 549 } 550 551 prev_reg = new_reg; 552 } 553 sbitmap_vector_free (uses_of_defs); 554 } 555 return reg_move_replaces; 556 } 557 558 /* Free memory allocated for the undo buffer. */ 559 static void 560 free_undo_replace_buff (struct undo_replace_buff_elem *reg_move_replaces) 561 { 562 563 while (reg_move_replaces) 564 { 565 struct undo_replace_buff_elem *rep = reg_move_replaces; 566 567 reg_move_replaces = reg_move_replaces->next; 568 free (rep); 569 } 570 } 571 572 /* Bump the SCHED_TIMEs of all nodes to start from zero. Set the values 573 of SCHED_ROW and SCHED_STAGE. 
*/ 574 static void 575 normalize_sched_times (partial_schedule_ptr ps) 576 { 577 int row; 578 int amount = PS_MIN_CYCLE (ps); 579 int ii = ps->ii; 580 ps_insn_ptr crr_insn; 581 582 for (row = 0; row < ii; row++) 583 for (crr_insn = ps->rows[row]; crr_insn; crr_insn = crr_insn->next_in_row) 584 { 585 ddg_node_ptr u = crr_insn->node; 586 int normalized_time = SCHED_TIME (u) - amount; 587 588 if (dump_file) 589 fprintf (dump_file, "crr_insn->node=%d, crr_insn->cycle=%d,\ 590 min_cycle=%d\n", crr_insn->node->cuid, SCHED_TIME 591 (u), ps->min_cycle); 592 gcc_assert (SCHED_TIME (u) >= ps->min_cycle); 593 gcc_assert (SCHED_TIME (u) <= ps->max_cycle); 594 SCHED_TIME (u) = normalized_time; 595 SCHED_ROW (u) = normalized_time % ii; 596 SCHED_STAGE (u) = normalized_time / ii; 597 } 598 } 599 600 /* Set SCHED_COLUMN of each node according to its position in PS. */ 601 static void 602 set_columns_for_ps (partial_schedule_ptr ps) 603 { 604 int row; 605 606 for (row = 0; row < ps->ii; row++) 607 { 608 ps_insn_ptr cur_insn = ps->rows[row]; 609 int column = 0; 610 611 for (; cur_insn; cur_insn = cur_insn->next_in_row) 612 SCHED_COLUMN (cur_insn->node) = column++; 613 } 614 } 615 616 /* Permute the insns according to their order in PS, from row 0 to 617 row ii-1, and position them right before LAST. This schedules 618 the insns of the loop kernel. 
*/ 619 static void 620 permute_partial_schedule (partial_schedule_ptr ps, rtx last) 621 { 622 int ii = ps->ii; 623 int row; 624 ps_insn_ptr ps_ij; 625 626 for (row = 0; row < ii ; row++) 627 for (ps_ij = ps->rows[row]; ps_ij; ps_ij = ps_ij->next_in_row) 628 if (PREV_INSN (last) != ps_ij->node->insn) 629 reorder_insns_nobb (ps_ij->node->first_note, ps_ij->node->insn, 630 PREV_INSN (last)); 631 } 632 633 static void 634 duplicate_insns_of_cycles (partial_schedule_ptr ps, int from_stage, 635 int to_stage, int for_prolog, rtx count_reg) 636 { 637 int row; 638 ps_insn_ptr ps_ij; 639 640 for (row = 0; row < ps->ii; row++) 641 for (ps_ij = ps->rows[row]; ps_ij; ps_ij = ps_ij->next_in_row) 642 { 643 ddg_node_ptr u_node = ps_ij->node; 644 int j, i_reg_moves; 645 rtx reg_move = NULL_RTX; 646 647 /* Do not duplicate any insn which refers to count_reg as it 648 belongs to the control part. 649 TODO: This should be done by analyzing the control part of 650 the loop. */ 651 if (reg_mentioned_p (count_reg, u_node->insn)) 652 continue; 653 654 if (for_prolog) 655 { 656 /* SCHED_STAGE (u_node) >= from_stage == 0. Generate increasing 657 number of reg_moves starting with the second occurrence of 658 u_node, which is generated if its SCHED_STAGE <= to_stage. */ 659 i_reg_moves = to_stage - SCHED_STAGE (u_node) + 1; 660 i_reg_moves = MAX (i_reg_moves, 0); 661 i_reg_moves = MIN (i_reg_moves, SCHED_NREG_MOVES (u_node)); 662 663 /* The reg_moves start from the *first* reg_move backwards. */ 664 if (i_reg_moves) 665 { 666 reg_move = SCHED_FIRST_REG_MOVE (u_node); 667 for (j = 1; j < i_reg_moves; j++) 668 reg_move = PREV_INSN (reg_move); 669 } 670 } 671 else /* It's for the epilog. */ 672 { 673 /* SCHED_STAGE (u_node) <= to_stage. Generate all reg_moves, 674 starting to decrease one stage after u_node no longer occurs; 675 that is, generate all reg_moves until 676 SCHED_STAGE (u_node) == from_stage - 1. 
*/ 677 i_reg_moves = SCHED_NREG_MOVES (u_node) 678 - (from_stage - SCHED_STAGE (u_node) - 1); 679 i_reg_moves = MAX (i_reg_moves, 0); 680 i_reg_moves = MIN (i_reg_moves, SCHED_NREG_MOVES (u_node)); 681 682 /* The reg_moves start from the *last* reg_move forwards. */ 683 if (i_reg_moves) 684 { 685 reg_move = SCHED_FIRST_REG_MOVE (u_node); 686 for (j = 1; j < SCHED_NREG_MOVES (u_node); j++) 687 reg_move = PREV_INSN (reg_move); 688 } 689 } 690 691 for (j = 0; j < i_reg_moves; j++, reg_move = NEXT_INSN (reg_move)) 692 emit_insn (copy_rtx (PATTERN (reg_move))); 693 if (SCHED_STAGE (u_node) >= from_stage 694 && SCHED_STAGE (u_node) <= to_stage) 695 duplicate_insn_chain (u_node->first_note, u_node->insn); 696 } 697 } 698 699 700 /* Generate the instructions (including reg_moves) for prolog & epilog. */ 701 static void 702 generate_prolog_epilog (partial_schedule_ptr ps, struct loop *loop, 703 rtx count_reg, rtx count_init) 704 { 705 int i; 706 int last_stage = PS_STAGE_COUNT (ps) - 1; 707 edge e; 708 709 /* Generate the prolog, inserting its insns on the loop-entry edge. */ 710 start_sequence (); 711 712 if (!count_init) 713 { 714 /* Generate instructions at the beginning of the prolog to 715 adjust the loop count by STAGE_COUNT. If loop count is constant 716 (count_init), this constant is adjusted by STAGE_COUNT in 717 generate_prolog_epilog function. */ 718 rtx sub_reg = NULL_RTX; 719 720 sub_reg = expand_simple_binop (GET_MODE (count_reg), MINUS, 721 count_reg, GEN_INT (last_stage), 722 count_reg, 1, OPTAB_DIRECT); 723 gcc_assert (REG_P (sub_reg)); 724 if (REGNO (sub_reg) != REGNO (count_reg)) 725 emit_move_insn (count_reg, sub_reg); 726 } 727 728 for (i = 0; i < last_stage; i++) 729 duplicate_insns_of_cycles (ps, 0, i, 1, count_reg); 730 731 /* Put the prolog on the entry edge. */ 732 e = loop_preheader_edge (loop); 733 split_edge_and_insert (e, get_insns ()); 734 735 end_sequence (); 736 737 /* Generate the epilog, inserting its insns on the loop-exit edge. 
*/ 738 start_sequence (); 739 740 for (i = 0; i < last_stage; i++) 741 duplicate_insns_of_cycles (ps, i + 1, last_stage, 0, count_reg); 742 743 /* Put the epilogue on the exit edge. */ 744 gcc_assert (single_exit (loop)); 745 e = single_exit (loop); 746 split_edge_and_insert (e, get_insns ()); 747 end_sequence (); 748 } 749 750 /* Return true if all the BBs of the loop are empty except the 751 loop header. */ 752 static bool 753 loop_single_full_bb_p (struct loop *loop) 754 { 755 unsigned i; 756 basic_block *bbs = get_loop_body (loop); 757 758 for (i = 0; i < loop->num_nodes ; i++) 759 { 760 rtx head, tail; 761 bool empty_bb = true; 762 763 if (bbs[i] == loop->header) 764 continue; 765 766 /* Make sure that basic blocks other than the header 767 have only notes labels or jumps. */ 768 get_ebb_head_tail (bbs[i], bbs[i], &head, &tail); 769 for (; head != NEXT_INSN (tail); head = NEXT_INSN (head)) 770 { 771 if (NOTE_P (head) || LABEL_P (head) 772 || (INSN_P (head) && (DEBUG_INSN_P (head) || JUMP_P (head)))) 773 continue; 774 empty_bb = false; 775 break; 776 } 777 778 if (! empty_bb) 779 { 780 free (bbs); 781 return false; 782 } 783 } 784 free (bbs); 785 return true; 786 } 787 788 /* A simple loop from SMS point of view; it is a loop that is composed of 789 either a single basic block or two BBs - a header and a latch. */ 790 #define SIMPLE_SMS_LOOP_P(loop) ((loop->num_nodes < 3 ) \ 791 && (EDGE_COUNT (loop->latch->preds) == 1) \ 792 && (EDGE_COUNT (loop->latch->succs) == 1)) 793 794 /* Return true if the loop is in its canonical form and false if not. 795 i.e. SIMPLE_SMS_LOOP_P and have one preheader block, and single exit. 
*/ 796 static bool 797 loop_canon_p (struct loop *loop) 798 { 799 800 if (loop->inner || !loop_outer (loop)) 801 { 802 if (dump_file) 803 fprintf (dump_file, "SMS loop inner or !loop_outer\n"); 804 return false; 805 } 806 807 if (!single_exit (loop)) 808 { 809 if (dump_file) 810 { 811 rtx insn = BB_END (loop->header); 812 813 fprintf (dump_file, "SMS loop many exits "); 814 fprintf (dump_file, " %s %d (file, line)\n", 815 insn_file (insn), insn_line (insn)); 816 } 817 return false; 818 } 819 820 if (! SIMPLE_SMS_LOOP_P (loop) && ! loop_single_full_bb_p (loop)) 821 { 822 if (dump_file) 823 { 824 rtx insn = BB_END (loop->header); 825 826 fprintf (dump_file, "SMS loop many BBs. "); 827 fprintf (dump_file, " %s %d (file, line)\n", 828 insn_file (insn), insn_line (insn)); 829 } 830 return false; 831 } 832 833 return true; 834 } 835 836 /* If there are more than one entry for the loop, 837 make it one by splitting the first entry edge and 838 redirecting the others to the new BB. */ 839 static void 840 canon_loop (struct loop *loop) 841 { 842 edge e; 843 edge_iterator i; 844 845 /* Avoid annoying special cases of edges going to exit 846 block. */ 847 FOR_EACH_EDGE (e, i, EXIT_BLOCK_PTR->preds) 848 if ((e->flags & EDGE_FALLTHRU) && (EDGE_COUNT (e->src->succs) > 1)) 849 split_edge (e); 850 851 if (loop->latch == loop->header 852 || EDGE_COUNT (loop->latch->succs) > 1) 853 { 854 FOR_EACH_EDGE (e, i, loop->header->preds) 855 if (e->src == loop->latch) 856 break; 857 split_edge (e); 858 } 859 } 860 861 /* Setup infos. */ 862 static void 863 setup_sched_infos (void) 864 { 865 memcpy (&sms_common_sched_info, &haifa_common_sched_info, 866 sizeof (sms_common_sched_info)); 867 sms_common_sched_info.sched_pass_id = SCHED_SMS_PASS; 868 common_sched_info = &sms_common_sched_info; 869 870 sched_deps_info = &sms_sched_deps_info; 871 current_sched_info = &sms_sched_info; 872 } 873 874 /* Probability in % that the sms-ed loop rolls enough so that optimized 875 version may be entered. 
   Just a guess.  */
#define PROB_SMS_ENOUGH_ITERATIONS 80

/* Used to calculate the upper bound of ii (maxii = MAXII_FACTOR * mii).  */
#define MAXII_FACTOR 2

/* Main entry point, perform SMS scheduling on the loops of the function
   that consist of single basic blocks.
   Phase 1 builds a DDG per candidate loop; phase 2 orders and schedules
   each DDG and, on success, rewrites the loop into kernel + prolog +
   epilog (see the file-head overview).  */
static void
sms_schedule (void)
{
  rtx insn;
  ddg_ptr *g_arr, g;
  int * node_order;
  int maxii, max_asap;
  loop_iterator li;
  partial_schedule_ptr ps;
  basic_block bb = NULL;
  struct loop *loop;
  basic_block condition_bb = NULL;
  edge latch_edge;
  gcov_type trip_count = 0;

  loop_optimizer_init (LOOPS_HAVE_PREHEADERS
                       | LOOPS_HAVE_RECORDED_EXITS);
  if (number_of_loops () <= 1)
    {
      loop_optimizer_finalize ();
      return; /* There are no loops to schedule.  */
    }

  /* Initialize issue_rate.  NOTE: the target hook may behave differently
     before reload, so temporarily pretend reload has completed.  */
  if (targetm.sched.issue_rate)
    {
      int temp = reload_completed;

      reload_completed = 1;
      issue_rate = targetm.sched.issue_rate ();
      reload_completed = temp;
    }
  else
    issue_rate = 1;

  /* Initialize the scheduler.  */
  setup_sched_infos ();
  haifa_sched_init ();

  /* Allocate memory to hold the DDG array one entry for each loop.
     We use loop->num as index into this array.  */
  g_arr = XCNEWVEC (ddg_ptr, number_of_loops ());

  if (dump_file)
    {
      fprintf (dump_file, "\n\nSMS analysis phase\n");
      fprintf (dump_file, "===================\n\n");
    }

  /* Build DDGs for all the relevant loops and hold them in G_ARR
     indexed by the loop index.  */
  FOR_EACH_LOOP (li, loop, 0)
    {
      rtx head, tail;
      rtx count_reg;

      /* For debugging.  */
      if (dbg_cnt (sms_sched_loop) == false)
        {
          if (dump_file)
            fprintf (dump_file, "SMS reached max limit... \n");

          break;
        }

      if (dump_file)
        {
          rtx insn = BB_END (loop->header);

          fprintf (dump_file, "SMS loop num: %d, file: %s, line: %d\n",
                   loop->num, insn_file (insn), insn_line (insn));

        }

      if (! loop_canon_p (loop))
        continue;

      if (! loop_single_full_bb_p (loop))
        {
          if (dump_file)
            fprintf (dump_file, "SMS not loop_single_full_bb_p\n");
          continue;
        }

      bb = loop->header;

      get_ebb_head_tail (bb, bb, &head, &tail);
      latch_edge = loop_latch_edge (loop);
      gcc_assert (single_exit (loop));
      if (single_exit (loop)->count)
        trip_count = latch_edge->count / single_exit (loop)->count;

      /* Perform SMS only on loops whose average iteration count is
         above the threshold.  */

      if ( latch_edge->count
          && (latch_edge->count < single_exit (loop)->count * SMS_LOOP_AVERAGE_COUNT_THRESHOLD))
        {
          if (dump_file)
            {
              fprintf (dump_file, " %s %d (file, line)\n",
                       insn_file (tail), insn_line (tail));
              fprintf (dump_file, "SMS single-bb-loop\n");
              if (profile_info && flag_branch_probabilities)
                {
                  fprintf (dump_file, "SMS loop-count ");
                  fprintf (dump_file, HOST_WIDEST_INT_PRINT_DEC,
                           (HOST_WIDEST_INT) bb->count);
                  fprintf (dump_file, "\n");
                  fprintf (dump_file, "SMS trip-count ");
                  fprintf (dump_file, HOST_WIDEST_INT_PRINT_DEC,
                           (HOST_WIDEST_INT) trip_count);
                  fprintf (dump_file, "\n");
                  fprintf (dump_file, "SMS profile-sum-max ");
                  fprintf (dump_file, HOST_WIDEST_INT_PRINT_DEC,
                           (HOST_WIDEST_INT) profile_info->sum_max);
                  fprintf (dump_file, "\n");
                }
            }
          continue;
        }

      /* Make sure this is a doloop.  */
      if ( !(count_reg = doloop_register_get (head, tail)))
        {
          if (dump_file)
            fprintf (dump_file, "SMS doloop_register_get failed\n");
          continue;
        }

      /* Don't handle BBs with calls or barriers, or !single_set insns,
         or auto-increment insns (to avoid creating invalid reg-moves
         for the auto-increment insns).
         ??? Should handle auto-increment insns.
         ??? Should handle insns defining subregs.  */
      for (insn = head; insn != NEXT_INSN (tail); insn = NEXT_INSN (insn))
        {
          rtx set;

          if (CALL_P (insn)
              || BARRIER_P (insn)
              || (NONDEBUG_INSN_P (insn) && !JUMP_P (insn)
                  && !single_set (insn) && GET_CODE (PATTERN (insn)) != USE)
              || (FIND_REG_INC_NOTE (insn, NULL_RTX) != 0)
              || (INSN_P (insn) && (set = single_set (insn))
                  && GET_CODE (SET_DEST (set)) == SUBREG))
            break;
        }

      /* INSN stopped before the end iff some insn above disqualified
         the loop; dump the reason and skip it.  */
      if (insn != NEXT_INSN (tail))
        {
          if (dump_file)
            {
              if (CALL_P (insn))
                fprintf (dump_file, "SMS loop-with-call\n");
              else if (BARRIER_P (insn))
                fprintf (dump_file, "SMS loop-with-barrier\n");
              else if (FIND_REG_INC_NOTE (insn, NULL_RTX) != 0)
                fprintf (dump_file, "SMS reg inc\n");
              else if ((NONDEBUG_INSN_P (insn) && !JUMP_P (insn)
                && !single_set (insn) && GET_CODE (PATTERN (insn)) != USE))
                fprintf (dump_file, "SMS loop-with-not-single-set\n");
              else
                fprintf (dump_file, "SMS loop with subreg in lhs\n");
              print_rtl_single (dump_file, insn);
            }

          continue;
        }

      if (! (g = create_ddg (bb, 0)))
        {
          if (dump_file)
            fprintf (dump_file, "SMS create_ddg failed\n");
          continue;
        }

      g_arr[loop->num] = g;
      if (dump_file)
        fprintf (dump_file, "...OK\n");

    }
  if (dump_file)
    {
      fprintf (dump_file, "\nSMS transformation phase\n");
      fprintf (dump_file, "=========================\n\n");
    }

  /* We don't want to perform SMS on new loops - created by versioning.  */
  FOR_EACH_LOOP (li, loop, 0)
    {
      rtx head, tail;
      rtx count_reg, count_init;
      int mii, rec_mii;
      unsigned stage_count = 0;
      HOST_WIDEST_INT loop_count = 0;

      /* Only loops that survived the analysis phase have a DDG.  */
      if (! (g = g_arr[loop->num]))
        continue;

      if (dump_file)
        {
          rtx insn = BB_END (loop->header);

          fprintf (dump_file, "SMS loop num: %d, file: %s, line: %d\n",
                   loop->num, insn_file (insn), insn_line (insn));

          print_ddg (dump_file, g);
        }

      get_ebb_head_tail (loop->header, loop->header, &head, &tail);

      latch_edge = loop_latch_edge (loop);
      gcc_assert (single_exit (loop));
      if (single_exit (loop)->count)
        trip_count = latch_edge->count / single_exit (loop)->count;

      if (dump_file)
        {
          fprintf (dump_file, " %s %d (file, line)\n",
                   insn_file (tail), insn_line (tail));
          fprintf (dump_file, "SMS single-bb-loop\n");
          if (profile_info && flag_branch_probabilities)
            {
              fprintf (dump_file, "SMS loop-count ");
              fprintf (dump_file, HOST_WIDEST_INT_PRINT_DEC,
                       (HOST_WIDEST_INT) bb->count);
              fprintf (dump_file, "\n");
              fprintf (dump_file, "SMS profile-sum-max ");
              fprintf (dump_file, HOST_WIDEST_INT_PRINT_DEC,
                       (HOST_WIDEST_INT) profile_info->sum_max);
              fprintf (dump_file, "\n");
            }
          fprintf (dump_file, "SMS doloop\n");
          fprintf (dump_file, "SMS built-ddg %d\n", g->num_nodes);
          fprintf (dump_file, "SMS num-loads %d\n", g->num_loads);
          fprintf (dump_file, "SMS num-stores %d\n", g->num_stores);
        }


      /* If the loop has a doloop register it gets special handling:
         try to derive a compile-time constant iteration count from
         the preheader.  */
      count_init = NULL_RTX;
      if ((count_reg = doloop_register_get (head, tail)))
        {
          basic_block pre_header;

          pre_header = loop_preheader_edge (loop)->src;
          count_init = const_iteration_count (count_reg, pre_header,
                                              &loop_count);
        }
      gcc_assert (count_reg);

      if (dump_file && count_init)
        {
          fprintf (dump_file, "SMS const-doloop ");
          fprintf (dump_file, HOST_WIDEST_INT_PRINT_DEC,
                   loop_count);
          fprintf (dump_file, "\n");
        }

      node_order = XNEWVEC (int, g->num_nodes);

      mii = 1; /* Need to pass some estimate of mii.  */
      rec_mii = sms_order_nodes (g, mii, node_order, &max_asap);
      mii = MAX (res_MII (g), rec_mii);
      maxii = MAX (max_asap, MAXII_FACTOR * mii);

      if (dump_file)
        fprintf (dump_file, "SMS iis %d %d %d (rec_mii, mii, maxii)\n",
                 rec_mii, mii, maxii);

      /* After sms_order_nodes and before sms_schedule_by_order, to copy over
         ASAP.  */
      set_node_sched_params (g);

      ps = sms_schedule_by_order (g, mii, maxii, node_order);

      if (ps){
        stage_count = PS_STAGE_COUNT (ps);
        gcc_assert(stage_count >= 1);
      }

      /* Stage count of 1 means that there is no interleaving between
         iterations, let the scheduling passes do the job.  Also bail out
         when the (known or profiled) trip count is too small to cover
         the prolog/epilog peeling.  */
      if (stage_count <= 1
          || (count_init && (loop_count <= stage_count))
          || (flag_branch_probabilities && (trip_count <= stage_count)))
        {
          if (dump_file)
            {
              fprintf (dump_file, "SMS failed... \n");
              fprintf (dump_file, "SMS sched-failed (stage-count=%d, loop-count=", stage_count);
              fprintf (dump_file, HOST_WIDEST_INT_PRINT_DEC, loop_count);
              fprintf (dump_file, ", trip-count=");
              fprintf (dump_file, HOST_WIDEST_INT_PRINT_DEC, trip_count);
              fprintf (dump_file, ")\n");
            }
          continue;
        }
      else
        {
          struct undo_replace_buff_elem *reg_move_replaces;

          if (dump_file)
            {
              fprintf (dump_file,
                       "SMS succeeded %d %d (with ii, sc)\n", ps->ii,
                       stage_count);
              print_partial_schedule (ps, dump_file);
              fprintf (dump_file,
                       "SMS Branch (%d) will later be scheduled at cycle %d.\n",
                       g->closing_branch->cuid, PS_MIN_CYCLE (ps) - 1);
            }

          /* Set the stage boundaries.  If the DDG is built with closing_branch_deps,
             the closing_branch was scheduled and should appear in the last (ii-1)
             row.  Otherwise, we are free to schedule the branch, and we let nodes
             that were scheduled at the first PS_MIN_CYCLE cycle appear in the first
             row; this should reduce stage_count to minimum.
             TODO: Revisit the issue of scheduling the insns of the
             control part relative to the branch when the control part
             has more than one insn.  */
          normalize_sched_times (ps);
          rotate_partial_schedule (ps, PS_MIN_CYCLE (ps));
          set_columns_for_ps (ps);

          canon_loop (loop);

          /* If the BCT count is not known at compile time, do
             loop-versioning: guard the kernel with a runtime check
             count_reg > stage_count, keeping the original loop as the
             fallback version.  */
          if (count_reg && ! count_init)
            {
              rtx comp_rtx = gen_rtx_fmt_ee (GT, VOIDmode, count_reg,
                                             GEN_INT(stage_count));
              unsigned prob = (PROB_SMS_ENOUGH_ITERATIONS
                               * REG_BR_PROB_BASE) / 100;

              loop_version (loop, comp_rtx, &condition_bb,
                            prob, prob, REG_BR_PROB_BASE - prob,
                            true);
            }

          /* Set new iteration count of loop kernel: the peeled
             (stage_count - 1) iterations move into prolog/epilog.  */
          if (count_reg && count_init)
            SET_SRC (single_set (count_init)) = GEN_INT (loop_count
                                                         - stage_count + 1);

          /* Now apply the scheduled kernel to the RTL of the loop.  */
          permute_partial_schedule (ps, g->closing_branch->first_note);

          /* Mark this loop as software pipelined so the later
             scheduling passes don't touch it.  */
          if (! flag_resched_modulo_sched)
            g->bb->flags |= BB_DISABLE_SCHEDULE;
          /* The life-info is not valid any more.  */
          df_set_bb_dirty (g->bb);

          reg_move_replaces = generate_reg_moves (ps, true);
          if (dump_file)
            print_node_sched_params (dump_file, g->num_nodes, g);
          /* Generate prolog and epilog.  */
          generate_prolog_epilog (ps, loop, count_reg, count_init);

          free_undo_replace_buff (reg_move_replaces);
        }

      free_partial_schedule (ps);
      free (node_sched_params);
      free (node_order);
      free_ddg (g);
    }

  free (g_arr);

  /* Release scheduler data, needed until now because of DFA.  */
  haifa_sched_finish ();
  loop_optimizer_finalize ();
}

/* The SMS scheduling algorithm itself
   -----------------------------------
   Input: 'O' an ordered list of insns of a loop.
   Output: A scheduling of the loop - kernel, prolog, and epilogue.

   'Q' is the empty Set
   'PS' is the partial schedule; it holds the currently scheduled nodes with
        their cycle/slot.
   'PSP' previously scheduled predecessors.
   'PSS' previously scheduled successors.
   't(u)' the cycle where u is scheduled.
   'l(u)' is the latency of u.
   'd(v,u)' is the dependence distance from v to u.
   'ASAP(u)' the earliest time at which u could be scheduled as computed in
             the node ordering phase.
   'check_hardware_resources_conflicts(u, PS, c)'
                             run a trace around cycle/slot through DFA model
                             to check resource conflicts involving instruction u
                             at cycle c given the partial schedule PS.
   'add_to_partial_schedule_at_time(u, PS, c)'
                             Add the node/instruction u to the partial schedule
                             PS at time c.
   'calculate_register_pressure(PS)'
                             Given a schedule of instructions, calculate the register
                             pressure it implies.  One implementation could be the
                             maximum number of overlapping live ranges.
   'maxRP' The maximum allowed register pressure, it is usually derived from the number
           registers available in the hardware.

   1. II = MII.
   2. PS = empty list
   3. for each node u in O in pre-computed order
   4.   if (PSP(u) != Q && PSS(u) == Q) then
   5.     Early_start(u) = max ( t(v) + l(v) - d(v,u)*II ) over all every v in PSP(u).
   6.     start = Early_start; end = Early_start + II - 1; step = 1
   11.  else if (PSP(u) == Q && PSS(u) != Q) then
   12.      Late_start(u) = min ( t(v) - l(v) + d(v,u)*II ) over all every v in PSS(u).
   13.     start = Late_start; end = Late_start - II + 1; step = -1
   14.  else if (PSP(u) != Q && PSS(u) != Q) then
   15.     Early_start(u) = max ( t(v) + l(v) - d(v,u)*II ) over all every v in PSP(u).
   16.     Late_start(u) = min ( t(v) - l(v) + d(v,u)*II ) over all every v in PSS(u).
   17.     start = Early_start;
   18.     end = min(Early_start + II - 1 , Late_start);
   19.     step = 1
   20.   else "if (PSP(u) == Q && PSS(u) == Q)"
   21.     start = ASAP(u); end = start + II - 1; step = 1
   22.  endif

   23. success = false
   24. for (c = start ; c != end ; c += step)
   25.    if check_hardware_resources_conflicts(u, PS, c) then
   26.       add_to_partial_schedule_at_time(u, PS, c)
   27.       success = true
   28.       break
   29.    endif
   30. endfor
   31. if (success == false) then
   32.    II = II + 1
   33.    if (II > maxII) then
   34.       finish - failed to schedule
   35.      endif
   36.    goto 2.
   37. endif
   38. endfor
   39. if (calculate_register_pressure(PS) > maxRP) then
   40.    goto 32.
   41. endif
   42. compute epilogue & prologue
   43.
   finish - succeeded to schedule  */

/* A limit on the number of cycles that resource conflicts can span.  ??? Should
   be provided by DFA, and be dependent on the type of insn scheduled.  Currently
   set to 0 to save compile time.  */
#define DFA_HISTORY SMS_DFA_HISTORY

/* A threshold for the number of repeated unsuccessful attempts to insert
   an empty row, before we flush the partial schedule and start over.  */
#define MAX_SPLIT_NUM 10

/* Given the partial schedule PS, this function calculates and returns the
   cycles in which we can schedule the node with the given index I.
   NOTE: Here we do the backtracking in SMS, in some special cases.  We have
   noticed that there are several cases in which we fail to SMS the loop
   because the sched window of a node is empty due to tight data-deps.  In
   such cases we want to unschedule some of the predecessors/successors
   until we get non-empty scheduling window.  It returns -1 if the
   scheduling window is empty and zero otherwise.
   On success *START_P/*END_P/*STEP_P describe a half-open traversal
   window: cycles start, start+step, ..., up to but excluding end.  */

static int
get_sched_window (partial_schedule_ptr ps, int *nodes_order, int i,
                  sbitmap sched_nodes, int ii, int *start_p, int *step_p, int *end_p)
{
  int start, step, end;
  ddg_edge_ptr e;
  int u = nodes_order [i];
  ddg_node_ptr u_node = &ps->g->nodes[u];
  sbitmap psp = sbitmap_alloc (ps->g->num_nodes);
  sbitmap pss = sbitmap_alloc (ps->g->num_nodes);
  sbitmap u_node_preds = NODE_PREDECESSORS (u_node);
  sbitmap u_node_succs = NODE_SUCCESSORS (u_node);
  int psp_not_empty;
  int pss_not_empty;

  /* 1. compute sched window for u (start, end, step).  */
  sbitmap_zero (psp);
  sbitmap_zero (pss);
  /* PSP/PSS = already-scheduled predecessors/successors of u.  */
  psp_not_empty = sbitmap_a_and_b_cg (psp, u_node_preds, sched_nodes);
  pss_not_empty = sbitmap_a_and_b_cg (pss, u_node_succs, sched_nodes);

  /* Case 1: only predecessors are scheduled - search upwards from
     the earliest cycle permitted by them.  */
  if (psp_not_empty && !pss_not_empty)
    {
      int early_start = INT_MIN;

      end = INT_MAX;
      for (e = u_node->in; e != 0; e = e->next_in)
        {
          ddg_node_ptr v_node = e->src;

          if (dump_file)
            {
              fprintf (dump_file, "\nProcessing edge: ");
              print_ddg_edge (dump_file, e);
              fprintf (dump_file,
                       "\nScheduling %d (%d) in psp_not_empty,"
                       " checking p %d (%d): ", u_node->cuid,
                       INSN_UID (u_node->insn), v_node->cuid, INSN_UID
                       (v_node->insn));
            }

          if (TEST_BIT (sched_nodes, v_node->cuid))
            {
              int p_st = SCHED_TIME (v_node);

              early_start =
                MAX (early_start, p_st + e->latency - (e->distance * ii));

              if (dump_file)
                fprintf (dump_file,
                         "pred st = %d; early_start = %d; latency: %d",
                         p_st, early_start, e->latency);

              /* A memory dependence may not be reordered across more
                 than one iteration boundary.  */
              if (e->data_type == MEM_DEP)
                end = MIN (end, SCHED_TIME (v_node) + ii - 1);
            }
          else if (dump_file)
            fprintf (dump_file, "the node is not scheduled\n");
        }
      start = early_start;
      end = MIN (end, early_start + ii);
      /* Schedule the node close to its predecessors.  */
      step = 1;

      if (dump_file)
        fprintf (dump_file,
                 "\nScheduling %d (%d) in a window (%d..%d) with step %d\n",
                 u_node->cuid, INSN_UID (u_node->insn), start, end, step);
    }

  /* Case 2: only successors are scheduled - search downwards from
     the latest cycle permitted by them.  */
  else if (!psp_not_empty && pss_not_empty)
    {
      int late_start = INT_MAX;

      end = INT_MIN;
      for (e = u_node->out; e != 0; e = e->next_out)
        {
          ddg_node_ptr v_node = e->dest;

          if (dump_file)
            {
              fprintf (dump_file, "\nProcessing edge:");
              print_ddg_edge (dump_file, e);
              fprintf (dump_file,
                       "\nScheduling %d (%d) in pss_not_empty,"
                       " checking s %d (%d): ", u_node->cuid,
                       INSN_UID (u_node->insn), v_node->cuid, INSN_UID
                       (v_node->insn));
            }

          if (TEST_BIT (sched_nodes, v_node->cuid))
            {
              int s_st = SCHED_TIME (v_node);

              late_start = MIN (late_start,
                                s_st - e->latency + (e->distance * ii));

              if (dump_file)
                fprintf (dump_file,
                         "succ st = %d; late_start = %d; latency = %d",
                         s_st, late_start, e->latency);

              if (e->data_type == MEM_DEP)
                end = MAX (end, SCHED_TIME (v_node) - ii + 1);
              if (dump_file)
                fprintf (dump_file, "end = %d\n", end);

            }
          else if (dump_file)
            fprintf (dump_file, "the node is not scheduled\n");

        }
      start = late_start;
      end = MAX (end, late_start - ii);
      /* Schedule the node close to its successors.  */
      step = -1;

      if (dump_file)
        fprintf (dump_file,
                 "\nScheduling %d (%d) in a window (%d..%d) with step %d\n",
                 u_node->cuid, INSN_UID (u_node->insn), start, end, step);

    }

  /* Case 3: both predecessors and successors are scheduled - the
     window is bounded on both sides.  */
  else if (psp_not_empty && pss_not_empty)
    {
      int early_start = INT_MIN;
      int late_start = INT_MAX;
      int count_preds = 0;
      int count_succs = 0;

      start = INT_MIN;
      end = INT_MAX;
      for (e = u_node->in; e != 0; e = e->next_in)
        {
          ddg_node_ptr v_node = e->src;

          if (dump_file)
            {
              fprintf (dump_file, "\nProcessing edge:");
              print_ddg_edge (dump_file, e);
              fprintf (dump_file,
                       "\nScheduling %d (%d) in psp_pss_not_empty,"
                       " checking p %d (%d): ", u_node->cuid, INSN_UID
                       (u_node->insn), v_node->cuid, INSN_UID
                       (v_node->insn));
            }

          if (TEST_BIT (sched_nodes, v_node->cuid))
            {
              int p_st = SCHED_TIME (v_node);

              early_start = MAX (early_start,
                                 p_st + e->latency
                                 - (e->distance * ii));

              if (dump_file)
                fprintf (dump_file,
                         "pred st = %d; early_start = %d; latency = %d",
                         p_st, early_start, e->latency);

              /* Count true register dependences to decide later whether
                 to place u near its preds or its succs.  */
              if (e->type == TRUE_DEP && e->data_type == REG_DEP)
                count_preds++;

              if (e->data_type == MEM_DEP)
                end = MIN (end, SCHED_TIME (v_node) + ii - 1);
            }
          else if (dump_file)
            fprintf (dump_file, "the node is not scheduled\n");

        }
      for (e = u_node->out; e != 0; e = e->next_out)
        {
          ddg_node_ptr v_node = e->dest;

          if (dump_file)
            {
              fprintf (dump_file, "\nProcessing edge:");
              print_ddg_edge (dump_file, e);
              fprintf (dump_file,
                       "\nScheduling %d (%d) in psp_pss_not_empty,"
                       " checking s %d (%d): ", u_node->cuid, INSN_UID
                       (u_node->insn), v_node->cuid, INSN_UID
                       (v_node->insn));
            }

          if (TEST_BIT (sched_nodes, v_node->cuid))
            {
              int s_st = SCHED_TIME (v_node);

              late_start = MIN (late_start,
                                s_st - e->latency
                                + (e->distance * ii));

              if (dump_file)
                fprintf (dump_file,
                         "succ st = %d; late_start = %d; latency = %d",
                         s_st, late_start, e->latency);

              if (e->type == TRUE_DEP && e->data_type == REG_DEP)
                count_succs++;

              if (e->data_type == MEM_DEP)
                start = MAX (start, SCHED_TIME (v_node) - ii + 1);
            }
          else if (dump_file)
            fprintf (dump_file, "the node is not scheduled\n");

        }
      start = MAX (start, early_start);
      end = MIN (end, MIN (early_start + ii, late_start + 1));
      step = 1;
      /* If there are more successors than predecessors schedule the
         node close to its successors: traverse the window backwards.  */
      if (count_succs >= count_preds)
        {
          int old_start = start;

          start = end - 1;
          end = old_start - 1;
          step = -1;
        }
    }
  else /* psp is empty && pss is empty.  */
    {
      start = SCHED_ASAP (u_node);
      end = start + ii;
      step = 1;
    }

  *start_p = start;
  *step_p = step;
  *end_p = end;
  sbitmap_free (psp);
  sbitmap_free (pss);

  /* An inverted (or degenerate) window means no cycle can hold u.  */
  if ((start >= end && step == 1) || (start <= end && step == -1))
    {
      if (dump_file)
        fprintf (dump_file, "\nEmpty window: start=%d, end=%d, step=%d\n",
                 start, end, step);
      return -1;
    }

  return 0;
}

/* Calculate MUST_PRECEDE/MUST_FOLLOW bitmaps of U_NODE; which is the
   node currently been scheduled.  At the end of the calculation
   MUST_PRECEDE/MUST_FOLLOW contains all predecessors/successors of
   U_NODE which are (1) already scheduled in the first/last row of
   U_NODE's scheduling window, (2) whose dependence inequality with U
   becomes an equality when U is scheduled in this same row, and (3)
   whose dependence latency is zero.

   The first and last rows are calculated using the following parameters:
   START/END rows - The cycles that begins/ends the traversal on the window;
   searching for an empty cycle to schedule U_NODE.
   STEP - The direction in which we traverse the window.
   II - The initiation interval.  */

static void
calculate_must_precede_follow (ddg_node_ptr u_node, int start, int end,
                               int step, int ii, sbitmap sched_nodes,
                               sbitmap must_precede, sbitmap must_follow)
{
  ddg_edge_ptr e;
  int first_cycle_in_window, last_cycle_in_window;

  gcc_assert (must_precede && must_follow);

  /* Consider the following scheduling window:
     {first_cycle_in_window, first_cycle_in_window+1, ...,
     last_cycle_in_window}.  If step is 1 then the following will be
     the order we traverse the window: {start=first_cycle_in_window,
     first_cycle_in_window+1, ..., end=last_cycle_in_window+1},
     or {start=last_cycle_in_window, last_cycle_in_window-1, ...,
     end=first_cycle_in_window-1} if step is -1.  */
  first_cycle_in_window = (step == 1) ? start : end - step;
  last_cycle_in_window = (step == 1) ? end - step : start;

  sbitmap_zero (must_precede);
  sbitmap_zero (must_follow);

  if (dump_file)
    fprintf (dump_file, "\nmust_precede: ");

  /* Instead of checking if:
      (SMODULO (SCHED_TIME (e->src), ii) == first_row_in_window)
      && ((SCHED_TIME (e->src) + e->latency - (e->distance * ii)) ==
             first_cycle_in_window)
      && e->latency == 0
     we use the fact that latency is non-negative:
      SCHED_TIME (e->src) - (e->distance * ii) <=
      SCHED_TIME (e->src) + e->latency - (e->distance * ii)) <=
      first_cycle_in_window
     and check only if
      SCHED_TIME (e->src) - (e->distance * ii) == first_cycle_in_window  */
  for (e = u_node->in; e != 0; e = e->next_in)
    if (TEST_BIT (sched_nodes, e->src->cuid)
        && ((SCHED_TIME (e->src) - (e->distance * ii)) ==
             first_cycle_in_window))
      {
        if (dump_file)
          fprintf (dump_file, "%d ", e->src->cuid);

        SET_BIT (must_precede, e->src->cuid);
      }

  if (dump_file)
    fprintf (dump_file, "\nmust_follow: ");

  /* Instead of checking if:
      (SMODULO (SCHED_TIME (e->dest), ii) == last_row_in_window)
      && ((SCHED_TIME (e->dest) - e->latency + (e->distance * ii)) ==
             last_cycle_in_window)
      && e->latency == 0
     we use the fact that latency is non-negative:
      SCHED_TIME (e->dest) + (e->distance * ii) >=
      SCHED_TIME (e->dest) - e->latency + (e->distance * ii)) >=
      last_cycle_in_window
     and check only if
      SCHED_TIME (e->dest) + (e->distance * ii) == last_cycle_in_window  */
  for (e = u_node->out; e != 0; e = e->next_out)
    if (TEST_BIT (sched_nodes, e->dest->cuid)
        && ((SCHED_TIME (e->dest) + (e->distance * ii)) ==
             last_cycle_in_window))
      {
        if (dump_file)
          fprintf (dump_file, "%d ", e->dest->cuid);

        SET_BIT (must_follow, e->dest->cuid);
      }

  if (dump_file)
    fprintf (dump_file, "\n");
}

/* Return 1 if U_NODE can be scheduled in CYCLE.
   Use the following
   parameters to decide if that's possible:
   PS - The partial schedule.
   U - The serial number of U_NODE.
   NUM_SPLITS - The number of row splits made so far.
   MUST_PRECEDE - The nodes that must precede U_NODE.  (only valid at
   the first row of the scheduling window)
   MUST_FOLLOW - The nodes that must follow U_NODE.  (only valid at the
   last row of the scheduling window)  */

static bool
try_scheduling_node_in_cycle (partial_schedule_ptr ps, ddg_node_ptr u_node,
                              int u, int cycle, sbitmap sched_nodes,
                              int *num_splits, sbitmap must_precede,
                              sbitmap must_follow)
{
  ps_insn_ptr psi;
  bool success = 0;

  verify_partial_schedule (ps, sched_nodes);
  psi = ps_add_node_check_conflicts (ps, u_node, cycle,
                                     must_precede, must_follow);
  /* Non-NULL PSI means the node was placed without resource or
     ordering conflicts; commit its schedule time and reset the
     row-split counter.  */
  if (psi)
    {
      SCHED_TIME (u_node) = cycle;
      SET_BIT (sched_nodes, u);
      success = 1;
      *num_splits = 0;
      if (dump_file)
        fprintf (dump_file, "Scheduled w/o split in %d\n", cycle);

    }

  return success;
}

/* This function implements the scheduling algorithm for SMS according to the
   above algorithm.  Returns the resulting partial schedule, or NULL when
   no schedule was found with II < MAXII.  */
static partial_schedule_ptr
sms_schedule_by_order (ddg_ptr g, int mii, int maxii, int *nodes_order)
{
  int ii = mii;
  int i, c, success, num_splits = 0;
  int flush_and_start_over = true;
  int num_nodes = g->num_nodes;
  int start, end, step; /* Place together into one struct?  */
  sbitmap sched_nodes = sbitmap_alloc (num_nodes);
  sbitmap must_precede = sbitmap_alloc (num_nodes);
  sbitmap must_follow = sbitmap_alloc (num_nodes);
  sbitmap tobe_scheduled = sbitmap_alloc (num_nodes);

  partial_schedule_ptr ps = create_partial_schedule (ii, g, DFA_HISTORY);

  sbitmap_ones (tobe_scheduled);
  sbitmap_zero (sched_nodes);

  while (flush_and_start_over && (ii < maxii))
    {

      if (dump_file)
        fprintf (dump_file, "Starting with ii=%d\n", ii);
      flush_and_start_over = false;
      sbitmap_zero (sched_nodes);

      for (i = 0; i < num_nodes; i++)
        {
          int u = nodes_order[i];
          ddg_node_ptr u_node = &ps->g->nodes[u];
          rtx insn = u_node->insn;

          /* Debug insns and the closing branch do not participate in
             the modulo schedule; drop them from the coverage check.  */
          if (!NONDEBUG_INSN_P (insn))
            {
              RESET_BIT (tobe_scheduled, u);
              continue;
            }

          if (JUMP_P (insn)) /* Closing branch handled later.  */
            {
              RESET_BIT (tobe_scheduled, u);
              continue;
            }

          if (TEST_BIT (sched_nodes, u))
            continue;

          /* Try to get non-empty scheduling window.  */
          success = 0;
          if (get_sched_window (ps, nodes_order, i, sched_nodes, ii, &start,
                                &step, &end) == 0)
            {
              if (dump_file)
                fprintf (dump_file, "\nTrying to schedule node %d \
                INSN = %d  in (%d .. %d) step %d\n", u, (INSN_UID
                        (g->nodes[u].insn)), start, end, step);

              gcc_assert ((step > 0 && start < end)
                          || (step < 0 && start > end));

              calculate_must_precede_follow (u_node, start, end, step, ii,
                                             sched_nodes, must_precede,
                                             must_follow);

              for (c = start; c != end; c += step)
                {
                  /* MUST_PRECEDE only constrains the first row of the
                     window, MUST_FOLLOW only the last one.  */
                  sbitmap tmp_precede = NULL;
                  sbitmap tmp_follow = NULL;

                  if (c == start)
                    {
                      if (step == 1)
                        tmp_precede = must_precede;
                      else /* step == -1.  */
                        tmp_follow = must_follow;
                    }
                  if (c == end - step)
                    {
                      if (step == 1)
                        tmp_follow = must_follow;
                      else /* step == -1.  */
                        tmp_precede = must_precede;
                    }

                  success =
                    try_scheduling_node_in_cycle (ps, u_node, u, c,
                                                  sched_nodes,
                                                  &num_splits, tmp_precede,
                                                  tmp_follow);
                  if (success)
                    break;
                }

              verify_partial_schedule (ps, sched_nodes);
            }
          if (!success)
            {
              int split_row;

              if (ii++ == maxii)
                break;

              /* Too many consecutive failed splits: flush the partial
                 schedule and restart with the incremented II.  */
              if (num_splits >= MAX_SPLIT_NUM)
                {
                  num_splits = 0;
                  flush_and_start_over = true;
                  verify_partial_schedule (ps, sched_nodes);
                  reset_partial_schedule (ps, ii);
                  verify_partial_schedule (ps, sched_nodes);
                  break;
                }

              num_splits++;
              /* The scheduling window is exclusive of 'end'
                 whereas compute_split_window() expects an inclusive,
                 ordered range.  */
              if (step == 1)
                split_row = compute_split_row (sched_nodes, start, end - 1,
                                               ps->ii, u_node);
              else
                split_row = compute_split_row (sched_nodes, end + 1, start,
                                               ps->ii, u_node);

              ps_insert_empty_row (ps, split_row, sched_nodes);
              i--;              /* Go back and retry node i.  */

              if (dump_file)
                fprintf (dump_file, "num_splits=%d\n", num_splits);
            }

          /* ??? If (success), check register pressure estimates.  */
        }                       /* Continue with next node.  */
    }                           /* While flush_and_start_over.  */
  if (ii >= maxii)
    {
      free_partial_schedule (ps);
      ps = NULL;
    }
  else
    gcc_assert (sbitmap_equal (tobe_scheduled, sched_nodes));

  sbitmap_free (sched_nodes);
  sbitmap_free (must_precede);
  sbitmap_free (must_follow);
  sbitmap_free (tobe_scheduled);

  return ps;
}

/* This function inserts a new empty row into PS at the position
   according to SPLITROW, keeping all already scheduled instructions
   intact and updating their SCHED_TIME and cycle accordingly.
 */
static void
ps_insert_empty_row (partial_schedule_ptr ps, int split_row,
                     sbitmap sched_nodes)
{
  ps_insn_ptr crr_insn;
  ps_insn_ptr *rows_new;
  int ii = ps->ii;
  int new_ii = ii + 1;
  int row;

  verify_partial_schedule (ps, sched_nodes);

  /* We normalize sched_time and rotate ps to have only non-negative sched
     times, for simplicity of updating cycles after inserting new row.  */
  split_row -= ps->min_cycle;
  split_row = SMODULO (split_row, ii);
  if (dump_file)
    fprintf (dump_file, "split_row=%d\n", split_row);

  normalize_sched_times (ps);
  rotate_partial_schedule (ps, ps->min_cycle);

  /* Rebuild the row array with one extra (empty) row at SPLIT_ROW.
     Rows before the split keep their index; rows at or after it shift
     down by one.  Every insn's absolute cycle grows by the number of
     split rows it has passed (SCHED_TIME / ii full wraps, plus one for
     rows at or after the split).  */
  rows_new = (ps_insn_ptr *) xcalloc (new_ii, sizeof (ps_insn_ptr));
  for (row = 0; row < split_row; row++)
    {
      rows_new[row] = ps->rows[row];
      ps->rows[row] = NULL;
      for (crr_insn = rows_new[row];
           crr_insn; crr_insn = crr_insn->next_in_row)
        {
          ddg_node_ptr u = crr_insn->node;
          int new_time = SCHED_TIME (u) + (SCHED_TIME (u) / ii);

          SCHED_TIME (u) = new_time;
          crr_insn->cycle = new_time;
          SCHED_ROW (u) = new_time % new_ii;
          SCHED_STAGE (u) = new_time / new_ii;
        }

    }

  rows_new[split_row] = NULL;

  for (row = split_row; row < ii; row++)
    {
      rows_new[row + 1] = ps->rows[row];
      ps->rows[row] = NULL;
      for (crr_insn = rows_new[row + 1];
           crr_insn; crr_insn = crr_insn->next_in_row)
        {
          ddg_node_ptr u = crr_insn->node;
          int new_time = SCHED_TIME (u) + (SCHED_TIME (u) / ii) + 1;

          SCHED_TIME (u) = new_time;
          crr_insn->cycle = new_time;
          SCHED_ROW (u) = new_time % new_ii;
          SCHED_STAGE (u) = new_time / new_ii;
        }
    }

  /* Updating ps.  */
  ps->min_cycle = ps->min_cycle + ps->min_cycle / ii
    + (SMODULO (ps->min_cycle, ii) >= split_row ? 1 : 0);
  ps->max_cycle = ps->max_cycle + ps->max_cycle / ii
    + (SMODULO (ps->max_cycle, ii) >= split_row ? 1 : 0);
  free (ps->rows);
  ps->rows = rows_new;
  ps->ii = new_ii;
  gcc_assert (ps->min_cycle >= 0);

  verify_partial_schedule (ps, sched_nodes);

  if (dump_file)
    fprintf (dump_file, "min_cycle=%d, max_cycle=%d\n", ps->min_cycle,
             ps->max_cycle);
}

/* Given U_NODE which is the node that failed to be scheduled; LOW and
   UP which are the boundaries of its scheduling window; compute using
   SCHED_NODES and II a row in the partial schedule that can be split
   which will separate a critical predecessor from a critical successor
   thereby expanding the window, and return it.  */
static int
compute_split_row (sbitmap sched_nodes, int low, int up, int ii,
                   ddg_node_ptr u_node)
{
  ddg_edge_ptr e;
  int lower = INT_MIN, upper = INT_MAX;
  ddg_node_ptr crit_pred = NULL;
  ddg_node_ptr crit_succ = NULL;
  int crit_cycle;

  /* Find the latest-scheduled predecessor whose dependence makes the
     window's lower bound tight.  */
  for (e = u_node->in; e != 0; e = e->next_in)
    {
      ddg_node_ptr v_node = e->src;

      if (TEST_BIT (sched_nodes, v_node->cuid)
          && (low == SCHED_TIME (v_node) + e->latency - (e->distance * ii)))
        if (SCHED_TIME (v_node) > lower)
          {
            crit_pred = v_node;
            lower = SCHED_TIME (v_node);
          }
    }

  if (crit_pred != NULL)
    {
      crit_cycle = SCHED_TIME (crit_pred) + 1;
      return SMODULO (crit_cycle, ii);
    }

  /* Otherwise find the earliest-scheduled successor whose dependence
     makes the window's upper bound tight.  */
  for (e = u_node->out; e != 0; e = e->next_out)
    {
      ddg_node_ptr v_node = e->dest;
      if (TEST_BIT (sched_nodes, v_node->cuid)
          && (up == SCHED_TIME (v_node) - e->latency + (e->distance * ii)))
        if (SCHED_TIME (v_node) < upper)
          {
            crit_succ = v_node;
            upper = SCHED_TIME (v_node);
          }
    }

  if (crit_succ != NULL)
    {
      crit_cycle = SCHED_TIME (crit_succ);
      return SMODULO (crit_cycle, ii);
    }

  if (dump_file)
    fprintf (dump_file, "Both crit_pred and crit_succ are NULL\n");

  /* No tight dependence found - split at the middle of the window.  */
  return SMODULO ((low + up + 1) / 2, ii);
}

/* Sanity check: every insn in PS must be marked scheduled and its time
   must lie within [min_cycle, max_cycle].  */
static void
verify_partial_schedule (partial_schedule_ptr ps, sbitmap sched_nodes)
{
  int row;
  ps_insn_ptr crr_insn;

  for (row = 0; row < ps->ii; row++)
    for (crr_insn = ps->rows[row]; crr_insn; crr_insn = crr_insn->next_in_row)
      {
        ddg_node_ptr u = crr_insn->node;

        gcc_assert (TEST_BIT (sched_nodes, u->cuid));
        /* ??? Test also that all nodes of sched_nodes are in ps, perhaps by
           popcount (sched_nodes) == number of insns in ps.  */
        gcc_assert (SCHED_TIME (u) >= ps->min_cycle);
        gcc_assert (SCHED_TIME (u) <= ps->max_cycle);
      }
}


/* This page implements the algorithm for ordering the nodes of a DDG
   for modulo scheduling, activated through the
   "int sms_order_nodes (ddg_ptr, int mii, int * result)" API.  */

/* Per-node ordering parameters, stored in the node's aux.info field.  */
#define ORDER_PARAMS(x) ((struct node_order_params *) (x)->aux.info)
#define ASAP(x) (ORDER_PARAMS ((x))->asap)
#define ALAP(x) (ORDER_PARAMS ((x))->alap)
#define HEIGHT(x) (ORDER_PARAMS ((x))->height)
/* Mobility: slack between the earliest and latest feasible cycles.  */
#define MOB(x) (ALAP ((x)) - ASAP ((x)))
#define DEPTH(x) (ASAP ((x)))

typedef struct node_order_params * nopa;

static void order_nodes_of_sccs (ddg_all_sccs_ptr, int * result);
static int order_nodes_in_scc (ddg_ptr, sbitmap, sbitmap, int*, int);
static nopa  calculate_order_params (ddg_ptr, int, int *);
static int find_max_asap (ddg_ptr, sbitmap);
static int find_max_hv_min_mob (ddg_ptr, sbitmap);
static int find_max_dv_min_mob (ddg_ptr, sbitmap);

enum sms_direction {BOTTOMUP, TOPDOWN};

struct node_order_params
{
  int asap;
  int alap;
  int height;
};

/* Check if NODE_ORDER contains a permutation of 0 .. NUM_NODES-1.
*/
static void
check_nodes_order (int *node_order, int num_nodes)
{
  int i;
  sbitmap tmp = sbitmap_alloc (num_nodes);

  sbitmap_zero (tmp);

  if (dump_file)
    fprintf (dump_file, "SMS final nodes order: \n");

  for (i = 0; i < num_nodes; i++)
    {
      int u = node_order[i];

      if (dump_file)
        fprintf (dump_file, "%d ", u);
      /* Each index must be in range and must not repeat.  */
      gcc_assert (u < num_nodes && u >= 0 && !TEST_BIT (tmp, u));

      SET_BIT (tmp, u);
    }

  if (dump_file)
    fprintf (dump_file, "\n");

  sbitmap_free (tmp);
}

/* Order the nodes of G for scheduling and pass the result in
   NODE_ORDER.  Also set aux.count of each node to ASAP.
   Put maximal ASAP to PMAX_ASAP.  Return the recMII for the given DDG.  */
static int
sms_order_nodes (ddg_ptr g, int mii, int * node_order, int *pmax_asap)
{
  int i;
  int rec_mii = 0;
  ddg_all_sccs_ptr sccs = create_ddg_all_sccs (g);

  nopa nops = calculate_order_params (g, mii, pmax_asap);

  if (dump_file)
    print_sccs (dump_file, sccs, g);

  order_nodes_of_sccs (sccs, node_order);

  if (sccs->num_sccs > 0)
    /* First SCC has the largest recurrence_length.  */
    rec_mii = sccs->sccs[0]->recurrence_length;

  /* Save ASAP before destroying node_order_params.  */
  for (i = 0; i < g->num_nodes; i++)
    {
      ddg_node_ptr v = &g->nodes[i];
      v->aux.count = ASAP (v);
    }

  free (nops);
  free_ddg_all_sccs (sccs);
  check_nodes_order (node_order, g->num_nodes);

  return rec_mii;
}

/* Order the nodes of ALL_SCCS for scheduling, placing the result in
   NODE_ORDER.  SCCs are handled from the most critical (highest recMII)
   down; each is extended with the nodes lying on paths to and from the
   previously ordered SCCs before being ordered itself.  */
static void
order_nodes_of_sccs (ddg_all_sccs_ptr all_sccs, int * node_order)
{
  int i, pos = 0;
  ddg_ptr g = all_sccs->ddg;
  int num_nodes = g->num_nodes;
  sbitmap prev_sccs = sbitmap_alloc (num_nodes);
  sbitmap on_path = sbitmap_alloc (num_nodes);
  sbitmap tmp = sbitmap_alloc (num_nodes);
  sbitmap ones = sbitmap_alloc (num_nodes);

  sbitmap_zero (prev_sccs);
  sbitmap_ones (ones);

  /* Perform the node ordering starting from the SCC with the highest recMII.
     For each SCC order the nodes according to their ASAP/ALAP/HEIGHT etc.  */
  for (i = 0; i < all_sccs->num_sccs; i++)
    {
      ddg_scc_ptr scc = all_sccs->sccs[i];

      /* Add nodes on paths from previous SCCs to the current SCC.  */
      find_nodes_on_paths (on_path, g, prev_sccs, scc->nodes);
      sbitmap_a_or_b (tmp, scc->nodes, on_path);

      /* Add nodes on paths from the current SCC to previous SCCs.  */
      find_nodes_on_paths (on_path, g, scc->nodes, prev_sccs);
      sbitmap_a_or_b (tmp, tmp, on_path);

      /* Remove nodes of previous SCCs from current extended SCC.  */
      sbitmap_difference (tmp, tmp, prev_sccs);

      pos = order_nodes_in_scc (g, prev_sccs, tmp, node_order, pos);
      /* Above call to order_nodes_in_scc updated prev_sccs |= tmp.  */
    }

  /* Handle the remaining nodes that do not belong to any scc.  Each call
     to order_nodes_in_scc handles a single connected component.  */
  while (pos < g->num_nodes)
    {
      sbitmap_difference (tmp, ones, prev_sccs);
      pos = order_nodes_in_scc (g, prev_sccs, tmp, node_order, pos);
    }
  sbitmap_free (prev_sccs);
  sbitmap_free (on_path);
  sbitmap_free (tmp);
  sbitmap_free (ones);
}

/* Compute ASAP/ALAP/HEIGHT for every node of G and return the freshly
   allocated array of node_order_params (caller frees); store the maximal
   ASAP in *PMAX_ASAP.  MII is needed if we consider backarcs (that do
   not close recursive cycles).  */
static struct node_order_params *
calculate_order_params (ddg_ptr g, int mii ATTRIBUTE_UNUSED, int *pmax_asap)
{
  int u;
  int max_asap;
  int num_nodes = g->num_nodes;
  ddg_edge_ptr e;
  /* Allocate a place to hold ordering params for each node in the DDG.  */
  nopa node_order_params_arr;

  /* Initialize of ASAP/ALAP/HEIGHT to zero.  */
  node_order_params_arr = (nopa) xcalloc (num_nodes,
                                          sizeof (struct node_order_params));

  /* Set the aux pointer of each node to point to its order_params structure.  */
  for (u = 0; u < num_nodes; u++)
    g->nodes[u].aux.info = &node_order_params_arr[u];

  /* Disregarding a backarc from each recursive cycle to obtain a DAG,
     calculate ASAP, ALAP, mobility, distance, and height for each node
     in the dependence (direct acyclic) graph.  */

  /* We assume that the nodes in the array are in topological order.
*/

  max_asap = 0;
  /* Forward pass: ASAP is the longest zero-distance path from a source.  */
  for (u = 0; u < num_nodes; u++)
    {
      ddg_node_ptr u_node = &g->nodes[u];

      ASAP (u_node) = 0;
      for (e = u_node->in; e; e = e->next_in)
        if (e->distance == 0)
          ASAP (u_node) = MAX (ASAP (u_node),
                               ASAP (e->src) + e->latency);
      max_asap = MAX (max_asap, ASAP (u_node));
    }

  /* Backward pass (reverse topological order): compute ALAP and HEIGHT.  */
  for (u = num_nodes - 1; u > -1; u--)
    {
      ddg_node_ptr u_node = &g->nodes[u];

      ALAP (u_node) = max_asap;
      HEIGHT (u_node) = 0;
      for (e = u_node->out; e; e = e->next_out)
        if (e->distance == 0)
          {
            ALAP (u_node) = MIN (ALAP (u_node),
                                 ALAP (e->dest) - e->latency);
            HEIGHT (u_node) = MAX (HEIGHT (u_node),
                                   HEIGHT (e->dest) + e->latency);
          }
    }
  if (dump_file)
    {
      fprintf (dump_file, "\nOrder params\n");
      for (u = 0; u < num_nodes; u++)
        {
          ddg_node_ptr u_node = &g->nodes[u];

          fprintf (dump_file, "node %d, ASAP: %d, ALAP: %d, HEIGHT: %d\n", u,
                   ASAP (u_node), ALAP (u_node), HEIGHT (u_node));
        }
    }

  *pmax_asap = max_asap;
  return node_order_params_arr;
}

/* Return the index of the node in NODES with maximal ASAP,
   or -1 if NODES is empty.  */
static int
find_max_asap (ddg_ptr g, sbitmap nodes)
{
  unsigned int u = 0;
  int max_asap = -1;
  int result = -1;
  sbitmap_iterator sbi;

  EXECUTE_IF_SET_IN_SBITMAP (nodes, 0, u, sbi)
    {
      ddg_node_ptr u_node = &g->nodes[u];

      if (max_asap < ASAP (u_node))
        {
          max_asap = ASAP (u_node);
          result = u;
        }
    }
  return result;
}

/* Return the index of the node in NODES with maximal HEIGHT, breaking
   ties by minimal mobility; -1 if NODES is empty.  */
static int
find_max_hv_min_mob (ddg_ptr g, sbitmap nodes)
{
  unsigned int u = 0;
  int max_hv = -1;
  int min_mob = INT_MAX;
  int result = -1;
  sbitmap_iterator sbi;

  EXECUTE_IF_SET_IN_SBITMAP (nodes, 0, u, sbi)
    {
      ddg_node_ptr u_node = &g->nodes[u];

      if (max_hv < HEIGHT (u_node))
        {
          max_hv = HEIGHT (u_node);
          min_mob = MOB (u_node);
          result = u;
        }
      else if ((max_hv == HEIGHT (u_node))
               && (min_mob > MOB (u_node)))
        {
          min_mob = MOB (u_node);
          result = u;
        }
    }
  return result;
}

/* Return the index of the node in NODES with maximal DEPTH, breaking
   ties by minimal mobility; -1 if NODES is empty.  */
static int
find_max_dv_min_mob (ddg_ptr g, sbitmap nodes)
{
  unsigned int u = 0;
  int max_dv = -1;
  int min_mob = INT_MAX;
  int result = -1;
  sbitmap_iterator sbi;

  EXECUTE_IF_SET_IN_SBITMAP (nodes, 0, u, sbi)
    {
      ddg_node_ptr u_node = &g->nodes[u];

      if (max_dv < DEPTH (u_node))
        {
          max_dv = DEPTH (u_node);
          min_mob = MOB (u_node);
          result = u;
        }
      else if ((max_dv == DEPTH (u_node))
               && (min_mob > MOB (u_node)))
        {
          min_mob = MOB (u_node);
          result = u;
        }
    }
  return result;
}

/* Places the nodes of SCC into the NODE_ORDER array starting
   at position POS, according to the SMS ordering algorithm.
   NODES_ORDERED (in&out parameter) holds the bitset of all nodes in
   the NODE_ORDER array, starting from position zero.  */
static int
order_nodes_in_scc (ddg_ptr g, sbitmap nodes_ordered, sbitmap scc,
                    int * node_order, int pos)
{
  enum sms_direction dir;
  int num_nodes = g->num_nodes;
  sbitmap workset = sbitmap_alloc (num_nodes);
  sbitmap tmp = sbitmap_alloc (num_nodes);
  sbitmap zero_bitmap = sbitmap_alloc (num_nodes);
  sbitmap predecessors = sbitmap_alloc (num_nodes);
  sbitmap successors = sbitmap_alloc (num_nodes);

  sbitmap_zero (predecessors);
  find_predecessors (predecessors, g, nodes_ordered);

  sbitmap_zero (successors);
  find_successors (successors, g, nodes_ordered);

  sbitmap_zero (tmp);
  /* Seed the workset, preferring nodes adjacent to already ordered ones,
     which also fixes the initial sweep direction.  */
  if (sbitmap_a_and_b_cg (tmp, predecessors, scc))
    {
      sbitmap_copy (workset, tmp);
      dir = BOTTOMUP;
    }
  else if (sbitmap_a_and_b_cg (tmp, successors, scc))
    {
      sbitmap_copy (workset, tmp);
      dir = TOPDOWN;
    }
  else
    {
      int u;

      sbitmap_zero (workset);
      if ((u = find_max_asap (g, scc)) >= 0)
        SET_BIT (workset, u);
      dir = BOTTOMUP;
    }

  sbitmap_zero (zero_bitmap);
  /* Alternate top-down and bottom-up sweeps until the whole SCC is
     ordered.  */
  while (!sbitmap_equal (workset, zero_bitmap))
    {
      int v;
      ddg_node_ptr v_node;
      sbitmap v_node_preds;
      sbitmap v_node_succs;

      if (dir == TOPDOWN)
        {
          while (!sbitmap_equal (workset, zero_bitmap))
            {
              v = find_max_hv_min_mob (g, workset);
              v_node = &g->nodes[v];
              node_order[pos++] = v;
              v_node_succs = NODE_SUCCESSORS (v_node);
              sbitmap_a_and_b (tmp, v_node_succs, scc);

              /* Don't consider the already ordered successors again.
*/
              sbitmap_difference (tmp, tmp, nodes_ordered);
              sbitmap_a_or_b (workset, workset, tmp);
              RESET_BIT (workset, v);
              SET_BIT (nodes_ordered, v);
            }
          dir = BOTTOMUP;
          sbitmap_zero (predecessors);
          find_predecessors (predecessors, g, nodes_ordered);
          sbitmap_a_and_b (workset, predecessors, scc);
        }
      else
        {
          while (!sbitmap_equal (workset, zero_bitmap))
            {
              v = find_max_dv_min_mob (g, workset);
              v_node = &g->nodes[v];
              node_order[pos++] = v;
              v_node_preds = NODE_PREDECESSORS (v_node);
              sbitmap_a_and_b (tmp, v_node_preds, scc);

              /* Don't consider the already ordered predecessors again.  */
              sbitmap_difference (tmp, tmp, nodes_ordered);
              sbitmap_a_or_b (workset, workset, tmp);
              RESET_BIT (workset, v);
              SET_BIT (nodes_ordered, v);
            }
          dir = TOPDOWN;
          sbitmap_zero (successors);
          find_successors (successors, g, nodes_ordered);
          sbitmap_a_and_b (workset, successors, scc);
        }
    }
  sbitmap_free (tmp);
  sbitmap_free (workset);
  sbitmap_free (zero_bitmap);
  sbitmap_free (predecessors);
  sbitmap_free (successors);
  return pos;
}


/* This page contains functions for manipulating partial-schedules during
   modulo scheduling.  */

/* Create a partial schedule and allocate a memory to hold II rows.  */

static partial_schedule_ptr
create_partial_schedule (int ii, ddg_ptr g, int history)
{
  partial_schedule_ptr ps = XNEW (struct partial_schedule);
  ps->rows = (ps_insn_ptr *) xcalloc (ii, sizeof (ps_insn_ptr));
  ps->ii = ii;
  ps->history = history;
  /* Empty schedule: bounds form an empty range until an insn is added.  */
  ps->min_cycle = INT_MAX;
  ps->max_cycle = INT_MIN;
  ps->g = g;

  return ps;
}

/* Free the PS_INSNs in rows array of the given partial schedule.
   ??? Consider caching the PS_INSN's.  */
static void
free_ps_insns (partial_schedule_ptr ps)
{
  int i;

  for (i = 0; i < ps->ii; i++)
    {
      while (ps->rows[i])
        {
          ps_insn_ptr ps_insn = ps->rows[i]->next_in_row;

          free (ps->rows[i]);
          ps->rows[i] = ps_insn;
        }
      ps->rows[i] = NULL;
    }
}

/* Free all the memory allocated to the partial schedule.  */

static void
free_partial_schedule (partial_schedule_ptr ps)
{
  if (!ps)
    return;
  free_ps_insns (ps);
  free (ps->rows);
  free (ps);
}

/* Clear the rows array with its PS_INSNs, and create a new one with
   NEW_II rows.  */

static void
reset_partial_schedule (partial_schedule_ptr ps, int new_ii)
{
  if (!ps)
    return;
  free_ps_insns (ps);
  /* Same II: the freed rows array can be reused as-is.  */
  if (new_ii == ps->ii)
    return;
  ps->rows = (ps_insn_ptr *) xrealloc (ps->rows, new_ii
                                       * sizeof (ps_insn_ptr));
  memset (ps->rows, 0, new_ii * sizeof (ps_insn_ptr));
  ps->ii = new_ii;
  ps->min_cycle = INT_MAX;
  ps->max_cycle = INT_MIN;
}

/* Prints the partial schedule as an ii rows array, for each rows
   print the ids of the insns in it.  */
void
print_partial_schedule (partial_schedule_ptr ps, FILE *dump)
{
  int i;

  for (i = 0; i < ps->ii; i++)
    {
      ps_insn_ptr ps_i = ps->rows[i];

      fprintf (dump, "\n[ROW %d ]: ", i);
      while (ps_i)
        {
          fprintf (dump, "%d, ",
                   INSN_UID (ps_i->node->insn));
          ps_i = ps_i->next_in_row;
        }
    }
}

/* Creates an object of PS_INSN and initializes it to the given parameters.
*/
static ps_insn_ptr
create_ps_insn (ddg_node_ptr node, int rest_count, int cycle)
{
  ps_insn_ptr ps_i = XNEW (struct ps_insn);

  ps_i->node = node;
  ps_i->next_in_row = NULL;
  ps_i->prev_in_row = NULL;
  ps_i->row_rest_count = rest_count;
  ps_i->cycle = cycle;

  return ps_i;
}


/* Removes the given PS_INSN from the partial schedule.  Returns false if the
   node is not found in the partial schedule, else returns true.  */
static bool
remove_node_from_ps (partial_schedule_ptr ps, ps_insn_ptr ps_i)
{
  int row;

  if (!ps || !ps_i)
    return false;

  row = SMODULO (ps_i->cycle, ps->ii);
  if (! ps_i->prev_in_row)
    {
      /* PS_I claims to be the head of its row; verify and unlink it.  */
      if (ps_i != ps->rows[row])
        return false;

      ps->rows[row] = ps_i->next_in_row;
      if (ps->rows[row])
        ps->rows[row]->prev_in_row = NULL;
    }
  else
    {
      /* Unlink PS_I from the middle or end of the doubly linked row.  */
      ps_i->prev_in_row->next_in_row = ps_i->next_in_row;
      if (ps_i->next_in_row)
        ps_i->next_in_row->prev_in_row = ps_i->prev_in_row;
    }
  free (ps_i);
  return true;
}

/* Unlike what literature describes for modulo scheduling (which focuses
   on VLIW machines) the order of the instructions inside a cycle is
   important.  Given the bitmaps MUST_FOLLOW and MUST_PRECEDE we know
   where the current instruction should go relative to the already
   scheduled instructions in the given cycle.  Go over these
   instructions and find the first possible column to put it in.  */
static bool
ps_insn_find_column (partial_schedule_ptr ps, ps_insn_ptr ps_i,
                     sbitmap must_precede, sbitmap must_follow)
{
  ps_insn_ptr next_ps_i;
  ps_insn_ptr first_must_follow = NULL;
  ps_insn_ptr last_must_precede = NULL;
  int row;

  if (! ps_i)
    return false;

  row = SMODULO (ps_i->cycle, ps->ii);

  /* Find the first must follow and the last must precede
     and insert the node immediately after the must precede
     but make sure that there is no must follow after it.  */
  for (next_ps_i = ps->rows[row];
       next_ps_i;
       next_ps_i = next_ps_i->next_in_row)
    {
      if (must_follow && TEST_BIT (must_follow, next_ps_i->node->cuid)
          && ! first_must_follow)
        first_must_follow = next_ps_i;
      if (must_precede && TEST_BIT (must_precede, next_ps_i->node->cuid))
        {
          /* If we have already met a node that must follow, then
             there is no possible column.  */
          if (first_must_follow)
            return false;
          else
            last_must_precede = next_ps_i;
        }
    }

  /* Now insert the node after INSERT_AFTER_PSI.  */

  if (! last_must_precede)
    {
      /* No must-precede constraint: insert at the head of the row.  */
      ps_i->next_in_row = ps->rows[row];
      ps_i->prev_in_row = NULL;
      if (ps_i->next_in_row)
        ps_i->next_in_row->prev_in_row = ps_i;
      ps->rows[row] = ps_i;
    }
  else
    {
      /* Insert right after the last must-precede insn.  */
      ps_i->next_in_row = last_must_precede->next_in_row;
      last_must_precede->next_in_row = ps_i;
      ps_i->prev_in_row = last_must_precede;
      if (ps_i->next_in_row)
        ps_i->next_in_row->prev_in_row = ps_i;
    }

  return true;
}

/* Advances the PS_INSN one column in its current row; returns false
   on failure and true on success.  Bit N is set in MUST_FOLLOW if
   the node with cuid N must come after the node pointed to by
   PS_I when scheduled in the same cycle.  */
static int
ps_insn_advance_column (partial_schedule_ptr ps, ps_insn_ptr ps_i,
                        sbitmap must_follow)
{
  ps_insn_ptr prev, next;
  int row;
  ddg_node_ptr next_node;

  if (!ps || !ps_i)
    return false;

  row = SMODULO (ps_i->cycle, ps->ii);

  /* Already the last insn in its row; nowhere to advance.  */
  if (! ps_i->next_in_row)
    return false;

  next_node = ps_i->next_in_row->node;

  /* Check if next_in_row is dependent on ps_i, both having same sched
     times (typically ANTI_DEP).  If so, ps_i cannot skip over it.  */
  if (must_follow && TEST_BIT (must_follow, next_node->cuid))
    return false;

  /* Advance PS_I over its next_in_row in the doubly linked list.  */
  prev = ps_i->prev_in_row;
  next = ps_i->next_in_row;

  if (ps_i == ps->rows[row])
    ps->rows[row] = next;

  ps_i->next_in_row = next->next_in_row;

  if (next->next_in_row)
    next->next_in_row->prev_in_row = ps_i;

  next->next_in_row = ps_i;
  ps_i->prev_in_row = next;

  next->prev_in_row = prev;
  if (prev)
    prev->next_in_row = next;

  return true;
}

/* Inserts a DDG_NODE to the given partial schedule at the given cycle.
   Returns 0 if this is not possible and a PS_INSN otherwise.  Bit N is
   set in MUST_PRECEDE/MUST_FOLLOW if the node with cuid N must come
   before/after (respectively) the node pointed to by PS_I when scheduled
   in the same cycle.  */
static ps_insn_ptr
add_node_to_ps (partial_schedule_ptr ps, ddg_node_ptr node, int cycle,
                sbitmap must_precede, sbitmap must_follow)
{
  ps_insn_ptr ps_i;
  int rest_count = 1;
  int row = SMODULO (cycle, ps->ii);

  /* The head insn's row_rest_count records how full the row already is.  */
  if (ps->rows[row]
      && ps->rows[row]->row_rest_count >= issue_rate)
    return NULL;

  if (ps->rows[row])
    rest_count += ps->rows[row]->row_rest_count;

  ps_i = create_ps_insn (node, rest_count, cycle);

  /* Finds and inserts PS_I according to MUST_FOLLOW and
     MUST_PRECEDE.  */
  if (! ps_insn_find_column (ps, ps_i, must_precede, must_follow))
    {
      free (ps_i);
      return NULL;
    }

  return ps_i;
}

/* Advance time one cycle.  Assumes DFA is being used.
*/
static void
advance_one_cycle (void)
{
  if (targetm.sched.dfa_pre_cycle_insn)
    state_transition (curr_state,
                      targetm.sched.dfa_pre_cycle_insn ());

  /* A NULL insn advances the DFA to the next cycle.  */
  state_transition (curr_state, NULL);

  if (targetm.sched.dfa_post_cycle_insn)
    state_transition (curr_state,
                      targetm.sched.dfa_post_cycle_insn ());
}



/* Checks if PS has resource conflicts according to DFA, starting from
   FROM cycle to TO cycle; returns true if there are conflicts and false
   if there are no conflicts.  Assumes DFA is being used.  */
static int
ps_has_conflicts (partial_schedule_ptr ps, int from, int to)
{
  int cycle;

  state_reset (curr_state);

  for (cycle = from; cycle <= to; cycle++)
    {
      ps_insn_ptr crr_insn;
      /* Holds the remaining issue slots in the current row.  */
      int can_issue_more = issue_rate;

      /* Walk through the DFA for the current row.  */
      for (crr_insn = ps->rows[SMODULO (cycle, ps->ii)];
           crr_insn;
           crr_insn = crr_insn->next_in_row)
        {
          rtx insn = crr_insn->node->insn;

          if (!NONDEBUG_INSN_P (insn))
            continue;

          /* Check if there is room for the current insn.  */
          if (!can_issue_more || state_dead_lock_p (curr_state))
            return true;

          /* Update the DFA state and return with failure if the DFA found
             resource conflicts.  */
          if (state_transition (curr_state, insn) >= 0)
            return true;

          if (targetm.sched.variable_issue)
            can_issue_more =
              targetm.sched.variable_issue (sched_dump, sched_verbose,
                                            insn, can_issue_more);
          /* A naked CLOBBER or USE generates no instruction, so don't
             let them consume issue slots.  */
          else if (GET_CODE (PATTERN (insn)) != USE
                   && GET_CODE (PATTERN (insn)) != CLOBBER)
            can_issue_more--;
        }

      /* Advance the DFA to the next cycle.  */
      advance_one_cycle ();
    }
  return false;
}

/* Checks if the given node causes resource conflicts when added to PS at
   cycle C.  If not the node is added to PS and returned; otherwise zero
   is returned.  Bit N is set in MUST_PRECEDE/MUST_FOLLOW if the node with
   cuid N must come before/after (respectively) the node pointed to by
   PS_I when scheduled in the same cycle.  */
ps_insn_ptr
ps_add_node_check_conflicts (partial_schedule_ptr ps, ddg_node_ptr n,
                             int c, sbitmap must_precede,
                             sbitmap must_follow)
{
  int has_conflicts = 0;
  ps_insn_ptr ps_i;

  /* First add the node to the PS, if this succeeds check for
     conflicts, trying different issue slots in the same row.  */
  if (! (ps_i = add_node_to_ps (ps, n, c, must_precede, must_follow)))
    return NULL; /* Failed to insert the node at the given cycle.  */

  has_conflicts = ps_has_conflicts (ps, c, c)
                  || (ps->history > 0
                      && ps_has_conflicts (ps,
                                           c - ps->history,
                                           c + ps->history));

  /* Try different issue slots to find one that the given node can be
     scheduled in without conflicts.  */
  while (has_conflicts)
    {
      if (! ps_insn_advance_column (ps, ps_i, must_follow))
        break;
      has_conflicts = ps_has_conflicts (ps, c, c)
                      || (ps->history > 0
                          && ps_has_conflicts (ps,
                                               c - ps->history,
                                               c + ps->history));
    }

  if (has_conflicts)
    {
      remove_node_from_ps (ps, ps_i);
      return NULL;
    }

  /* Success: widen the recorded cycle bounds to cover C.  */
  ps->min_cycle = MIN (ps->min_cycle, c);
  ps->max_cycle = MAX (ps->max_cycle, c);
  return ps_i;
}

/* Rotate the rows of PS such that insns scheduled at time
   START_CYCLE will appear in row 0.  Updates max/min_cycles.  */
void
rotate_partial_schedule (partial_schedule_ptr ps, int start_cycle)
{
  int i, row, backward_rotates;
  int last_row = ps->ii - 1;

  if (start_cycle == 0)
    return;

  backward_rotates = SMODULO (start_cycle, ps->ii);

  /* Revisit later and optimize this into a single loop.  */
  for (i = 0; i < backward_rotates; i++)
    {
      ps_insn_ptr first_row = ps->rows[0];

      for (row = 0; row < last_row; row++)
        ps->rows[row] = ps->rows[row+1];

      ps->rows[last_row] = first_row;
    }

  ps->max_cycle -= start_cycle;
  ps->min_cycle -= start_cycle;
}

#endif /* INSN_SCHEDULING */

/* Gate function for the SMS pass: run only when optimizing and
   -fmodulo-sched was given.  */
static bool
gate_handle_sms (void)
{
  return (optimize > 0 && flag_modulo_sched);
}


/* Run instruction scheduler.  */
/* Perform SMS modulo scheduling.  */
static unsigned int
rest_of_handle_sms (void)
{
#ifdef INSN_SCHEDULING
  basic_block bb;

  /* Collect loop information to be used in SMS.  */
  cfg_layout_initialize (0);
  sms_schedule ();

  /* Update the life information, because we add pseudos.  */
  max_regno = max_reg_num ();

  /* Finalize layout changes.  */
  FOR_EACH_BB (bb)
    if (bb->next_bb != EXIT_BLOCK_PTR)
      bb->aux = bb->next_bb;
  free_dominance_info (CDI_DOMINATORS);
  cfg_layout_finalize ();
#endif /* INSN_SCHEDULING */
  return 0;
}

/* The SMS pass descriptor.  */
struct rtl_opt_pass pass_sms =
{
 {
  RTL_PASS,
  "sms",                                /* name */
  gate_handle_sms,                      /* gate */
  rest_of_handle_sms,                   /* execute */
  NULL,                                 /* sub */
  NULL,                                 /* next */
  0,                                    /* static_pass_number */
  TV_SMS,                               /* tv_id */
  0,                                    /* properties_required */
  0,                                    /* properties_provided */
  0,                                    /* properties_destroyed */
  TODO_dump_func,                       /* todo_flags_start */
  TODO_df_finish | TODO_verify_rtl_sharing |
  TODO_dump_func |
  TODO_ggc_collect                      /* todo_flags_finish */
 }
};