xref: /netbsd-src/external/gpl3/gcc.old/dist/gcc/modulo-sched.c (revision 8feb0f0b7eaff0608f8350bbfa3098827b4bb91b)
1 /* Swing Modulo Scheduling implementation.
2    Copyright (C) 2004-2020 Free Software Foundation, Inc.
3    Contributed by Ayal Zaks and Mustafa Hagog <zaks,mustafa@il.ibm.com>
4 
5 This file is part of GCC.
6 
7 GCC is free software; you can redistribute it and/or modify it under
8 the terms of the GNU General Public License as published by the Free
9 Software Foundation; either version 3, or (at your option) any later
10 version.
11 
12 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
13 WARRANTY; without even the implied warranty of MERCHANTABILITY or
14 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
15 for more details.
16 
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3.  If not see
19 <http://www.gnu.org/licenses/>.  */
20 
21 
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "cfghooks.h"
30 #include "df.h"
31 #include "memmodel.h"
32 #include "optabs.h"
33 #include "regs.h"
34 #include "emit-rtl.h"
35 #include "gcov-io.h"
36 #include "profile.h"
37 #include "insn-attr.h"
38 #include "cfgrtl.h"
39 #include "sched-int.h"
40 #include "cfgloop.h"
41 #include "expr.h"
42 #include "ddg.h"
43 #include "tree-pass.h"
44 #include "dbgcnt.h"
45 #include "loop-unroll.h"
46 #include "hard-reg-set.h"
47 
48 #ifdef INSN_SCHEDULING
49 
50 /* This file contains the implementation of the Swing Modulo Scheduler,
51    described in the following references:
52    [1] J. Llosa, A. Gonzalez, E. Ayguade, M. Valero., and J. Eckhardt.
53        Lifetime--sensitive modulo scheduling in a production environment.
54        IEEE Trans. on Comps., 50(3), March 2001
55    [2] J. Llosa, A. Gonzalez, E. Ayguade, and M. Valero.
56        Swing Modulo Scheduling: A Lifetime Sensitive Approach.
57        PACT '96 , pages 80-87, October 1996 (Boston - Massachusetts - USA).
58 
59    The basic structure is:
60    1. Build a data-dependence graph (DDG) for each loop.
61    2. Use the DDG to order the insns of a loop (not in topological order
62       necessarily, but rather) trying to place each insn after all its
63       predecessors _or_ after all its successors.
64    3. Compute MII: a lower bound on the number of cycles to schedule the loop.
65    4. Use the ordering to perform list-scheduling of the loop:
66       1. Set II = MII.  We will try to schedule the loop within II cycles.
67       2. Try to schedule the insns one by one according to the ordering.
68 	 For each insn compute an interval of cycles by considering already-
69 	 scheduled preds and succs (and associated latencies); try to place
70 	 the insn in the cycles of this window checking for potential
71 	 resource conflicts (using the DFA interface).
72 	 Note: this is different from the cycle-scheduling of schedule_insns;
73 	 here the insns are not scheduled monotonically top-down (nor bottom-
74 	 up).
75       3. If failed in scheduling all insns - bump II++ and try again, unless
76 	 II reaches an upper bound MaxII, in which case report failure.
77    5. If we succeeded in scheduling the loop within II cycles, we now
78       generate prolog and epilog, decrease the counter of the loop, and
79       perform modulo variable expansion for live ranges that span more than
80       II cycles (i.e. use register copies to prevent a def from overwriting
81       itself before reaching the use).
82 
83     SMS works with countable loops (1) whose control part can be easily
84     decoupled from the rest of the loop and (2) whose loop count can
85     be easily adjusted.  This is because we peel a constant number of
86     iterations into a prologue and epilogue for which we want to avoid
87     emitting the control part, and a kernel which is to iterate that
88     constant number of iterations less than the original loop.  So the
89     control part should be a set of insns clearly identified and having
90     its own iv, not otherwise used in the loop (at-least for now), which
91     initializes a register before the loop to the number of iterations.
92     Currently SMS relies on the do-loop pattern to recognize such loops,
93     where (1) the control part comprises of all insns defining and/or
94     using a certain 'count' register and (2) the loop count can be
95     adjusted by modifying this register prior to the loop.
96     TODO: Rely on cfgloop analysis instead.  */
97 
98 /* This page defines partial-schedule structures and functions for
99    modulo scheduling.  */
100 
101 typedef struct partial_schedule *partial_schedule_ptr;
102 typedef struct ps_insn *ps_insn_ptr;
103 
104 /* The minimum (absolute) cycle that a node of ps was scheduled in.  */
105 #define PS_MIN_CYCLE(ps) (((partial_schedule_ptr)(ps))->min_cycle)
106 
107 /* The maximum (absolute) cycle that a node of ps was scheduled in.  */
108 #define PS_MAX_CYCLE(ps) (((partial_schedule_ptr)(ps))->max_cycle)
109 
110 /* Perform signed modulo, always returning a non-negative value.  */
111 #define SMODULO(x,y) ((x) % (y) < 0 ? ((x) % (y) + (y)) : (x) % (y))
112 
113 /* The number of different iterations the nodes in ps span, assuming
114    the stage boundaries are placed efficiently.  */
115 #define CALC_STAGE_COUNT(max_cycle,min_cycle,ii) ((max_cycle - min_cycle \
116                          + 1 + ii - 1) / ii)
117 /* The stage count of ps.  */
118 #define PS_STAGE_COUNT(ps) (((partial_schedule_ptr)(ps))->stage_count)
119 
120 /* A single instruction in the partial schedule.  */
121 struct ps_insn
122 {
123   /* Identifies the instruction to be scheduled.  Values smaller than
124      the ddg's num_nodes refer directly to ddg nodes.  A value of
125      X - num_nodes refers to register move X.  */
126   int id;
127 
128   /* The (absolute) cycle in which the PS instruction is scheduled.
129      Same as SCHED_TIME (node).  */
130   int cycle;
131 
132   /* The next/prev PS_INSN in the same row.  */
133   ps_insn_ptr next_in_row,
134 	      prev_in_row;
135 
136 };
137 
138 /* Information about a register move that has been added to a partial
139    schedule.  */
140 struct ps_reg_move_info
141 {
142   /* The source of the move is defined by the ps_insn with id DEF.
143      The destination is used by the ps_insns with the ids in USES.  */
144   int def;
145   sbitmap uses;
146 
147   /* The original form of USES' instructions used OLD_REG, but they
148      should now use NEW_REG.  */
149   rtx old_reg;
150   rtx new_reg;
151 
152   /* The number of consecutive stages that the move occupies.  */
153   int num_consecutive_stages;
154 
155   /* An instruction that sets NEW_REG to the correct value.  The first
156      move associated with DEF will have an rhs of OLD_REG; later moves
157      use the result of the previous move.  */
158   rtx_insn *insn;
159 };
160 
161 /* Holds the partial schedule as an array of II rows.  Each entry of the
162    array points to a linked list of PS_INSNs, which represents the
163    instructions that are scheduled for that row.  */
164 struct partial_schedule
165 {
166   int ii;	/* Number of rows in the partial schedule.  */
167   int history;  /* Threshold for conflict checking using DFA.  */
168 
169   /* rows[i] points to linked list of insns scheduled in row i (0<=i<ii).  */
170   ps_insn_ptr *rows;
171 
172   /* All the moves added for this partial schedule.  Index X has
173      a ps_insn id of X + g->num_nodes.  */
174   vec<ps_reg_move_info> reg_moves;
175 
176   /*  rows_length[i] holds the number of instructions in the row.
177       It is used only (as an optimization) to back off quickly from
178       trying to schedule a node in a full row; that is, to avoid running
179       through futile DFA state transitions.  */
180   int *rows_length;
181 
182   /* The earliest absolute cycle of an insn in the partial schedule.  */
183   int min_cycle;
184 
185   /* The latest absolute cycle of an insn in the partial schedule.  */
186   int max_cycle;
187 
188   ddg_ptr g;	/* The DDG of the insns in the partial schedule.  */
189 
190   int stage_count;  /* The stage count of the partial schedule.  */
191 };
192 
193 
194 static partial_schedule_ptr create_partial_schedule (int ii, ddg_ptr, int history);
195 static void free_partial_schedule (partial_schedule_ptr);
196 static void reset_partial_schedule (partial_schedule_ptr, int new_ii);
197 void print_partial_schedule (partial_schedule_ptr, FILE *);
198 static void verify_partial_schedule (partial_schedule_ptr, sbitmap);
199 static ps_insn_ptr ps_add_node_check_conflicts (partial_schedule_ptr,
200 						int, int, sbitmap, sbitmap);
201 static void rotate_partial_schedule (partial_schedule_ptr, int);
202 void set_row_column_for_ps (partial_schedule_ptr);
203 static void ps_insert_empty_row (partial_schedule_ptr, int, sbitmap);
204 static int compute_split_row (sbitmap, int, int, int, ddg_node_ptr);
205 
206 
207 /* This page defines constants and structures for the modulo scheduling
208    driver.  */
209 
210 static int sms_order_nodes (ddg_ptr, int, int *, int *);
211 static void set_node_sched_params (ddg_ptr);
212 static partial_schedule_ptr sms_schedule_by_order (ddg_ptr, int, int, int *);
213 static void permute_partial_schedule (partial_schedule_ptr, rtx_insn *);
214 static int calculate_stage_count (partial_schedule_ptr, int);
215 static void calculate_must_precede_follow (ddg_node_ptr, int, int,
216 					   int, int, sbitmap, sbitmap, sbitmap);
217 static int get_sched_window (partial_schedule_ptr, ddg_node_ptr,
218 			     sbitmap, int, int *, int *, int *);
219 static bool try_scheduling_node_in_cycle (partial_schedule_ptr, int, int,
220 					  sbitmap, int *, sbitmap, sbitmap);
221 static void remove_node_from_ps (partial_schedule_ptr, ps_insn_ptr);
222 
223 #define NODE_ASAP(node) ((node)->aux.count)
224 
225 #define SCHED_PARAMS(x) (&node_sched_param_vec[x])
226 #define SCHED_TIME(x) (SCHED_PARAMS (x)->time)
227 #define SCHED_ROW(x) (SCHED_PARAMS (x)->row)
228 #define SCHED_STAGE(x) (SCHED_PARAMS (x)->stage)
229 #define SCHED_COLUMN(x) (SCHED_PARAMS (x)->column)
230 
231 /* The scheduling parameters held for each node.  */
232 typedef struct node_sched_params
233 {
234   int time;	/* The absolute scheduling cycle.  */
235 
236   int row;    /* Holds time % ii.  */
237   int stage;  /* Holds time / ii.  */
238 
239   /* The column of a node inside the ps.  If nodes u, v are on the same row,
240      u will precede v if column (u) < column (v).  */
241   int column;
242 } *node_sched_params_ptr;
243 
244 /* The following three functions are copied from the current scheduler
245    code in order to use sched_analyze() for computing the dependencies.
246    They are used when initializing the sched_info structure.  */
247 static const char *
sms_print_insn(const rtx_insn * insn,int aligned ATTRIBUTE_UNUSED)248 sms_print_insn (const rtx_insn *insn, int aligned ATTRIBUTE_UNUSED)
249 {
250   static char tmp[80];
251 
252   sprintf (tmp, "i%4d", INSN_UID (insn));
253   return tmp;
254 }
255 
256 static void
compute_jump_reg_dependencies(rtx insn ATTRIBUTE_UNUSED,regset used ATTRIBUTE_UNUSED)257 compute_jump_reg_dependencies (rtx insn ATTRIBUTE_UNUSED,
258 			       regset used ATTRIBUTE_UNUSED)
259 {
260 }
261 
262 static struct common_sched_info_def sms_common_sched_info;
263 
264 static struct sched_deps_info_def sms_sched_deps_info =
265   {
266     compute_jump_reg_dependencies,
267     NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
268     NULL,
269     0, 0, 0
270   };
271 
272 static struct haifa_sched_info sms_sched_info =
273 {
274   NULL,
275   NULL,
276   NULL,
277   NULL,
278   NULL,
279   sms_print_insn,
280   NULL,
281   NULL, /* insn_finishes_block_p */
282   NULL, NULL,
283   NULL, NULL,
284   0, 0,
285 
286   NULL, NULL, NULL, NULL,
287   NULL, NULL,
288   0
289 };
290 
291 /* Partial schedule instruction ID in PS is a register move.  Return
292    information about it.  */
293 static struct ps_reg_move_info *
ps_reg_move(partial_schedule_ptr ps,int id)294 ps_reg_move (partial_schedule_ptr ps, int id)
295 {
296   gcc_checking_assert (id >= ps->g->num_nodes);
297   return &ps->reg_moves[id - ps->g->num_nodes];
298 }
299 
300 /* Return the rtl instruction that is being scheduled by partial schedule
301    instruction ID, which belongs to schedule PS.  */
302 static rtx_insn *
ps_rtl_insn(partial_schedule_ptr ps,int id)303 ps_rtl_insn (partial_schedule_ptr ps, int id)
304 {
305   if (id < ps->g->num_nodes)
306     return ps->g->nodes[id].insn;
307   else
308     return ps_reg_move (ps, id)->insn;
309 }
310 
311 /* Partial schedule instruction ID, which belongs to PS, occurred in
312    the original (unscheduled) loop.  Return the first instruction
313    in the loop that was associated with ps_rtl_insn (PS, ID).
314    If the instruction had some notes before it, this is the first
315    of those notes.  */
316 static rtx_insn *
ps_first_note(partial_schedule_ptr ps,int id)317 ps_first_note (partial_schedule_ptr ps, int id)
318 {
319   gcc_assert (id < ps->g->num_nodes);
320   return ps->g->nodes[id].first_note;
321 }
322 
323 /* Return the number of consecutive stages that are occupied by
324    partial schedule instruction ID in PS.  */
325 static int
ps_num_consecutive_stages(partial_schedule_ptr ps,int id)326 ps_num_consecutive_stages (partial_schedule_ptr ps, int id)
327 {
328   if (id < ps->g->num_nodes)
329     return 1;
330   else
331     return ps_reg_move (ps, id)->num_consecutive_stages;
332 }
333 
334 /* Given HEAD and TAIL which are the first and last insns in a loop;
335    return the register which controls the loop.  Return zero if it has
336    more than one occurrence in the loop besides the control part or the
337    do-loop pattern is not of the form we expect.  */
338 static rtx
doloop_register_get(rtx_insn * head,rtx_insn * tail)339 doloop_register_get (rtx_insn *head, rtx_insn *tail)
340 {
341   rtx reg, condition;
342   rtx_insn *insn, *first_insn_not_to_check;
343 
344   if (!JUMP_P (tail))
345     return NULL_RTX;
346 
347   if (!targetm.code_for_doloop_end)
348     return NULL_RTX;
349 
350   /* TODO: Free SMS's dependence on doloop_condition_get.  */
351   condition = doloop_condition_get (tail);
352   if (! condition)
353     return NULL_RTX;
354 
355   if (REG_P (XEXP (condition, 0)))
356     reg = XEXP (condition, 0);
357   else if (GET_CODE (XEXP (condition, 0)) == PLUS
358 	   && REG_P (XEXP (XEXP (condition, 0), 0)))
359     reg = XEXP (XEXP (condition, 0), 0);
360   else
361     gcc_unreachable ();
362 
363   /* Check that the COUNT_REG has no other occurrences in the loop
364      until the decrement.  We assume the control part consists of
365      either a single (parallel) branch-on-count or a (non-parallel)
366      branch immediately preceded by a single (decrement) insn.  */
367   first_insn_not_to_check = (GET_CODE (PATTERN (tail)) == PARALLEL ? tail
368                              : prev_nondebug_insn (tail));
369 
370   for (insn = head; insn != first_insn_not_to_check; insn = NEXT_INSN (insn))
371     if (NONDEBUG_INSN_P (insn) && reg_mentioned_p (reg, insn))
372       {
373         if (dump_file)
374         {
375           fprintf (dump_file, "SMS count_reg found ");
376           print_rtl_single (dump_file, reg);
377           fprintf (dump_file, " outside control in insn:\n");
378           print_rtl_single (dump_file, insn);
379         }
380 
381         return NULL_RTX;
382       }
383 
384   return reg;
385 }
386 
387 /* Check if COUNT_REG is set to a constant in the PRE_HEADER block, so
388    that the number of iterations is a compile-time constant.  If so,
389    return the rtx_insn that sets COUNT_REG to a constant, and set COUNT to
390    this constant.  Otherwise return 0.  */
391 static rtx_insn *
const_iteration_count(rtx count_reg,basic_block pre_header,int64_t * count,bool * adjust_inplace)392 const_iteration_count (rtx count_reg, basic_block pre_header,
393 		       int64_t *count, bool* adjust_inplace)
394 {
395   rtx_insn *insn;
396   rtx_insn *head, *tail;
397 
398   *adjust_inplace = false;
399   bool read_after = false;
400 
401   if (! pre_header)
402     return NULL;
403 
404   get_ebb_head_tail (pre_header, pre_header, &head, &tail);
405 
406   for (insn = tail; insn != PREV_INSN (head); insn = PREV_INSN (insn))
407     if (single_set (insn) && rtx_equal_p (count_reg,
408 					  SET_DEST (single_set (insn))))
409       {
410 	rtx pat = single_set (insn);
411 
412 	if (CONST_INT_P (SET_SRC (pat)))
413 	  {
414 	    *count = INTVAL (SET_SRC (pat));
415 	    *adjust_inplace = !read_after;
416 	    return insn;
417 	  }
418 
419 	return NULL;
420       }
421     else if (NONDEBUG_INSN_P (insn) && reg_mentioned_p (count_reg, insn))
422       {
423 	read_after = true;
424 	if (reg_set_p (count_reg, insn))
425 	   break;
426       }
427 
428   return NULL;
429 }
430 
431 /* A very simple resource-based lower bound on the initiation interval.
432    ??? Improve the accuracy of this bound by considering the
433    utilization of various units.  */
434 static int
res_MII(ddg_ptr g)435 res_MII (ddg_ptr g)
436 {
437   if (targetm.sched.sms_res_mii)
438     return targetm.sched.sms_res_mii (g);
439 
440   return g->num_nodes / issue_rate;
441 }
442 
443 
444 /* A vector that contains the sched data for each ps_insn.  */
445 static vec<node_sched_params> node_sched_param_vec;
446 
447 /* Allocate sched_params for each node and initialize it.  */
448 static void
set_node_sched_params(ddg_ptr g)449 set_node_sched_params (ddg_ptr g)
450 {
451   node_sched_param_vec.truncate (0);
452   node_sched_param_vec.safe_grow_cleared (g->num_nodes);
453 }
454 
455 /* Make sure that node_sched_param_vec has an entry for every move in PS.  */
456 static void
extend_node_sched_params(partial_schedule_ptr ps)457 extend_node_sched_params (partial_schedule_ptr ps)
458 {
459   node_sched_param_vec.safe_grow_cleared (ps->g->num_nodes
460 					  + ps->reg_moves.length ());
461 }
462 
463 /* Update the sched_params (time, row and stage) for node U using the II,
464    the CYCLE of U and MIN_CYCLE.
465    We're not simply taking the following
466    SCHED_STAGE (u) = CALC_STAGE_COUNT (SCHED_TIME (u), min_cycle, ii);
467    because the stages may not be aligned on cycle 0.  */
468 static void
update_node_sched_params(int u,int ii,int cycle,int min_cycle)469 update_node_sched_params (int u, int ii, int cycle, int min_cycle)
470 {
471   int sc_until_cycle_zero;
472   int stage;
473 
474   SCHED_TIME (u) = cycle;
475   SCHED_ROW (u) = SMODULO (cycle, ii);
476 
477   /* The calculation of stage count is done adding the number
478      of stages before cycle zero and after cycle zero.  */
479   sc_until_cycle_zero = CALC_STAGE_COUNT (-1, min_cycle, ii);
480 
481   if (SCHED_TIME (u) < 0)
482     {
483       stage = CALC_STAGE_COUNT (-1, SCHED_TIME (u), ii);
484       SCHED_STAGE (u) = sc_until_cycle_zero - stage;
485     }
486   else
487     {
488       stage = CALC_STAGE_COUNT (SCHED_TIME (u), 0, ii);
489       SCHED_STAGE (u) = sc_until_cycle_zero + stage - 1;
490     }
491 }
492 
493 static void
print_node_sched_params(FILE * file,int num_nodes,partial_schedule_ptr ps)494 print_node_sched_params (FILE *file, int num_nodes, partial_schedule_ptr ps)
495 {
496   int i;
497 
498   if (! file)
499     return;
500   for (i = 0; i < num_nodes; i++)
501     {
502       node_sched_params_ptr nsp = SCHED_PARAMS (i);
503 
504       fprintf (file, "Node = %d; INSN = %d\n", i,
505 	       INSN_UID (ps_rtl_insn (ps, i)));
506       fprintf (file, " asap = %d:\n", NODE_ASAP (&ps->g->nodes[i]));
507       fprintf (file, " time = %d:\n", nsp->time);
508       fprintf (file, " stage = %d:\n", nsp->stage);
509     }
510 }
511 
512 /* Set SCHED_COLUMN for each instruction in row ROW of PS.  */
513 static void
set_columns_for_row(partial_schedule_ptr ps,int row)514 set_columns_for_row (partial_schedule_ptr ps, int row)
515 {
516   ps_insn_ptr cur_insn;
517   int column;
518 
519   column = 0;
520   for (cur_insn = ps->rows[row]; cur_insn; cur_insn = cur_insn->next_in_row)
521     SCHED_COLUMN (cur_insn->id) = column++;
522 }
523 
524 /* Set SCHED_COLUMN for each instruction in PS.  */
525 static void
set_columns_for_ps(partial_schedule_ptr ps)526 set_columns_for_ps (partial_schedule_ptr ps)
527 {
528   int row;
529 
530   for (row = 0; row < ps->ii; row++)
531     set_columns_for_row (ps, row);
532 }
533 
534 /* Try to schedule the move with ps_insn identifier I_REG_MOVE in PS.
535    Its single predecessor has already been scheduled, as has its
536    ddg node successors.  (The move may have also another move as its
537    successor, in which case that successor will be scheduled later.)
538 
539    The move is part of a chain that satisfies register dependencies
540    between a producing ddg node and various consuming ddg nodes.
541    If some of these dependencies have a distance of 1 (meaning that
542    the use is upward-exposed) then DISTANCE1_USES is nonnull and
543    contains the set of uses with distance-1 dependencies.
544    DISTANCE1_USES is null otherwise.
545 
546    MUST_FOLLOW is a scratch bitmap that is big enough to hold
547    all current ps_insn ids.
548 
549    Return true on success.  */
550 static bool
schedule_reg_move(partial_schedule_ptr ps,int i_reg_move,sbitmap distance1_uses,sbitmap must_follow)551 schedule_reg_move (partial_schedule_ptr ps, int i_reg_move,
552 		   sbitmap distance1_uses, sbitmap must_follow)
553 {
554   unsigned int u;
555   int this_time, this_distance, this_start, this_end, this_latency;
556   int start, end, c, ii;
557   sbitmap_iterator sbi;
558   ps_reg_move_info *move;
559   rtx_insn *this_insn;
560   ps_insn_ptr psi;
561 
562   move = ps_reg_move (ps, i_reg_move);
563   ii = ps->ii;
564   if (dump_file)
565     {
566       fprintf (dump_file, "Scheduling register move INSN %d; ii = %d"
567 	       ", min cycle = %d\n\n", INSN_UID (move->insn), ii,
568 	       PS_MIN_CYCLE (ps));
569       print_rtl_single (dump_file, move->insn);
570       fprintf (dump_file, "\n%11s %11s %5s\n", "start", "end", "time");
571       fprintf (dump_file, "=========== =========== =====\n");
572     }
573 
574   start = INT_MIN;
575   end = INT_MAX;
576 
577   /* For dependencies of distance 1 between a producer ddg node A
578      and consumer ddg node B, we have a chain of dependencies:
579 
580         A --(T,L1,1)--> M1 --(T,L2,0)--> M2 ... --(T,Ln,0)--> B
581 
582      where Mi is the ith move.  For dependencies of distance 0 between
583      a producer ddg node A and consumer ddg node C, we have a chain of
584      dependencies:
585 
586         A --(T,L1',0)--> M1' --(T,L2',0)--> M2' ... --(T,Ln',0)--> C
587 
588      where Mi' occupies the same position as Mi but occurs a stage later.
589      We can only schedule each move once, so if we have both types of
590      chain, we model the second as:
591 
592         A --(T,L1',1)--> M1 --(T,L2',0)--> M2 ... --(T,Ln',-1)--> C
593 
594      First handle the dependencies between the previously-scheduled
595      predecessor and the move.  */
596   this_insn = ps_rtl_insn (ps, move->def);
597   this_latency = insn_latency (this_insn, move->insn);
598   this_distance = distance1_uses && move->def < ps->g->num_nodes ? 1 : 0;
599   this_time = SCHED_TIME (move->def) - this_distance * ii;
600   this_start = this_time + this_latency;
601   this_end = this_time + ii;
602   if (dump_file)
603     fprintf (dump_file, "%11d %11d %5d %d --(T,%d,%d)--> %d\n",
604 	     this_start, this_end, SCHED_TIME (move->def),
605 	     INSN_UID (this_insn), this_latency, this_distance,
606 	     INSN_UID (move->insn));
607 
608   if (start < this_start)
609     start = this_start;
610   if (end > this_end)
611     end = this_end;
612 
613   /* Handle the dependencies between the move and previously-scheduled
614      successors.  */
615   EXECUTE_IF_SET_IN_BITMAP (move->uses, 0, u, sbi)
616     {
617       this_insn = ps_rtl_insn (ps, u);
618       this_latency = insn_latency (move->insn, this_insn);
619       if (distance1_uses && !bitmap_bit_p (distance1_uses, u))
620 	this_distance = -1;
621       else
622 	this_distance = 0;
623       this_time = SCHED_TIME (u) + this_distance * ii;
624       this_start = this_time - ii;
625       this_end = this_time - this_latency;
626       if (dump_file)
627 	fprintf (dump_file, "%11d %11d %5d %d --(T,%d,%d)--> %d\n",
628 		 this_start, this_end, SCHED_TIME (u), INSN_UID (move->insn),
629 		 this_latency, this_distance, INSN_UID (this_insn));
630 
631       if (start < this_start)
632 	start = this_start;
633       if (end > this_end)
634 	end = this_end;
635     }
636 
637   if (dump_file)
638     {
639       fprintf (dump_file, "----------- ----------- -----\n");
640       fprintf (dump_file, "%11d %11d %5s %s\n", start, end, "", "(max, min)");
641     }
642 
643   bitmap_clear (must_follow);
644   bitmap_set_bit (must_follow, move->def);
645 
646   start = MAX (start, end - (ii - 1));
647   for (c = end; c >= start; c--)
648     {
649       psi = ps_add_node_check_conflicts (ps, i_reg_move, c,
650 					 move->uses, must_follow);
651       if (psi)
652 	{
653 	  update_node_sched_params (i_reg_move, ii, c, PS_MIN_CYCLE (ps));
654 	  if (dump_file)
655 	    fprintf (dump_file, "\nScheduled register move INSN %d at"
656 		     " time %d, row %d\n\n", INSN_UID (move->insn), c,
657 		     SCHED_ROW (i_reg_move));
658 	  return true;
659 	}
660     }
661 
662   if (dump_file)
663     fprintf (dump_file, "\nNo available slot\n\n");
664 
665   return false;
666 }
667 
668 /*
669    Breaking intra-loop register anti-dependences:
670    Each intra-loop register anti-dependence implies a cross-iteration true
671    dependence of distance 1. Therefore, we can remove such false dependencies
672    and figure out if the partial schedule broke them by checking if (for a
673    true-dependence of distance 1): SCHED_TIME (def) < SCHED_TIME (use) and
674    if so generate a register move.   The number of such moves is equal to:
675               SCHED_TIME (use) - SCHED_TIME (def)       { 0 broken
676    nreg_moves = ----------------------------------- + 1 - {   dependence.
677                             ii                          { 1 if not.
678 */
679 static bool
schedule_reg_moves(partial_schedule_ptr ps)680 schedule_reg_moves (partial_schedule_ptr ps)
681 {
682   ddg_ptr g = ps->g;
683   int ii = ps->ii;
684   int i;
685 
686   for (i = 0; i < g->num_nodes; i++)
687     {
688       ddg_node_ptr u = &g->nodes[i];
689       ddg_edge_ptr e;
690       int nreg_moves = 0, i_reg_move;
691       rtx prev_reg, old_reg;
692       int first_move;
693       int distances[2];
694       sbitmap distance1_uses;
695       rtx set = single_set (u->insn);
696 
697       /* Skip instructions that do not set a register.  */
698       if (set && !REG_P (SET_DEST (set)))
699         continue;
700 
701       /* Compute the number of reg_moves needed for u, by looking at life
702 	 ranges started at u (excluding self-loops).  */
703       distances[0] = distances[1] = false;
704       for (e = u->out; e; e = e->next_out)
705 	if (e->type == TRUE_DEP && e->dest != e->src)
706 	  {
707 	    int nreg_moves4e = (SCHED_TIME (e->dest->cuid)
708 				- SCHED_TIME (e->src->cuid)) / ii;
709 
710             if (e->distance == 1)
711               nreg_moves4e = (SCHED_TIME (e->dest->cuid)
712 			      - SCHED_TIME (e->src->cuid) + ii) / ii;
713 
714 	    /* If dest precedes src in the schedule of the kernel, then dest
715 	       will read before src writes and we can save one reg_copy.  */
716 	    if (SCHED_ROW (e->dest->cuid) == SCHED_ROW (e->src->cuid)
717 		&& SCHED_COLUMN (e->dest->cuid) < SCHED_COLUMN (e->src->cuid))
718 	      nreg_moves4e--;
719 
720             if (nreg_moves4e >= 1)
721 	      {
722 		/* !single_set instructions are not supported yet and
723 		   thus we do not except to encounter them in the loop
724 		   except from the doloop part.  For the latter case
725 		   we assume no regmoves are generated as the doloop
726 		   instructions are tied to the branch with an edge.  */
727 		gcc_assert (set);
728 		/* If the instruction contains auto-inc register then
729 		   validate that the regmov is being generated for the
730 		   target regsiter rather then the inc'ed register.	*/
731 		gcc_assert (!autoinc_var_is_used_p (u->insn, e->dest->insn));
732 	      }
733 
734 	    if (nreg_moves4e)
735 	      {
736 		gcc_assert (e->distance < 2);
737 		distances[e->distance] = true;
738 	      }
739 	    nreg_moves = MAX (nreg_moves, nreg_moves4e);
740 	  }
741 
742       if (nreg_moves == 0)
743 	continue;
744 
745       /* Create NREG_MOVES register moves.  */
746       first_move = ps->reg_moves.length ();
747       ps->reg_moves.safe_grow_cleared (first_move + nreg_moves);
748       extend_node_sched_params (ps);
749 
750       /* Record the moves associated with this node.  */
751       first_move += ps->g->num_nodes;
752 
753       /* Generate each move.  */
754       old_reg = prev_reg = SET_DEST (set);
755       if (HARD_REGISTER_P (old_reg))
756 	return false;
757 
758       for (i_reg_move = 0; i_reg_move < nreg_moves; i_reg_move++)
759 	{
760 	  ps_reg_move_info *move = ps_reg_move (ps, first_move + i_reg_move);
761 
762 	  move->def = i_reg_move > 0 ? first_move + i_reg_move - 1 : i;
763 	  move->uses = sbitmap_alloc (first_move + nreg_moves);
764 	  move->old_reg = old_reg;
765 	  move->new_reg = gen_reg_rtx (GET_MODE (prev_reg));
766 	  move->num_consecutive_stages = distances[0] && distances[1] ? 2 : 1;
767 	  move->insn = gen_move_insn (move->new_reg, copy_rtx (prev_reg));
768 	  bitmap_clear (move->uses);
769 
770 	  prev_reg = move->new_reg;
771 	}
772 
773       distance1_uses = distances[1] ? sbitmap_alloc (g->num_nodes) : NULL;
774 
775       if (distance1_uses)
776 	bitmap_clear (distance1_uses);
777 
778       /* Every use of the register defined by node may require a different
779 	 copy of this register, depending on the time the use is scheduled.
780 	 Record which uses require which move results.  */
781       for (e = u->out; e; e = e->next_out)
782 	if (e->type == TRUE_DEP && e->dest != e->src)
783 	  {
784 	    int dest_copy = (SCHED_TIME (e->dest->cuid)
785 			     - SCHED_TIME (e->src->cuid)) / ii;
786 
787 	    if (e->distance == 1)
788 	      dest_copy = (SCHED_TIME (e->dest->cuid)
789 			   - SCHED_TIME (e->src->cuid) + ii) / ii;
790 
791 	    if (SCHED_ROW (e->dest->cuid) == SCHED_ROW (e->src->cuid)
792 		&& SCHED_COLUMN (e->dest->cuid) < SCHED_COLUMN (e->src->cuid))
793 	      dest_copy--;
794 
795 	    if (dest_copy)
796 	      {
797 		ps_reg_move_info *move;
798 
799 		move = ps_reg_move (ps, first_move + dest_copy - 1);
800 		bitmap_set_bit (move->uses, e->dest->cuid);
801 		if (e->distance == 1)
802 		  bitmap_set_bit (distance1_uses, e->dest->cuid);
803 	      }
804 	  }
805 
806       auto_sbitmap must_follow (first_move + nreg_moves);
807       for (i_reg_move = 0; i_reg_move < nreg_moves; i_reg_move++)
808 	if (!schedule_reg_move (ps, first_move + i_reg_move,
809 				distance1_uses, must_follow))
810 	  break;
811       if (distance1_uses)
812 	sbitmap_free (distance1_uses);
813       if (i_reg_move < nreg_moves)
814 	return false;
815     }
816   return true;
817 }
818 
819 /* Emit the moves associated with PS.  Apply the substitutions
820    associated with them.  */
821 static void
apply_reg_moves(partial_schedule_ptr ps)822 apply_reg_moves (partial_schedule_ptr ps)
823 {
824   ps_reg_move_info *move;
825   int i;
826 
827   FOR_EACH_VEC_ELT (ps->reg_moves, i, move)
828     {
829       unsigned int i_use;
830       sbitmap_iterator sbi;
831 
832       EXECUTE_IF_SET_IN_BITMAP (move->uses, 0, i_use, sbi)
833 	{
834 	  replace_rtx (ps->g->nodes[i_use].insn, move->old_reg, move->new_reg);
835 	  df_insn_rescan (ps->g->nodes[i_use].insn);
836 	}
837     }
838 }
839 
840 /* Bump the SCHED_TIMEs of all nodes by AMOUNT.  Set the values of
841    SCHED_ROW and SCHED_STAGE.  Instruction scheduled on cycle AMOUNT
842    will move to cycle zero.  */
843 static void
reset_sched_times(partial_schedule_ptr ps,int amount)844 reset_sched_times (partial_schedule_ptr ps, int amount)
845 {
846   int row;
847   int ii = ps->ii;
848   ps_insn_ptr crr_insn;
849 
850   for (row = 0; row < ii; row++)
851     for (crr_insn = ps->rows[row]; crr_insn; crr_insn = crr_insn->next_in_row)
852       {
853 	int u = crr_insn->id;
854 	int normalized_time = SCHED_TIME (u) - amount;
855 	int new_min_cycle = PS_MIN_CYCLE (ps) - amount;
856 
857         if (dump_file)
858           {
859             /* Print the scheduling times after the rotation.  */
860 	    rtx_insn *insn = ps_rtl_insn (ps, u);
861 
862             fprintf (dump_file, "crr_insn->node=%d (insn id %d), "
863                      "crr_insn->cycle=%d, min_cycle=%d", u,
864                      INSN_UID (insn), normalized_time, new_min_cycle);
865             if (JUMP_P (insn))
866               fprintf (dump_file, " (branch)");
867             fprintf (dump_file, "\n");
868           }
869 
870 	gcc_assert (SCHED_TIME (u) >= ps->min_cycle);
871 	gcc_assert (SCHED_TIME (u) <= ps->max_cycle);
872 
873 	crr_insn->cycle = normalized_time;
874 	update_node_sched_params (u, ii, normalized_time, new_min_cycle);
875       }
876 }
877 
878 /* Permute the insns according to their order in PS, from row 0 to
879    row ii-1, and position them right before LAST.  This schedules
880    the insns of the loop kernel.  */
881 static void
permute_partial_schedule(partial_schedule_ptr ps,rtx_insn * last)882 permute_partial_schedule (partial_schedule_ptr ps, rtx_insn *last)
883 {
884   int ii = ps->ii;
885   int row;
886   ps_insn_ptr ps_ij;
887 
888   for (row = 0; row < ii ; row++)
889     for (ps_ij = ps->rows[row]; ps_ij; ps_ij = ps_ij->next_in_row)
890       {
891 	rtx_insn *insn = ps_rtl_insn (ps, ps_ij->id);
892 
893 	if (PREV_INSN (last) != insn)
894 	  {
895 	    if (ps_ij->id < ps->g->num_nodes)
896 	      reorder_insns_nobb (ps_first_note (ps, ps_ij->id), insn,
897 				  PREV_INSN (last));
898 	    else
899 	      add_insn_before (insn, last, NULL);
900 	  }
901       }
902 }
903 
904 /* Set bitmaps TMP_FOLLOW and TMP_PRECEDE to MUST_FOLLOW and MUST_PRECEDE
905    respectively only if cycle C falls on the border of the scheduling
906    window boundaries marked by START and END cycles.  STEP is the
907    direction of the window.  */
908 static inline void
set_must_precede_follow(sbitmap * tmp_follow,sbitmap must_follow,sbitmap * tmp_precede,sbitmap must_precede,int c,int start,int end,int step)909 set_must_precede_follow (sbitmap *tmp_follow, sbitmap must_follow,
910 			 sbitmap *tmp_precede, sbitmap must_precede, int c,
911 			 int start, int end, int step)
912 {
913   *tmp_precede = NULL;
914   *tmp_follow = NULL;
915 
916   if (c == start)
917     {
918       if (step == 1)
919 	*tmp_precede = must_precede;
920       else			/* step == -1.  */
921 	*tmp_follow = must_follow;
922     }
923   if (c == end - step)
924     {
925       if (step == 1)
926 	*tmp_follow = must_follow;
927       else			/* step == -1.  */
928 	*tmp_precede = must_precede;
929     }
930 
931 }
932 
933 /* Return True if the branch can be moved to row ii-1 while
934    normalizing the partial schedule PS to start from cycle zero and thus
935    optimize the SC.  Otherwise return False.  */
936 static bool
optimize_sc(partial_schedule_ptr ps,ddg_ptr g)937 optimize_sc (partial_schedule_ptr ps, ddg_ptr g)
938 {
939   int amount = PS_MIN_CYCLE (ps);
940   int start, end, step;
941   int ii = ps->ii;
942   bool ok = false;
943   int stage_count, stage_count_curr;
944 
945   /* Compare the SC after normalization and SC after bringing the branch
946      to row ii-1.  If they are equal just bail out.  */
947   stage_count = calculate_stage_count (ps, amount);
948   stage_count_curr =
949     calculate_stage_count (ps, SCHED_TIME (g->closing_branch->cuid) - (ii - 1));
950 
951   if (stage_count == stage_count_curr)
952     {
953       if (dump_file)
954 	fprintf (dump_file, "SMS SC already optimized.\n");
955 
956       return false;
957     }
958 
959   if (dump_file)
960     {
961       fprintf (dump_file, "SMS Trying to optimize branch location\n");
962       fprintf (dump_file, "SMS partial schedule before trial:\n");
963       print_partial_schedule (ps, dump_file);
964     }
965 
966   /* First, normalize the partial scheduling.  */
967   reset_sched_times (ps, amount);
968   rotate_partial_schedule (ps, amount);
969   if (dump_file)
970     {
971       fprintf (dump_file,
972 	       "SMS partial schedule after normalization (ii, %d, SC %d):\n",
973 	       ii, stage_count);
974       print_partial_schedule (ps, dump_file);
975     }
976 
977   if (SMODULO (SCHED_TIME (g->closing_branch->cuid), ii) == ii - 1)
978     return true;
979 
980   auto_sbitmap sched_nodes (g->num_nodes);
981   bitmap_ones (sched_nodes);
982 
983   /* Calculate the new placement of the branch.  It should be in row
984      ii-1 and fall into it's scheduling window.  */
985   if (get_sched_window (ps, g->closing_branch, sched_nodes, ii, &start,
986 			&step, &end) == 0)
987     {
988       bool success;
989       ps_insn_ptr next_ps_i;
990       int branch_cycle = SCHED_TIME (g->closing_branch->cuid);
991       int row = SMODULO (branch_cycle, ps->ii);
992       int num_splits = 0;
993       sbitmap tmp_precede, tmp_follow;
994       int min_cycle, c;
995 
996       if (dump_file)
997 	fprintf (dump_file, "\nTrying to schedule node %d "
998 		 "INSN = %d  in (%d .. %d) step %d\n",
999 		 g->closing_branch->cuid,
1000 		 (INSN_UID (g->closing_branch->insn)), start, end, step);
1001 
1002       gcc_assert ((step > 0 && start < end) || (step < 0 && start > end));
1003       if (step == 1)
1004 	{
1005 	  c = start + ii - SMODULO (start, ii) - 1;
1006 	  gcc_assert (c >= start);
1007 	  if (c >= end)
1008 	    {
1009 	      if (dump_file)
1010 		fprintf (dump_file,
1011 			 "SMS failed to schedule branch at cycle: %d\n", c);
1012 	      return false;
1013 	    }
1014 	}
1015       else
1016 	{
1017 	  c = start - SMODULO (start, ii) - 1;
1018 	  gcc_assert (c <= start);
1019 
1020 	  if (c <= end)
1021 	    {
1022 	      if (dump_file)
1023 		fprintf (dump_file,
1024 			 "SMS failed to schedule branch at cycle: %d\n", c);
1025 	      return false;
1026 	    }
1027 	}
1028 
1029       auto_sbitmap must_precede (g->num_nodes);
1030       auto_sbitmap must_follow (g->num_nodes);
1031 
1032       /* Try to schedule the branch is it's new cycle.  */
1033       calculate_must_precede_follow (g->closing_branch, start, end,
1034 				     step, ii, sched_nodes,
1035 				     must_precede, must_follow);
1036 
1037       set_must_precede_follow (&tmp_follow, must_follow, &tmp_precede,
1038 			       must_precede, c, start, end, step);
1039 
1040       /* Find the element in the partial schedule related to the closing
1041          branch so we can remove it from it's current cycle.  */
1042       for (next_ps_i = ps->rows[row];
1043 	   next_ps_i; next_ps_i = next_ps_i->next_in_row)
1044 	if (next_ps_i->id == g->closing_branch->cuid)
1045 	  break;
1046 
1047       min_cycle = PS_MIN_CYCLE (ps) - SMODULO (PS_MIN_CYCLE (ps), ps->ii);
1048       remove_node_from_ps (ps, next_ps_i);
1049       success =
1050 	try_scheduling_node_in_cycle (ps, g->closing_branch->cuid, c,
1051 				      sched_nodes, &num_splits,
1052 				      tmp_precede, tmp_follow);
1053       gcc_assert (num_splits == 0);
1054       if (!success)
1055 	{
1056 	  if (dump_file)
1057 	    fprintf (dump_file,
1058 		     "SMS failed to schedule branch at cycle: %d, "
1059 		     "bringing it back to cycle %d\n", c, branch_cycle);
1060 
1061 	  /* The branch was failed to be placed in row ii - 1.
1062 	     Put it back in it's original place in the partial
1063 	     schedualing.  */
1064 	  set_must_precede_follow (&tmp_follow, must_follow, &tmp_precede,
1065 				   must_precede, branch_cycle, start, end,
1066 				   step);
1067 	  success =
1068 	    try_scheduling_node_in_cycle (ps, g->closing_branch->cuid,
1069 					  branch_cycle, sched_nodes,
1070 					  &num_splits, tmp_precede,
1071 					  tmp_follow);
1072 	  gcc_assert (success && (num_splits == 0));
1073 	  ok = false;
1074 	}
1075       else
1076 	{
1077 	  /* The branch is placed in row ii - 1.  */
1078 	  if (dump_file)
1079 	    fprintf (dump_file,
1080 		     "SMS success in moving branch to cycle %d\n", c);
1081 
1082 	  update_node_sched_params (g->closing_branch->cuid, ii, c,
1083 				    PS_MIN_CYCLE (ps));
1084 	  ok = true;
1085 	}
1086 
1087       /* This might have been added to a new first stage.  */
1088       if (PS_MIN_CYCLE (ps) < min_cycle)
1089 	reset_sched_times (ps, 0);
1090     }
1091 
1092   return ok;
1093 }
1094 
1095 static void
duplicate_insns_of_cycles(partial_schedule_ptr ps,int from_stage,int to_stage,rtx count_reg)1096 duplicate_insns_of_cycles (partial_schedule_ptr ps, int from_stage,
1097 			   int to_stage, rtx count_reg)
1098 {
1099   int row;
1100   ps_insn_ptr ps_ij;
1101 
1102   for (row = 0; row < ps->ii; row++)
1103     for (ps_ij = ps->rows[row]; ps_ij; ps_ij = ps_ij->next_in_row)
1104       {
1105 	int u = ps_ij->id;
1106 	int first_u, last_u;
1107 	rtx_insn *u_insn;
1108 
1109         /* Do not duplicate any insn which refers to count_reg as it
1110            belongs to the control part.
1111            The closing branch is scheduled as well and thus should
1112            be ignored.
1113            TODO: This should be done by analyzing the control part of
1114            the loop.  */
1115 	u_insn = ps_rtl_insn (ps, u);
1116         if (reg_mentioned_p (count_reg, u_insn)
1117             || JUMP_P (u_insn))
1118           continue;
1119 
1120 	first_u = SCHED_STAGE (u);
1121 	last_u = first_u + ps_num_consecutive_stages (ps, u) - 1;
1122 	if (from_stage <= last_u && to_stage >= first_u)
1123 	  {
1124 	    if (u < ps->g->num_nodes)
1125 	      duplicate_insn_chain (ps_first_note (ps, u), u_insn);
1126 	    else
1127 	      emit_insn (copy_rtx (PATTERN (u_insn)));
1128 	  }
1129       }
1130 }
1131 
1132 
1133 /* Generate the instructions (including reg_moves) for prolog & epilog.  */
1134 static void
generate_prolog_epilog(partial_schedule_ptr ps,class loop * loop,rtx count_reg,bool adjust_init)1135 generate_prolog_epilog (partial_schedule_ptr ps, class loop *loop,
1136 			rtx count_reg, bool adjust_init)
1137 {
1138   int i;
1139   int last_stage = PS_STAGE_COUNT (ps) - 1;
1140   edge e;
1141 
1142   /* Generate the prolog, inserting its insns on the loop-entry edge.  */
1143   start_sequence ();
1144 
1145   if (adjust_init)
1146     {
1147       /* Generate instructions at the beginning of the prolog to
1148 	 adjust the loop count by STAGE_COUNT.  If loop count is constant
1149 	 and it not used anywhere in prologue, this constant is adjusted by
1150 	 STAGE_COUNT outside of generate_prolog_epilog function.  */
1151       rtx sub_reg = NULL_RTX;
1152 
1153       sub_reg = expand_simple_binop (GET_MODE (count_reg), MINUS, count_reg,
1154 				     gen_int_mode (last_stage,
1155 						   GET_MODE (count_reg)),
1156                                      count_reg, 1, OPTAB_DIRECT);
1157       gcc_assert (REG_P (sub_reg));
1158       if (REGNO (sub_reg) != REGNO (count_reg))
1159         emit_move_insn (count_reg, sub_reg);
1160     }
1161 
1162   for (i = 0; i < last_stage; i++)
1163     duplicate_insns_of_cycles (ps, 0, i, count_reg);
1164 
1165   /* Put the prolog on the entry edge.  */
1166   e = loop_preheader_edge (loop);
1167   split_edge_and_insert (e, get_insns ());
1168   if (!flag_resched_modulo_sched)
1169     e->dest->flags |= BB_DISABLE_SCHEDULE;
1170 
1171   end_sequence ();
1172 
1173   /* Generate the epilog, inserting its insns on the loop-exit edge.  */
1174   start_sequence ();
1175 
1176   for (i = 0; i < last_stage; i++)
1177     duplicate_insns_of_cycles (ps, i + 1, last_stage, count_reg);
1178 
1179   /* Put the epilogue on the exit edge.  */
1180   gcc_assert (single_exit (loop));
1181   e = single_exit (loop);
1182   split_edge_and_insert (e, get_insns ());
1183   if (!flag_resched_modulo_sched)
1184     e->dest->flags |= BB_DISABLE_SCHEDULE;
1185 
1186   end_sequence ();
1187 }
1188 
1189 /* Mark LOOP as software pipelined so the later
1190    scheduling passes don't touch it.  */
1191 static void
mark_loop_unsched(class loop * loop)1192 mark_loop_unsched (class loop *loop)
1193 {
1194   unsigned i;
1195   basic_block *bbs = get_loop_body (loop);
1196 
1197   for (i = 0; i < loop->num_nodes; i++)
1198     bbs[i]->flags |= BB_DISABLE_SCHEDULE;
1199 
1200   free (bbs);
1201 }
1202 
1203 /* Return true if all the BBs of the loop are empty except the
1204    loop header.  */
1205 static bool
loop_single_full_bb_p(class loop * loop)1206 loop_single_full_bb_p (class loop *loop)
1207 {
1208   unsigned i;
1209   basic_block *bbs = get_loop_body (loop);
1210 
1211   for (i = 0; i < loop->num_nodes ; i++)
1212     {
1213       rtx_insn *head, *tail;
1214       bool empty_bb = true;
1215 
1216       if (bbs[i] == loop->header)
1217         continue;
1218 
1219       /* Make sure that basic blocks other than the header
1220          have only notes labels or jumps.  */
1221       get_ebb_head_tail (bbs[i], bbs[i], &head, &tail);
1222       for (; head != NEXT_INSN (tail); head = NEXT_INSN (head))
1223         {
1224           if (NOTE_P (head) || LABEL_P (head)
1225  	      || (INSN_P (head) && (DEBUG_INSN_P (head) || JUMP_P (head))))
1226  	    continue;
1227  	  empty_bb = false;
1228  	  break;
1229         }
1230 
1231       if (! empty_bb)
1232         {
1233           free (bbs);
1234           return false;
1235         }
1236     }
1237   free (bbs);
1238   return true;
1239 }
1240 
1241 /* Dump file:line from INSN's location info to dump_file.  */
1242 
1243 static void
dump_insn_location(rtx_insn * insn)1244 dump_insn_location (rtx_insn *insn)
1245 {
1246   if (dump_file && INSN_HAS_LOCATION (insn))
1247     {
1248       expanded_location xloc = insn_location (insn);
1249       fprintf (dump_file, " %s:%i", xloc.file, xloc.line);
1250     }
1251 }
1252 
1253 /* A simple loop from SMS point of view; it is a loop that is composed of
1254    either a single basic block or two BBs - a header and a latch.  */
1255 #define SIMPLE_SMS_LOOP_P(loop) ((loop->num_nodes < 3 ) 		    \
1256 				  && (EDGE_COUNT (loop->latch->preds) == 1) \
1257                                   && (EDGE_COUNT (loop->latch->succs) == 1))
1258 
1259 /* Return true if the loop is in its canonical form and false if not.
1260    i.e. SIMPLE_SMS_LOOP_P and have one preheader block, and single exit.  */
1261 static bool
loop_canon_p(class loop * loop)1262 loop_canon_p (class loop *loop)
1263 {
1264 
1265   if (loop->inner || !loop_outer (loop))
1266   {
1267     if (dump_file)
1268       fprintf (dump_file, "SMS loop inner or !loop_outer\n");
1269     return false;
1270   }
1271 
1272   if (!single_exit (loop))
1273     {
1274       if (dump_file)
1275 	{
1276 	  rtx_insn *insn = BB_END (loop->header);
1277 
1278 	  fprintf (dump_file, "SMS loop many exits");
1279 	  dump_insn_location (insn);
1280 	  fprintf (dump_file, "\n");
1281 	}
1282       return false;
1283     }
1284 
1285   if (! SIMPLE_SMS_LOOP_P (loop) && ! loop_single_full_bb_p (loop))
1286     {
1287       if (dump_file)
1288 	{
1289 	  rtx_insn *insn = BB_END (loop->header);
1290 
1291 	  fprintf (dump_file, "SMS loop many BBs.");
1292 	  dump_insn_location (insn);
1293 	  fprintf (dump_file, "\n");
1294 	}
1295       return false;
1296     }
1297 
1298     return true;
1299 }
1300 
1301 /* If there are more than one entry for the loop,
1302    make it one by splitting the first entry edge and
1303    redirecting the others to the new BB.  */
1304 static void
canon_loop(class loop * loop)1305 canon_loop (class loop *loop)
1306 {
1307   edge e;
1308   edge_iterator i;
1309 
1310   /* Avoid annoying special cases of edges going to exit
1311      block.  */
1312   FOR_EACH_EDGE (e, i, EXIT_BLOCK_PTR_FOR_FN (cfun)->preds)
1313     if ((e->flags & EDGE_FALLTHRU) && (EDGE_COUNT (e->src->succs) > 1))
1314       split_edge (e);
1315 
1316   if (loop->latch == loop->header
1317       || EDGE_COUNT (loop->latch->succs) > 1)
1318     {
1319       FOR_EACH_EDGE (e, i, loop->header->preds)
1320         if (e->src == loop->latch)
1321           break;
1322       split_edge (e);
1323     }
1324 }
1325 
1326 /* Setup infos.  */
1327 static void
setup_sched_infos(void)1328 setup_sched_infos (void)
1329 {
1330   memcpy (&sms_common_sched_info, &haifa_common_sched_info,
1331 	  sizeof (sms_common_sched_info));
1332   sms_common_sched_info.sched_pass_id = SCHED_SMS_PASS;
1333   common_sched_info = &sms_common_sched_info;
1334 
1335   sched_deps_info = &sms_sched_deps_info;
1336   current_sched_info = &sms_sched_info;
1337 }
1338 
1339 /* Probability in % that the sms-ed loop rolls enough so that optimized
1340    version may be entered.  Just a guess.  */
1341 #define PROB_SMS_ENOUGH_ITERATIONS 80
1342 
1343 /* Main entry point, perform SMS scheduling on the loops of the function
1344    that consist of single basic blocks.  */
1345 static void
sms_schedule(void)1346 sms_schedule (void)
1347 {
1348   rtx_insn *insn;
1349   ddg_ptr *g_arr, g;
1350   int * node_order;
1351   int maxii, max_asap;
1352   partial_schedule_ptr ps;
1353   basic_block bb = NULL;
1354   class loop *loop;
1355   basic_block condition_bb = NULL;
1356   edge latch_edge;
1357   HOST_WIDE_INT trip_count, max_trip_count;
1358   HARD_REG_SET prohibited_regs;
1359 
1360   loop_optimizer_init (LOOPS_HAVE_PREHEADERS
1361 		       | LOOPS_HAVE_RECORDED_EXITS);
1362   if (number_of_loops (cfun) <= 1)
1363     {
1364       loop_optimizer_finalize ();
1365       return;  /* There are no loops to schedule.  */
1366     }
1367 
1368   /* Initialize issue_rate.  */
1369   if (targetm.sched.issue_rate)
1370     {
1371       int temp = reload_completed;
1372 
1373       reload_completed = 1;
1374       issue_rate = targetm.sched.issue_rate ();
1375       reload_completed = temp;
1376     }
1377   else
1378     issue_rate = 1;
1379 
1380   /* Initialize the scheduler.  */
1381   setup_sched_infos ();
1382   haifa_sched_init ();
1383 
1384   /* Allocate memory to hold the DDG array one entry for each loop.
1385      We use loop->num as index into this array.  */
1386   g_arr = XCNEWVEC (ddg_ptr, number_of_loops (cfun));
1387 
1388   REG_SET_TO_HARD_REG_SET (prohibited_regs, &df->regular_block_artificial_uses);
1389 
1390   if (dump_file)
1391   {
1392     fprintf (dump_file, "\n\nSMS analysis phase\n");
1393     fprintf (dump_file, "===================\n\n");
1394   }
1395 
1396   /* Build DDGs for all the relevant loops and hold them in G_ARR
1397      indexed by the loop index.  */
1398   FOR_EACH_LOOP (loop, 0)
1399     {
1400       rtx_insn *head, *tail;
1401       rtx count_reg;
1402 
1403       /* For debugging.  */
1404       if (dbg_cnt (sms_sched_loop) == false)
1405         {
1406           if (dump_file)
1407             fprintf (dump_file, "SMS reached max limit... \n");
1408 
1409 	  break;
1410         }
1411 
1412       if (dump_file)
1413 	{
1414 	  rtx_insn *insn = BB_END (loop->header);
1415 
1416 	  fprintf (dump_file, "SMS loop num: %d", loop->num);
1417 	  dump_insn_location (insn);
1418 	  fprintf (dump_file, "\n");
1419 	}
1420 
1421       if (! loop_canon_p (loop))
1422         continue;
1423 
1424       if (! loop_single_full_bb_p (loop))
1425       {
1426         if (dump_file)
1427           fprintf (dump_file, "SMS not loop_single_full_bb_p\n");
1428 	continue;
1429       }
1430 
1431       bb = loop->header;
1432 
1433       get_ebb_head_tail (bb, bb, &head, &tail);
1434       latch_edge = loop_latch_edge (loop);
1435       gcc_assert (single_exit (loop));
1436       trip_count = get_estimated_loop_iterations_int (loop);
1437       max_trip_count = get_max_loop_iterations_int (loop);
1438 
1439       /* Perform SMS only on loops that their average count is above threshold.  */
1440 
1441       if ( latch_edge->count () > profile_count::zero ()
1442           && (latch_edge->count()
1443 	      < single_exit (loop)->count ().apply_scale
1444 				 (param_sms_loop_average_count_threshold, 1)))
1445 	{
1446 	  if (dump_file)
1447 	    {
1448 	      dump_insn_location (tail);
1449 	      fprintf (dump_file, "\nSMS single-bb-loop\n");
1450 	      if (profile_info && flag_branch_probabilities)
1451 	    	{
1452 	      	  fprintf (dump_file, "SMS loop-count ");
1453 	      	  fprintf (dump_file, "%" PRId64,
1454 	             	   (int64_t) bb->count.to_gcov_type ());
1455 	      	  fprintf (dump_file, "\n");
1456                   fprintf (dump_file, "SMS trip-count ");
1457                   fprintf (dump_file, "%" PRId64 "max %" PRId64,
1458                            (int64_t) trip_count, (int64_t) max_trip_count);
1459                   fprintf (dump_file, "\n");
1460 	    	}
1461 	    }
1462           continue;
1463         }
1464 
1465       /* Make sure this is a doloop.  */
1466       if ( !(count_reg = doloop_register_get (head, tail)))
1467       {
1468         if (dump_file)
1469           fprintf (dump_file, "SMS doloop_register_get failed\n");
1470 	continue;
1471       }
1472 
1473       /* Don't handle BBs with calls or barriers
1474 	 or !single_set with the exception of do-loop control part insns.
1475          ??? Should handle insns defining subregs.  */
1476       for (insn = head; insn != NEXT_INSN (tail); insn = NEXT_INSN (insn))
1477 	{
1478 	  if (INSN_P (insn))
1479 	    {
1480 	      HARD_REG_SET regs;
1481 	      CLEAR_HARD_REG_SET (regs);
1482 	      note_stores (insn, record_hard_reg_sets, &regs);
1483 	      if (hard_reg_set_intersect_p (regs, prohibited_regs))
1484 		break;
1485 	    }
1486 
1487 	  if (CALL_P (insn)
1488 	      || BARRIER_P (insn)
1489 	      || (INSN_P (insn) && single_set (insn)
1490 		  && GET_CODE (SET_DEST (single_set (insn))) == SUBREG)
1491 	      /* Not a single set.  */
1492 	      || (NONDEBUG_INSN_P (insn) && !JUMP_P (insn)
1493 		  && !single_set (insn) && GET_CODE (PATTERN (insn)) != USE
1494 		  /* But non-single-set allowed in one special case.  */
1495 		  && (insn != prev_nondebug_insn (tail)
1496 		      || !reg_mentioned_p (count_reg, insn))))
1497 	    break;
1498 	}
1499 
1500       if (insn != NEXT_INSN (tail))
1501 	{
1502 	  if (dump_file)
1503 	    {
1504 	      if (CALL_P (insn))
1505 		fprintf (dump_file, "SMS loop-with-call\n");
1506 	      else if (BARRIER_P (insn))
1507 		fprintf (dump_file, "SMS loop-with-barrier\n");
1508 	      else if (INSN_P (insn) && single_set (insn)
1509 		       && GET_CODE (SET_DEST (single_set (insn))) == SUBREG)
1510 		fprintf (dump_file, "SMS loop with subreg in lhs\n");
1511 	      else
1512 		fprintf (dump_file,
1513 			 "SMS loop-with-not-single-set-or-prohibited-reg\n");
1514 
1515 	      print_rtl_single (dump_file, insn);
1516 	    }
1517 
1518 	  continue;
1519 	}
1520 
1521       /* Always schedule the closing branch with the rest of the
1522          instructions. The branch is rotated to be in row ii-1 at the
1523          end of the scheduling procedure to make sure it's the last
1524          instruction in the iteration.  */
1525       if (! (g = create_ddg (bb, 1)))
1526         {
1527           if (dump_file)
1528 	    fprintf (dump_file, "SMS create_ddg failed\n");
1529 	  continue;
1530         }
1531 
1532       g_arr[loop->num] = g;
1533       if (dump_file)
1534         fprintf (dump_file, "...OK\n");
1535 
1536     }
1537   if (dump_file)
1538   {
1539     fprintf (dump_file, "\nSMS transformation phase\n");
1540     fprintf (dump_file, "=========================\n\n");
1541   }
1542 
1543   /* We don't want to perform SMS on new loops - created by versioning.  */
1544   FOR_EACH_LOOP (loop, 0)
1545     {
1546       rtx_insn *head, *tail;
1547       rtx count_reg;
1548       rtx_insn *count_init;
1549       int mii, rec_mii, stage_count, min_cycle;
1550       int64_t loop_count = 0;
1551       bool opt_sc_p, adjust_inplace = false;
1552       basic_block pre_header;
1553 
1554       if (! (g = g_arr[loop->num]))
1555         continue;
1556 
1557       if (dump_file)
1558 	{
1559 	  rtx_insn *insn = BB_END (loop->header);
1560 
1561 	  fprintf (dump_file, "SMS loop num: %d", loop->num);
1562 	  dump_insn_location (insn);
1563 	  fprintf (dump_file, "\n");
1564 
1565 	  print_ddg (dump_file, g);
1566 	}
1567 
1568       get_ebb_head_tail (loop->header, loop->header, &head, &tail);
1569 
1570       latch_edge = loop_latch_edge (loop);
1571       gcc_assert (single_exit (loop));
1572       trip_count = get_estimated_loop_iterations_int (loop);
1573       max_trip_count = get_max_loop_iterations_int (loop);
1574 
1575       if (dump_file)
1576 	{
1577 	  dump_insn_location (tail);
1578 	  fprintf (dump_file, "\nSMS single-bb-loop\n");
1579 	  if (profile_info && flag_branch_probabilities)
1580 	    {
1581 	      fprintf (dump_file, "SMS loop-count ");
1582 	      fprintf (dump_file, "%" PRId64,
1583 	               (int64_t) bb->count.to_gcov_type ());
1584 	      fprintf (dump_file, "\n");
1585 	    }
1586 	  fprintf (dump_file, "SMS doloop\n");
1587 	  fprintf (dump_file, "SMS built-ddg %d\n", g->num_nodes);
1588           fprintf (dump_file, "SMS num-loads %d\n", g->num_loads);
1589           fprintf (dump_file, "SMS num-stores %d\n", g->num_stores);
1590 	}
1591 
1592 
1593       count_reg = doloop_register_get (head, tail);
1594       gcc_assert (count_reg);
1595 
1596       pre_header = loop_preheader_edge (loop)->src;
1597       count_init = const_iteration_count (count_reg, pre_header, &loop_count,
1598 					  &adjust_inplace);
1599 
1600       if (dump_file && count_init)
1601         {
1602           fprintf (dump_file, "SMS const-doloop ");
1603           fprintf (dump_file, "%" PRId64,
1604 		     loop_count);
1605           fprintf (dump_file, "\n");
1606         }
1607 
1608       node_order = XNEWVEC (int, g->num_nodes);
1609 
1610       mii = 1; /* Need to pass some estimate of mii.  */
1611       rec_mii = sms_order_nodes (g, mii, node_order, &max_asap);
1612       mii = MAX (res_MII (g), rec_mii);
1613       mii = MAX (mii, 1);
1614       maxii = MAX (max_asap, param_sms_max_ii_factor * mii);
1615 
1616       if (dump_file)
1617 	fprintf (dump_file, "SMS iis %d %d %d (rec_mii, mii, maxii)\n",
1618 		 rec_mii, mii, maxii);
1619 
1620       for (;;)
1621 	{
1622 	  set_node_sched_params (g);
1623 
1624 	  stage_count = 0;
1625 	  opt_sc_p = false;
1626 	  ps = sms_schedule_by_order (g, mii, maxii, node_order);
1627 
1628 	  if (ps)
1629 	    {
1630 	      /* Try to achieve optimized SC by normalizing the partial
1631 		 schedule (having the cycles start from cycle zero).
1632 		 The branch location must be placed in row ii-1 in the
1633 		 final scheduling.	If failed, shift all instructions to
1634 		 position the branch in row ii-1.  */
1635 	      opt_sc_p = optimize_sc (ps, g);
1636 	      if (opt_sc_p)
1637 		stage_count = calculate_stage_count (ps, 0);
1638 	      else
1639 		{
1640 		  /* Bring the branch to cycle ii-1.  */
1641 		  int amount = (SCHED_TIME (g->closing_branch->cuid)
1642 				- (ps->ii - 1));
1643 
1644 		  if (dump_file)
1645 		    fprintf (dump_file, "SMS schedule branch at cycle ii-1\n");
1646 
1647 		  stage_count = calculate_stage_count (ps, amount);
1648 		}
1649 
1650 	      gcc_assert (stage_count >= 1);
1651 	    }
1652 
1653 	  /* The default value of param_sms_min_sc is 2 as stage count of
1654 	     1 means that there is no interleaving between iterations thus
1655 	     we let the scheduling passes do the job in this case.  */
1656 	  if (stage_count < param_sms_min_sc
1657 	      || (count_init && (loop_count <= stage_count))
1658 	      || (max_trip_count >= 0 && max_trip_count <= stage_count)
1659 	      || (trip_count >= 0 && trip_count <= stage_count))
1660 	    {
1661 	      if (dump_file)
1662 		{
1663 		  fprintf (dump_file, "SMS failed... \n");
1664 		  fprintf (dump_file, "SMS sched-failed (stage-count=%d,"
1665 			   " loop-count=", stage_count);
1666 		  fprintf (dump_file, "%" PRId64, loop_count);
1667 		  fprintf (dump_file, ", trip-count=");
1668 		  fprintf (dump_file, "%" PRId64 "max %" PRId64,
1669 			   (int64_t) trip_count, (int64_t) max_trip_count);
1670 		  fprintf (dump_file, ")\n");
1671 		}
1672 	      break;
1673 	    }
1674 
1675           if (!opt_sc_p)
1676             {
1677 	      /* Rotate the partial schedule to have the branch in row ii-1.  */
1678               int amount = SCHED_TIME (g->closing_branch->cuid) - (ps->ii - 1);
1679 
1680               reset_sched_times (ps, amount);
1681               rotate_partial_schedule (ps, amount);
1682             }
1683 
1684 	  set_columns_for_ps (ps);
1685 
1686 	  min_cycle = PS_MIN_CYCLE (ps) - SMODULO (PS_MIN_CYCLE (ps), ps->ii);
1687 	  if (!schedule_reg_moves (ps))
1688 	    {
1689 	      mii = ps->ii + 1;
1690 	      free_partial_schedule (ps);
1691 	      continue;
1692 	    }
1693 
1694 	  /* Moves that handle incoming values might have been added
1695 	     to a new first stage.  Bump the stage count if so.
1696 
1697 	     ??? Perhaps we could consider rotating the schedule here
1698 	     instead?  */
1699 	  if (PS_MIN_CYCLE (ps) < min_cycle)
1700 	    {
1701 	      reset_sched_times (ps, 0);
1702 	      stage_count++;
1703 	    }
1704 
1705 	  /* The stage count should now be correct without rotation.  */
1706 	  gcc_checking_assert (stage_count == calculate_stage_count (ps, 0));
1707 	  PS_STAGE_COUNT (ps) = stage_count;
1708 
1709 	  canon_loop (loop);
1710 
1711           if (dump_file)
1712             {
1713 	      dump_insn_location (tail);
1714 	      fprintf (dump_file, " SMS succeeded %d %d (with ii, sc)\n",
1715 		       ps->ii, stage_count);
1716 	      print_partial_schedule (ps, dump_file);
1717 	    }
1718 
1719 	  if (count_init)
1720 	    {
1721 	       if (adjust_inplace)
1722 		{
1723 		  /* When possible, set new iteration count of loop kernel in
1724 		     place.  Otherwise, generate_prolog_epilog creates an insn
1725 		     to adjust.  */
1726 		  SET_SRC (single_set (count_init)) = GEN_INT (loop_count
1727 							    - stage_count + 1);
1728 		}
1729 	    }
1730 	  else
1731             {
1732 	      /* case the BCT count is not known , Do loop-versioning */
1733 	      rtx comp_rtx = gen_rtx_GT (VOIDmode, count_reg,
1734 					 gen_int_mode (stage_count,
1735 						       GET_MODE (count_reg)));
1736 	      profile_probability prob = profile_probability::guessed_always ()
1737 				.apply_scale (PROB_SMS_ENOUGH_ITERATIONS, 100);
1738 
1739 	      loop_version (loop, comp_rtx, &condition_bb,
1740 	  		    prob, prob.invert (),
1741 			    prob, prob.invert (), true);
1742 	    }
1743 
1744 	  /* Now apply the scheduled kernel to the RTL of the loop.  */
1745 	  permute_partial_schedule (ps, g->closing_branch->first_note);
1746 
1747           /* Mark this loop as software pipelined so the later
1748 	     scheduling passes don't touch it.  */
1749 	  if (! flag_resched_modulo_sched)
1750 	    mark_loop_unsched (loop);
1751 
1752 	  /* The life-info is not valid any more.  */
1753 	  df_set_bb_dirty (g->bb);
1754 
1755 	  apply_reg_moves (ps);
1756 	  if (dump_file)
1757 	    print_node_sched_params (dump_file, g->num_nodes, ps);
1758 	  /* Generate prolog and epilog.  */
1759 	  generate_prolog_epilog (ps, loop, count_reg, !adjust_inplace);
1760 	  break;
1761 	}
1762 
1763       free_partial_schedule (ps);
1764       node_sched_param_vec.release ();
1765       free (node_order);
1766       free_ddg (g);
1767     }
1768 
1769   free (g_arr);
1770 
1771   /* Release scheduler data, needed until now because of DFA.  */
1772   haifa_sched_finish ();
1773   loop_optimizer_finalize ();
1774 }
1775 
1776 /* The SMS scheduling algorithm itself
1777    -----------------------------------
1778    Input: 'O' an ordered list of insns of a loop.
1779    Output: A scheduling of the loop - kernel, prolog, and epilogue.
1780 
1781    'Q' is the empty Set
1782    'PS' is the partial schedule; it holds the currently scheduled nodes with
1783 	their cycle/slot.
1784    'PSP' previously scheduled predecessors.
1785    'PSS' previously scheduled successors.
1786    't(u)' the cycle where u is scheduled.
1787    'l(u)' is the latency of u.
1788    'd(v,u)' is the dependence distance from v to u.
1789    'ASAP(u)' the earliest time at which u could be scheduled as computed in
1790 	     the node ordering phase.
1791    'check_hardware_resources_conflicts(u, PS, c)'
1792 			     run a trace around cycle/slot through DFA model
1793 			     to check resource conflicts involving instruction u
1794 			     at cycle c given the partial schedule PS.
1795    'add_to_partial_schedule_at_time(u, PS, c)'
1796 			     Add the node/instruction u to the partial schedule
1797 			     PS at time c.
1798    'calculate_register_pressure(PS)'
1799 			     Given a schedule of instructions, calculate the register
1800 			     pressure it implies.  One implementation could be the
1801 			     maximum number of overlapping live ranges.
1802    'maxRP' The maximum allowed register pressure, it is usually derived from the number
1803 	   registers available in the hardware.
1804 
1805    1. II = MII.
1806    2. PS = empty list
1807    3. for each node u in O in pre-computed order
1808    4.   if (PSP(u) != Q && PSS(u) == Q) then
1809    5.     Early_start(u) = max ( t(v) + l(v) - d(v,u)*II ) over all every v in PSP(u).
1810    6.     start = Early_start; end = Early_start + II - 1; step = 1
1811    11.  else if (PSP(u) == Q && PSS(u) != Q) then
1812    12.      Late_start(u) = min ( t(v) - l(v) + d(v,u)*II ) over all every v in PSS(u).
1813    13.     start = Late_start; end = Late_start - II + 1; step = -1
1814    14.  else if (PSP(u) != Q && PSS(u) != Q) then
1815    15.     Early_start(u) = max ( t(v) + l(v) - d(v,u)*II ) over all every v in PSP(u).
1816    16.     Late_start(u) = min ( t(v) - l(v) + d(v,u)*II ) over all every v in PSS(u).
1817    17.     start = Early_start;
1818    18.     end = min(Early_start + II - 1 , Late_start);
1819    19.     step = 1
1820    20.     else "if (PSP(u) == Q && PSS(u) == Q)"
1821    21.	  start = ASAP(u); end = start + II - 1; step = 1
1822    22.  endif
1823 
1824    23.  success = false
1825    24.  for (c = start ; c != end ; c += step)
1826    25.     if check_hardware_resources_conflicts(u, PS, c) then
1827    26.       add_to_partial_schedule_at_time(u, PS, c)
1828    27.       success = true
1829    28.       break
1830    29.     endif
1831    30.  endfor
1832    31.  if (success == false) then
1833    32.    II = II + 1
1834    33.    if (II > maxII) then
1835    34.       finish - failed to schedule
1836    35.	 endif
1837    36.    goto 2.
1838    37.  endif
1839    38. endfor
1840    39. if (calculate_register_pressure(PS) > maxRP) then
1841    40.    goto 32.
1842    41. endif
1843    42. compute epilogue & prologue
1844    43. finish - succeeded to schedule
1845 
1846    ??? The algorithm restricts the scheduling window to II cycles.
1847    In rare cases, it may be better to allow windows of II+1 cycles.
1848    The window would then start and end on the same row, but with
1849    different "must precede" and "must follow" requirements.  */
1850 
1851 /* A threshold for the number of repeated unsuccessful attempts to insert
1852    an empty row, before we flush the partial schedule and start over.  */
1853 #define MAX_SPLIT_NUM 10
1854 /* Given the partial schedule PS, this function calculates and returns the
1855    cycles in which we can schedule the node with the given index I.
1856    NOTE: Here we do the backtracking in SMS, in some special cases. We have
1857    noticed that there are several cases in which we fail    to SMS the loop
1858    because the sched window of a node is empty    due to tight data-deps. In
1859    such cases we want to unschedule    some of the predecessors/successors
1860    until we get non-empty    scheduling window.  It returns -1 if the
1861    scheduling window is empty and zero otherwise.  */
1862 
1863 static int
get_sched_window(partial_schedule_ptr ps,ddg_node_ptr u_node,sbitmap sched_nodes,int ii,int * start_p,int * step_p,int * end_p)1864 get_sched_window (partial_schedule_ptr ps, ddg_node_ptr u_node,
1865 		  sbitmap sched_nodes, int ii, int *start_p, int *step_p,
1866 		  int *end_p)
1867 {
1868   int start, step, end;
1869   int early_start, late_start;
1870   ddg_edge_ptr e;
1871   auto_sbitmap psp (ps->g->num_nodes);
1872   auto_sbitmap pss (ps->g->num_nodes);
1873   sbitmap u_node_preds = NODE_PREDECESSORS (u_node);
1874   sbitmap u_node_succs = NODE_SUCCESSORS (u_node);
1875   int psp_not_empty;
1876   int pss_not_empty;
1877   int count_preds;
1878   int count_succs;
1879 
1880   /* 1. compute sched window for u (start, end, step).  */
1881   bitmap_clear (psp);
1882   bitmap_clear (pss);
1883   psp_not_empty = bitmap_and (psp, u_node_preds, sched_nodes);
1884   pss_not_empty = bitmap_and (pss, u_node_succs, sched_nodes);
1885 
1886   /* We first compute a forward range (start <= end), then decide whether
1887      to reverse it.  */
1888   early_start = INT_MIN;
1889   late_start = INT_MAX;
1890   start = INT_MIN;
1891   end = INT_MAX;
1892   step = 1;
1893 
1894   count_preds = 0;
1895   count_succs = 0;
1896 
1897   if (dump_file && (psp_not_empty || pss_not_empty))
1898     {
1899       fprintf (dump_file, "\nAnalyzing dependencies for node %d (INSN %d)"
1900 	       "; ii = %d\n\n", u_node->cuid, INSN_UID (u_node->insn), ii);
1901       fprintf (dump_file, "%11s %11s %11s %11s %5s\n",
1902 	       "start", "early start", "late start", "end", "time");
1903       fprintf (dump_file, "=========== =========== =========== ==========="
1904 	       " =====\n");
1905     }
1906   /* Calculate early_start and limit end.  Both bounds are inclusive.  */
1907   if (psp_not_empty)
1908     for (e = u_node->in; e != 0; e = e->next_in)
1909       {
1910 	int v = e->src->cuid;
1911 
1912 	if (bitmap_bit_p (sched_nodes, v))
1913 	  {
1914 	    int p_st = SCHED_TIME (v);
1915 	    int earliest = p_st + e->latency - (e->distance * ii);
1916 	    int latest = (e->data_type == MEM_DEP ? p_st + ii - 1 : INT_MAX);
1917 
1918 	    if (dump_file)
1919 	      {
1920 		fprintf (dump_file, "%11s %11d %11s %11d %5d",
1921 			 "", earliest, "", latest, p_st);
1922 		print_ddg_edge (dump_file, e);
1923 		fprintf (dump_file, "\n");
1924 	      }
1925 
1926 	    early_start = MAX (early_start, earliest);
1927 	    end = MIN (end, latest);
1928 
1929 	    if (e->type == TRUE_DEP && e->data_type == REG_DEP)
1930 	      count_preds++;
1931 	  }
1932       }
1933 
1934   /* Calculate late_start and limit start.  Both bounds are inclusive.  */
1935   if (pss_not_empty)
1936     for (e = u_node->out; e != 0; e = e->next_out)
1937       {
1938 	int v = e->dest->cuid;
1939 
1940 	if (bitmap_bit_p (sched_nodes, v))
1941 	  {
1942 	    int s_st = SCHED_TIME (v);
1943 	    int earliest = (e->data_type == MEM_DEP ? s_st - ii + 1 : INT_MIN);
1944 	    int latest = s_st - e->latency + (e->distance * ii);
1945 
1946 	    if (dump_file)
1947 	      {
1948 		fprintf (dump_file, "%11d %11s %11d %11s %5d",
1949 			 earliest, "", latest, "", s_st);
1950 		print_ddg_edge (dump_file, e);
1951 		fprintf (dump_file, "\n");
1952 	      }
1953 
1954 	    start = MAX (start, earliest);
1955 	    late_start = MIN (late_start, latest);
1956 
1957 	    if (e->type == TRUE_DEP && e->data_type == REG_DEP)
1958 	      count_succs++;
1959 	  }
1960       }
1961 
1962   if (dump_file && (psp_not_empty || pss_not_empty))
1963     {
1964       fprintf (dump_file, "----------- ----------- ----------- -----------"
1965 	       " -----\n");
1966       fprintf (dump_file, "%11d %11d %11d %11d %5s %s\n",
1967 	       start, early_start, late_start, end, "",
1968 	       "(max, max, min, min)");
1969     }
1970 
1971   /* Get a target scheduling window no bigger than ii.  */
1972   if (early_start == INT_MIN && late_start == INT_MAX)
1973     early_start = NODE_ASAP (u_node);
1974   else if (early_start == INT_MIN)
1975     early_start = late_start - (ii - 1);
1976   late_start = MIN (late_start, early_start + (ii - 1));
1977 
1978   /* Apply memory dependence limits.  */
1979   start = MAX (start, early_start);
1980   end = MIN (end, late_start);
1981 
1982   if (dump_file && (psp_not_empty || pss_not_empty))
1983     fprintf (dump_file, "%11s %11d %11d %11s %5s final window\n",
1984 	     "", start, end, "", "");
1985 
1986   /* If there are at least as many successors as predecessors, schedule the
1987      node close to its successors.  */
1988   if (pss_not_empty && count_succs >= count_preds)
1989     {
1990       std::swap (start, end);
1991       step = -1;
1992     }
1993 
1994   /* Now that we've finalized the window, make END an exclusive rather
1995      than an inclusive bound.  */
1996   end += step;
1997 
1998   *start_p = start;
1999   *step_p = step;
2000   *end_p = end;
2001 
2002   if ((start >= end && step == 1) || (start <= end && step == -1))
2003     {
2004       if (dump_file)
2005 	fprintf (dump_file, "\nEmpty window: start=%d, end=%d, step=%d\n",
2006 		 start, end, step);
2007       return -1;
2008     }
2009 
2010   return 0;
2011 }
2012 
2013 /* Calculate MUST_PRECEDE/MUST_FOLLOW bitmaps of U_NODE; which is the
2014    node currently been scheduled.  At the end of the calculation
2015    MUST_PRECEDE/MUST_FOLLOW contains all predecessors/successors of
2016    U_NODE which are (1) already scheduled in the first/last row of
2017    U_NODE's scheduling window, (2) whose dependence inequality with U
2018    becomes an equality when U is scheduled in this same row, and (3)
2019    whose dependence latency is zero.
2020 
2021    The first and last rows are calculated using the following parameters:
2022    START/END rows - The cycles that begins/ends the traversal on the window;
2023    searching for an empty cycle to schedule U_NODE.
2024    STEP - The direction in which we traverse the window.
2025    II - The initiation interval.  */
2026 
2027 static void
calculate_must_precede_follow(ddg_node_ptr u_node,int start,int end,int step,int ii,sbitmap sched_nodes,sbitmap must_precede,sbitmap must_follow)2028 calculate_must_precede_follow (ddg_node_ptr u_node, int start, int end,
2029 			       int step, int ii, sbitmap sched_nodes,
2030 			       sbitmap must_precede, sbitmap must_follow)
2031 {
2032   ddg_edge_ptr e;
2033   int first_cycle_in_window, last_cycle_in_window;
2034 
2035   gcc_assert (must_precede && must_follow);
2036 
2037   /* Consider the following scheduling window:
2038      {first_cycle_in_window, first_cycle_in_window+1, ...,
2039      last_cycle_in_window}.  If step is 1 then the following will be
2040      the order we traverse the window: {start=first_cycle_in_window,
2041      first_cycle_in_window+1, ..., end=last_cycle_in_window+1},
2042      or {start=last_cycle_in_window, last_cycle_in_window-1, ...,
2043      end=first_cycle_in_window-1} if step is -1.  */
2044   first_cycle_in_window = (step == 1) ? start : end - step;
2045   last_cycle_in_window = (step == 1) ? end - step : start;
2046 
2047   bitmap_clear (must_precede);
2048   bitmap_clear (must_follow);
2049 
2050   if (dump_file)
2051     fprintf (dump_file, "\nmust_precede: ");
2052 
2053   /* Instead of checking if:
2054       (SMODULO (SCHED_TIME (e->src), ii) == first_row_in_window)
2055       && ((SCHED_TIME (e->src) + e->latency - (e->distance * ii)) ==
2056              first_cycle_in_window)
2057       && e->latency == 0
2058      we use the fact that latency is non-negative:
2059       SCHED_TIME (e->src) - (e->distance * ii) <=
2060       SCHED_TIME (e->src) + e->latency - (e->distance * ii)) <=
2061       first_cycle_in_window
2062      and check only if
2063       SCHED_TIME (e->src) - (e->distance * ii) == first_cycle_in_window  */
2064   for (e = u_node->in; e != 0; e = e->next_in)
2065     if (bitmap_bit_p (sched_nodes, e->src->cuid)
2066 	&& ((SCHED_TIME (e->src->cuid) - (e->distance * ii)) ==
2067              first_cycle_in_window))
2068       {
2069 	if (dump_file)
2070 	  fprintf (dump_file, "%d ", e->src->cuid);
2071 
2072 	bitmap_set_bit (must_precede, e->src->cuid);
2073       }
2074 
2075   if (dump_file)
2076     fprintf (dump_file, "\nmust_follow: ");
2077 
2078   /* Instead of checking if:
2079       (SMODULO (SCHED_TIME (e->dest), ii) == last_row_in_window)
2080       && ((SCHED_TIME (e->dest) - e->latency + (e->distance * ii)) ==
2081              last_cycle_in_window)
2082       && e->latency == 0
2083      we use the fact that latency is non-negative:
2084       SCHED_TIME (e->dest) + (e->distance * ii) >=
2085       SCHED_TIME (e->dest) - e->latency + (e->distance * ii)) >=
2086       last_cycle_in_window
2087      and check only if
2088       SCHED_TIME (e->dest) + (e->distance * ii) == last_cycle_in_window  */
2089   for (e = u_node->out; e != 0; e = e->next_out)
2090     if (bitmap_bit_p (sched_nodes, e->dest->cuid)
2091 	&& ((SCHED_TIME (e->dest->cuid) + (e->distance * ii)) ==
2092              last_cycle_in_window))
2093       {
2094 	if (dump_file)
2095 	  fprintf (dump_file, "%d ", e->dest->cuid);
2096 
2097 	bitmap_set_bit (must_follow, e->dest->cuid);
2098       }
2099 
2100   if (dump_file)
2101     fprintf (dump_file, "\n");
2102 }
2103 
2104 /* Return 1 if U_NODE can be scheduled in CYCLE.  Use the following
2105    parameters to decide if that's possible:
2106    PS - The partial schedule.
2107    U - The serial number of U_NODE.
2108    NUM_SPLITS - The number of row splits made so far.
2109    MUST_PRECEDE - The nodes that must precede U_NODE. (only valid at
2110    the first row of the scheduling window)
2111    MUST_FOLLOW - The nodes that must follow U_NODE. (only valid at the
2112    last row of the scheduling window)  */
2113 
2114 static bool
try_scheduling_node_in_cycle(partial_schedule_ptr ps,int u,int cycle,sbitmap sched_nodes,int * num_splits,sbitmap must_precede,sbitmap must_follow)2115 try_scheduling_node_in_cycle (partial_schedule_ptr ps,
2116 			      int u, int cycle, sbitmap sched_nodes,
2117 			      int *num_splits, sbitmap must_precede,
2118 			      sbitmap must_follow)
2119 {
2120   ps_insn_ptr psi;
2121   bool success = 0;
2122 
2123   verify_partial_schedule (ps, sched_nodes);
2124   psi = ps_add_node_check_conflicts (ps, u, cycle, must_precede, must_follow);
2125   if (psi)
2126     {
2127       SCHED_TIME (u) = cycle;
2128       bitmap_set_bit (sched_nodes, u);
2129       success = 1;
2130       *num_splits = 0;
2131       if (dump_file)
2132 	fprintf (dump_file, "Scheduled w/o split in %d\n", cycle);
2133 
2134     }
2135 
2136   return success;
2137 }
2138 
2139 /* This function implements the scheduling algorithm for SMS according to the
2140    above algorithm.  */
2141 static partial_schedule_ptr
sms_schedule_by_order(ddg_ptr g,int mii,int maxii,int * nodes_order)2142 sms_schedule_by_order (ddg_ptr g, int mii, int maxii, int *nodes_order)
2143 {
2144   int ii = mii;
2145   int i, c, success, num_splits = 0;
2146   int flush_and_start_over = true;
2147   int num_nodes = g->num_nodes;
2148   int start, end, step; /* Place together into one struct?  */
2149   auto_sbitmap sched_nodes (num_nodes);
2150   auto_sbitmap must_precede (num_nodes);
2151   auto_sbitmap must_follow (num_nodes);
2152   auto_sbitmap tobe_scheduled (num_nodes);
2153 
2154   /* Value of param_sms_dfa_history is a limit on the number of cycles that
2155      resource conflicts can span.  ??? Should be provided by DFA, and be
2156      dependent on the type of insn scheduled.  Set to 0 by default to save
2157      compile time.  */
2158   partial_schedule_ptr ps = create_partial_schedule (ii, g,
2159 						     param_sms_dfa_history);
2160 
2161   bitmap_ones (tobe_scheduled);
2162   bitmap_clear (sched_nodes);
2163 
2164   while (flush_and_start_over && (ii < maxii))
2165     {
2166 
2167       if (dump_file)
2168 	fprintf (dump_file, "Starting with ii=%d\n", ii);
2169       flush_and_start_over = false;
2170       bitmap_clear (sched_nodes);
2171 
2172       for (i = 0; i < num_nodes; i++)
2173 	{
2174 	  int u = nodes_order[i];
2175   	  ddg_node_ptr u_node = &ps->g->nodes[u];
2176 	  rtx_insn *insn = u_node->insn;
2177 
2178 	  gcc_checking_assert (NONDEBUG_INSN_P (insn));
2179 
2180 	  if (bitmap_bit_p (sched_nodes, u))
2181 	    continue;
2182 
2183 	  /* Try to get non-empty scheduling window.  */
2184 	 success = 0;
2185          if (get_sched_window (ps, u_node, sched_nodes, ii, &start,
2186                                 &step, &end) == 0)
2187             {
2188               if (dump_file)
2189                 fprintf (dump_file, "\nTrying to schedule node %d "
2190 			 "INSN = %d  in (%d .. %d) step %d\n", u, (INSN_UID
2191                         (g->nodes[u].insn)), start, end, step);
2192 
2193               gcc_assert ((step > 0 && start < end)
2194                           || (step < 0 && start > end));
2195 
2196               calculate_must_precede_follow (u_node, start, end, step, ii,
2197                                              sched_nodes, must_precede,
2198                                              must_follow);
2199 
2200               for (c = start; c != end; c += step)
2201                 {
2202 		  sbitmap tmp_precede, tmp_follow;
2203 
2204                   set_must_precede_follow (&tmp_follow, must_follow,
2205 		                           &tmp_precede, must_precede,
2206                                            c, start, end, step);
2207                   success =
2208                     try_scheduling_node_in_cycle (ps, u, c,
2209                                                   sched_nodes,
2210                                                   &num_splits, tmp_precede,
2211                                                   tmp_follow);
2212                   if (success)
2213                     break;
2214                 }
2215 
2216               verify_partial_schedule (ps, sched_nodes);
2217             }
2218             if (!success)
2219             {
2220               int split_row;
2221 
2222               if (ii++ == maxii)
2223                 break;
2224 
2225               if (num_splits >= MAX_SPLIT_NUM)
2226                 {
2227                   num_splits = 0;
2228                   flush_and_start_over = true;
2229                   verify_partial_schedule (ps, sched_nodes);
2230                   reset_partial_schedule (ps, ii);
2231                   verify_partial_schedule (ps, sched_nodes);
2232                   break;
2233                 }
2234 
2235               num_splits++;
2236               /* The scheduling window is exclusive of 'end'
2237                  whereas compute_split_window() expects an inclusive,
2238                  ordered range.  */
2239               if (step == 1)
2240                 split_row = compute_split_row (sched_nodes, start, end - 1,
2241                                                ps->ii, u_node);
2242               else
2243                 split_row = compute_split_row (sched_nodes, end + 1, start,
2244                                                ps->ii, u_node);
2245 
2246               ps_insert_empty_row (ps, split_row, sched_nodes);
2247               i--;              /* Go back and retry node i.  */
2248 
2249               if (dump_file)
2250                 fprintf (dump_file, "num_splits=%d\n", num_splits);
2251             }
2252 
2253           /* ??? If (success), check register pressure estimates.  */
2254         }                       /* Continue with next node.  */
2255     }                           /* While flush_and_start_over.  */
2256   if (ii >= maxii)
2257     {
2258       free_partial_schedule (ps);
2259       ps = NULL;
2260     }
2261   else
2262     gcc_assert (bitmap_equal_p (tobe_scheduled, sched_nodes));
2263 
2264   return ps;
2265 }
2266 
2267 /* This function inserts a new empty row into PS at the position
2268    according to SPLITROW, keeping all already scheduled instructions
2269    intact and updating their SCHED_TIME and cycle accordingly.  */
2270 static void
ps_insert_empty_row(partial_schedule_ptr ps,int split_row,sbitmap sched_nodes)2271 ps_insert_empty_row (partial_schedule_ptr ps, int split_row,
2272 		     sbitmap sched_nodes)
2273 {
2274   ps_insn_ptr crr_insn;
2275   ps_insn_ptr *rows_new;
2276   int ii = ps->ii;
2277   int new_ii = ii + 1;
2278   int row;
2279   int *rows_length_new;
2280 
2281   verify_partial_schedule (ps, sched_nodes);
2282 
2283   /* We normalize sched_time and rotate ps to have only non-negative sched
2284      times, for simplicity of updating cycles after inserting new row.  */
2285   split_row -= ps->min_cycle;
2286   split_row = SMODULO (split_row, ii);
2287   if (dump_file)
2288     fprintf (dump_file, "split_row=%d\n", split_row);
2289 
2290   reset_sched_times (ps, PS_MIN_CYCLE (ps));
2291   rotate_partial_schedule (ps, PS_MIN_CYCLE (ps));
2292 
2293   rows_new = (ps_insn_ptr *) xcalloc (new_ii, sizeof (ps_insn_ptr));
2294   rows_length_new = (int *) xcalloc (new_ii, sizeof (int));
2295   for (row = 0; row < split_row; row++)
2296     {
2297       rows_new[row] = ps->rows[row];
2298       rows_length_new[row] = ps->rows_length[row];
2299       ps->rows[row] = NULL;
2300       for (crr_insn = rows_new[row];
2301 	   crr_insn; crr_insn = crr_insn->next_in_row)
2302 	{
2303 	  int u = crr_insn->id;
2304 	  int new_time = SCHED_TIME (u) + (SCHED_TIME (u) / ii);
2305 
2306 	  SCHED_TIME (u) = new_time;
2307 	  crr_insn->cycle = new_time;
2308 	  SCHED_ROW (u) = new_time % new_ii;
2309 	  SCHED_STAGE (u) = new_time / new_ii;
2310 	}
2311 
2312     }
2313 
2314   rows_new[split_row] = NULL;
2315 
2316   for (row = split_row; row < ii; row++)
2317     {
2318       rows_new[row + 1] = ps->rows[row];
2319       rows_length_new[row + 1] = ps->rows_length[row];
2320       ps->rows[row] = NULL;
2321       for (crr_insn = rows_new[row + 1];
2322 	   crr_insn; crr_insn = crr_insn->next_in_row)
2323 	{
2324 	  int u = crr_insn->id;
2325 	  int new_time = SCHED_TIME (u) + (SCHED_TIME (u) / ii) + 1;
2326 
2327 	  SCHED_TIME (u) = new_time;
2328 	  crr_insn->cycle = new_time;
2329 	  SCHED_ROW (u) = new_time % new_ii;
2330 	  SCHED_STAGE (u) = new_time / new_ii;
2331 	}
2332     }
2333 
2334   /* Updating ps.  */
2335   ps->min_cycle = ps->min_cycle + ps->min_cycle / ii
2336     + (SMODULO (ps->min_cycle, ii) >= split_row ? 1 : 0);
2337   ps->max_cycle = ps->max_cycle + ps->max_cycle / ii
2338     + (SMODULO (ps->max_cycle, ii) >= split_row ? 1 : 0);
2339   free (ps->rows);
2340   ps->rows = rows_new;
2341   free (ps->rows_length);
2342   ps->rows_length = rows_length_new;
2343   ps->ii = new_ii;
2344   gcc_assert (ps->min_cycle >= 0);
2345 
2346   verify_partial_schedule (ps, sched_nodes);
2347 
2348   if (dump_file)
2349     fprintf (dump_file, "min_cycle=%d, max_cycle=%d\n", ps->min_cycle,
2350 	     ps->max_cycle);
2351 }
2352 
2353 /* Given U_NODE which is the node that failed to be scheduled; LOW and
2354    UP which are the boundaries of it's scheduling window; compute using
2355    SCHED_NODES and II a row in the partial schedule that can be split
2356    which will separate a critical predecessor from a critical successor
2357    thereby expanding the window, and return it.  */
2358 static int
compute_split_row(sbitmap sched_nodes,int low,int up,int ii,ddg_node_ptr u_node)2359 compute_split_row (sbitmap sched_nodes, int low, int up, int ii,
2360 		   ddg_node_ptr u_node)
2361 {
2362   ddg_edge_ptr e;
2363   int lower = INT_MIN, upper = INT_MAX;
2364   int crit_pred = -1;
2365   int crit_succ = -1;
2366   int crit_cycle;
2367 
2368   for (e = u_node->in; e != 0; e = e->next_in)
2369     {
2370       int v = e->src->cuid;
2371 
2372       if (bitmap_bit_p (sched_nodes, v)
2373 	  && (low == SCHED_TIME (v) + e->latency - (e->distance * ii)))
2374 	if (SCHED_TIME (v) > lower)
2375 	  {
2376 	    crit_pred = v;
2377 	    lower = SCHED_TIME (v);
2378 	  }
2379     }
2380 
2381   if (crit_pred >= 0)
2382     {
2383       crit_cycle = SCHED_TIME (crit_pred) + 1;
2384       return SMODULO (crit_cycle, ii);
2385     }
2386 
2387   for (e = u_node->out; e != 0; e = e->next_out)
2388     {
2389       int v = e->dest->cuid;
2390 
2391       if (bitmap_bit_p (sched_nodes, v)
2392 	  && (up == SCHED_TIME (v) - e->latency + (e->distance * ii)))
2393 	if (SCHED_TIME (v) < upper)
2394 	  {
2395 	    crit_succ = v;
2396 	    upper = SCHED_TIME (v);
2397 	  }
2398     }
2399 
2400   if (crit_succ >= 0)
2401     {
2402       crit_cycle = SCHED_TIME (crit_succ);
2403       return SMODULO (crit_cycle, ii);
2404     }
2405 
2406   if (dump_file)
2407     fprintf (dump_file, "Both crit_pred and crit_succ are NULL\n");
2408 
2409   return SMODULO ((low + up + 1) / 2, ii);
2410 }
2411 
2412 static void
verify_partial_schedule(partial_schedule_ptr ps,sbitmap sched_nodes)2413 verify_partial_schedule (partial_schedule_ptr ps, sbitmap sched_nodes)
2414 {
2415   int row;
2416   ps_insn_ptr crr_insn;
2417 
2418   for (row = 0; row < ps->ii; row++)
2419     {
2420       int length = 0;
2421 
2422       for (crr_insn = ps->rows[row]; crr_insn; crr_insn = crr_insn->next_in_row)
2423 	{
2424 	  int u = crr_insn->id;
2425 
2426 	  length++;
2427 	  gcc_assert (bitmap_bit_p (sched_nodes, u));
2428 	  /* ??? Test also that all nodes of sched_nodes are in ps, perhaps by
2429 	     popcount (sched_nodes) == number of insns in ps.  */
2430 	  gcc_assert (SCHED_TIME (u) >= ps->min_cycle);
2431 	  gcc_assert (SCHED_TIME (u) <= ps->max_cycle);
2432 	}
2433 
2434       gcc_assert (ps->rows_length[row] == length);
2435     }
2436 }
2437 
2438 
2439 /* This page implements the algorithm for ordering the nodes of a DDG
2440    for modulo scheduling, activated through the
2441    "int sms_order_nodes (ddg_ptr, int mii, int * result)" API.  */
2442 
2443 #define ORDER_PARAMS(x) ((struct node_order_params *) (x)->aux.info)
2444 #define ASAP(x) (ORDER_PARAMS ((x))->asap)
2445 #define ALAP(x) (ORDER_PARAMS ((x))->alap)
2446 #define HEIGHT(x) (ORDER_PARAMS ((x))->height)
2447 #define MOB(x) (ALAP ((x)) - ASAP ((x)))
2448 #define DEPTH(x) (ASAP ((x)))
2449 
2450 typedef struct node_order_params * nopa;
2451 
2452 static void order_nodes_of_sccs (ddg_all_sccs_ptr, int * result);
2453 static int order_nodes_in_scc (ddg_ptr, sbitmap, sbitmap, int*, int);
2454 static nopa  calculate_order_params (ddg_ptr, int, int *);
2455 static int find_max_asap (ddg_ptr, sbitmap);
2456 static int find_max_hv_min_mob (ddg_ptr, sbitmap);
2457 static int find_max_dv_min_mob (ddg_ptr, sbitmap);
2458 
2459 enum sms_direction {BOTTOMUP, TOPDOWN};
2460 
2461 struct node_order_params
2462 {
2463   int asap;
2464   int alap;
2465   int height;
2466 };
2467 
2468 /* Check if NODE_ORDER contains a permutation of 0 .. NUM_NODES-1.  */
2469 static void
check_nodes_order(int * node_order,int num_nodes)2470 check_nodes_order (int *node_order, int num_nodes)
2471 {
2472   int i;
2473   auto_sbitmap tmp (num_nodes);
2474 
2475   bitmap_clear (tmp);
2476 
2477   if (dump_file)
2478     fprintf (dump_file, "SMS final nodes order: \n");
2479 
2480   for (i = 0; i < num_nodes; i++)
2481     {
2482       int u = node_order[i];
2483 
2484       if (dump_file)
2485         fprintf (dump_file, "%d ", u);
2486       gcc_assert (u < num_nodes && u >= 0 && !bitmap_bit_p (tmp, u));
2487 
2488       bitmap_set_bit (tmp, u);
2489     }
2490 
2491   if (dump_file)
2492     fprintf (dump_file, "\n");
2493 }
2494 
2495 /* Order the nodes of G for scheduling and pass the result in
2496    NODE_ORDER.  Also set aux.count of each node to ASAP.
2497    Put maximal ASAP to PMAX_ASAP.  Return the recMII for the given DDG.  */
2498 static int
sms_order_nodes(ddg_ptr g,int mii,int * node_order,int * pmax_asap)2499 sms_order_nodes (ddg_ptr g, int mii, int * node_order, int *pmax_asap)
2500 {
2501   int i;
2502   int rec_mii = 0;
2503   ddg_all_sccs_ptr sccs = create_ddg_all_sccs (g);
2504 
2505   nopa nops = calculate_order_params (g, mii, pmax_asap);
2506 
2507   if (dump_file)
2508     print_sccs (dump_file, sccs, g);
2509 
2510   order_nodes_of_sccs (sccs, node_order);
2511 
2512   if (sccs->num_sccs > 0)
2513     /* First SCC has the largest recurrence_length.  */
2514     rec_mii = sccs->sccs[0]->recurrence_length;
2515 
2516   /* Save ASAP before destroying node_order_params.  */
2517   for (i = 0; i < g->num_nodes; i++)
2518     {
2519       ddg_node_ptr v = &g->nodes[i];
2520       v->aux.count = ASAP (v);
2521     }
2522 
2523   free (nops);
2524   free_ddg_all_sccs (sccs);
2525   check_nodes_order (node_order, g->num_nodes);
2526 
2527   return rec_mii;
2528 }
2529 
2530 static void
order_nodes_of_sccs(ddg_all_sccs_ptr all_sccs,int * node_order)2531 order_nodes_of_sccs (ddg_all_sccs_ptr all_sccs, int * node_order)
2532 {
2533   int i, pos = 0;
2534   ddg_ptr g = all_sccs->ddg;
2535   int num_nodes = g->num_nodes;
2536   auto_sbitmap prev_sccs (num_nodes);
2537   auto_sbitmap on_path (num_nodes);
2538   auto_sbitmap tmp (num_nodes);
2539   auto_sbitmap ones (num_nodes);
2540 
2541   bitmap_clear (prev_sccs);
2542   bitmap_ones (ones);
2543 
2544   /* Perform the node ordering starting from the SCC with the highest recMII.
2545      For each SCC order the nodes according to their ASAP/ALAP/HEIGHT etc.  */
2546   for (i = 0; i < all_sccs->num_sccs; i++)
2547     {
2548       ddg_scc_ptr scc = all_sccs->sccs[i];
2549 
2550       /* Add nodes on paths from previous SCCs to the current SCC.  */
2551       find_nodes_on_paths (on_path, g, prev_sccs, scc->nodes);
2552       bitmap_ior (tmp, scc->nodes, on_path);
2553 
2554       /* Add nodes on paths from the current SCC to previous SCCs.  */
2555       find_nodes_on_paths (on_path, g, scc->nodes, prev_sccs);
2556       bitmap_ior (tmp, tmp, on_path);
2557 
2558       /* Remove nodes of previous SCCs from current extended SCC.  */
2559       bitmap_and_compl (tmp, tmp, prev_sccs);
2560 
2561       pos = order_nodes_in_scc (g, prev_sccs, tmp, node_order, pos);
2562       /* Above call to order_nodes_in_scc updated prev_sccs |= tmp.  */
2563     }
2564 
2565   /* Handle the remaining nodes that do not belong to any scc.  Each call
2566      to order_nodes_in_scc handles a single connected component.  */
2567   while (pos < g->num_nodes)
2568     {
2569       bitmap_and_compl (tmp, ones, prev_sccs);
2570       pos = order_nodes_in_scc (g, prev_sccs, tmp, node_order, pos);
2571     }
2572 }
2573 
2574 /* MII is needed if we consider backarcs (that do not close recursive cycles).  */
2575 static struct node_order_params *
calculate_order_params(ddg_ptr g,int mii ATTRIBUTE_UNUSED,int * pmax_asap)2576 calculate_order_params (ddg_ptr g, int mii ATTRIBUTE_UNUSED, int *pmax_asap)
2577 {
2578   int u;
2579   int max_asap;
2580   int num_nodes = g->num_nodes;
2581   ddg_edge_ptr e;
2582   /* Allocate a place to hold ordering params for each node in the DDG.  */
2583   nopa node_order_params_arr;
2584 
2585   /* Initialize of ASAP/ALAP/HEIGHT to zero.  */
2586   node_order_params_arr = (nopa) xcalloc (num_nodes,
2587 					  sizeof (struct node_order_params));
2588 
2589   /* Set the aux pointer of each node to point to its order_params structure.  */
2590   for (u = 0; u < num_nodes; u++)
2591     g->nodes[u].aux.info = &node_order_params_arr[u];
2592 
2593   /* Disregarding a backarc from each recursive cycle to obtain a DAG,
2594      calculate ASAP, ALAP, mobility, distance, and height for each node
2595      in the dependence (direct acyclic) graph.  */
2596 
2597   /* We assume that the nodes in the array are in topological order.  */
2598 
2599   max_asap = 0;
2600   for (u = 0; u < num_nodes; u++)
2601     {
2602       ddg_node_ptr u_node = &g->nodes[u];
2603 
2604       ASAP (u_node) = 0;
2605       for (e = u_node->in; e; e = e->next_in)
2606 	if (e->distance == 0)
2607 	  ASAP (u_node) = MAX (ASAP (u_node),
2608 			       ASAP (e->src) + e->latency);
2609       max_asap = MAX (max_asap, ASAP (u_node));
2610     }
2611 
2612   for (u = num_nodes - 1; u > -1; u--)
2613     {
2614       ddg_node_ptr u_node = &g->nodes[u];
2615 
2616       ALAP (u_node) = max_asap;
2617       HEIGHT (u_node) = 0;
2618       for (e = u_node->out; e; e = e->next_out)
2619 	if (e->distance == 0)
2620 	  {
2621 	    ALAP (u_node) = MIN (ALAP (u_node),
2622 				 ALAP (e->dest) - e->latency);
2623 	    HEIGHT (u_node) = MAX (HEIGHT (u_node),
2624 				   HEIGHT (e->dest) + e->latency);
2625 	  }
2626     }
2627   if (dump_file)
2628   {
2629     fprintf (dump_file, "\nOrder params\n");
2630     for (u = 0; u < num_nodes; u++)
2631       {
2632         ddg_node_ptr u_node = &g->nodes[u];
2633 
2634         fprintf (dump_file, "node %d, ASAP: %d, ALAP: %d, HEIGHT: %d\n", u,
2635                  ASAP (u_node), ALAP (u_node), HEIGHT (u_node));
2636       }
2637   }
2638 
2639   *pmax_asap = max_asap;
2640   return node_order_params_arr;
2641 }
2642 
2643 static int
find_max_asap(ddg_ptr g,sbitmap nodes)2644 find_max_asap (ddg_ptr g, sbitmap nodes)
2645 {
2646   unsigned int u = 0;
2647   int max_asap = -1;
2648   int result = -1;
2649   sbitmap_iterator sbi;
2650 
2651   EXECUTE_IF_SET_IN_BITMAP (nodes, 0, u, sbi)
2652     {
2653       ddg_node_ptr u_node = &g->nodes[u];
2654 
2655       if (max_asap < ASAP (u_node))
2656 	{
2657 	  max_asap = ASAP (u_node);
2658 	  result = u;
2659 	}
2660     }
2661   return result;
2662 }
2663 
2664 static int
find_max_hv_min_mob(ddg_ptr g,sbitmap nodes)2665 find_max_hv_min_mob (ddg_ptr g, sbitmap nodes)
2666 {
2667   unsigned int u = 0;
2668   int max_hv = -1;
2669   int min_mob = INT_MAX;
2670   int result = -1;
2671   sbitmap_iterator sbi;
2672 
2673   EXECUTE_IF_SET_IN_BITMAP (nodes, 0, u, sbi)
2674     {
2675       ddg_node_ptr u_node = &g->nodes[u];
2676 
2677       if (max_hv < HEIGHT (u_node))
2678 	{
2679 	  max_hv = HEIGHT (u_node);
2680 	  min_mob = MOB (u_node);
2681 	  result = u;
2682 	}
2683       else if ((max_hv == HEIGHT (u_node))
2684 	       && (min_mob > MOB (u_node)))
2685 	{
2686 	  min_mob = MOB (u_node);
2687 	  result = u;
2688 	}
2689     }
2690   return result;
2691 }
2692 
2693 static int
find_max_dv_min_mob(ddg_ptr g,sbitmap nodes)2694 find_max_dv_min_mob (ddg_ptr g, sbitmap nodes)
2695 {
2696   unsigned int u = 0;
2697   int max_dv = -1;
2698   int min_mob = INT_MAX;
2699   int result = -1;
2700   sbitmap_iterator sbi;
2701 
2702   EXECUTE_IF_SET_IN_BITMAP (nodes, 0, u, sbi)
2703     {
2704       ddg_node_ptr u_node = &g->nodes[u];
2705 
2706       if (max_dv < DEPTH (u_node))
2707 	{
2708 	  max_dv = DEPTH (u_node);
2709 	  min_mob = MOB (u_node);
2710 	  result = u;
2711 	}
2712       else if ((max_dv == DEPTH (u_node))
2713 	       && (min_mob > MOB (u_node)))
2714 	{
2715 	  min_mob = MOB (u_node);
2716 	  result = u;
2717 	}
2718     }
2719   return result;
2720 }
2721 
2722 /* Places the nodes of SCC into the NODE_ORDER array starting
2723    at position POS, according to the SMS ordering algorithm.
2724    NODES_ORDERED (in&out parameter) holds the bitset of all nodes in
2725    the NODE_ORDER array, starting from position zero.  */
2726 static int
order_nodes_in_scc(ddg_ptr g,sbitmap nodes_ordered,sbitmap scc,int * node_order,int pos)2727 order_nodes_in_scc (ddg_ptr g, sbitmap nodes_ordered, sbitmap scc,
2728 		    int * node_order, int pos)
2729 {
2730   enum sms_direction dir;
2731   int num_nodes = g->num_nodes;
2732   auto_sbitmap workset (num_nodes);
2733   auto_sbitmap tmp (num_nodes);
2734   sbitmap zero_bitmap = sbitmap_alloc (num_nodes);
2735   auto_sbitmap predecessors (num_nodes);
2736   auto_sbitmap successors (num_nodes);
2737 
2738   bitmap_clear (predecessors);
2739   find_predecessors (predecessors, g, nodes_ordered);
2740 
2741   bitmap_clear (successors);
2742   find_successors (successors, g, nodes_ordered);
2743 
2744   bitmap_clear (tmp);
2745   if (bitmap_and (tmp, predecessors, scc))
2746     {
2747       bitmap_copy (workset, tmp);
2748       dir = BOTTOMUP;
2749     }
2750   else if (bitmap_and (tmp, successors, scc))
2751     {
2752       bitmap_copy (workset, tmp);
2753       dir = TOPDOWN;
2754     }
2755   else
2756     {
2757       int u;
2758 
2759       bitmap_clear (workset);
2760       if ((u = find_max_asap (g, scc)) >= 0)
2761 	bitmap_set_bit (workset, u);
2762       dir = BOTTOMUP;
2763     }
2764 
2765   bitmap_clear (zero_bitmap);
2766   while (!bitmap_equal_p (workset, zero_bitmap))
2767     {
2768       int v;
2769       ddg_node_ptr v_node;
2770       sbitmap v_node_preds;
2771       sbitmap v_node_succs;
2772 
2773       if (dir == TOPDOWN)
2774 	{
2775 	  while (!bitmap_equal_p (workset, zero_bitmap))
2776 	    {
2777 	      v = find_max_hv_min_mob (g, workset);
2778 	      v_node = &g->nodes[v];
2779 	      node_order[pos++] = v;
2780 	      v_node_succs = NODE_SUCCESSORS (v_node);
2781 	      bitmap_and (tmp, v_node_succs, scc);
2782 
2783 	      /* Don't consider the already ordered successors again.  */
2784 	      bitmap_and_compl (tmp, tmp, nodes_ordered);
2785 	      bitmap_ior (workset, workset, tmp);
2786 	      bitmap_clear_bit (workset, v);
2787 	      bitmap_set_bit (nodes_ordered, v);
2788 	    }
2789 	  dir = BOTTOMUP;
2790 	  bitmap_clear (predecessors);
2791 	  find_predecessors (predecessors, g, nodes_ordered);
2792 	  bitmap_and (workset, predecessors, scc);
2793 	}
2794       else
2795 	{
2796 	  while (!bitmap_equal_p (workset, zero_bitmap))
2797 	    {
2798 	      v = find_max_dv_min_mob (g, workset);
2799 	      v_node = &g->nodes[v];
2800 	      node_order[pos++] = v;
2801 	      v_node_preds = NODE_PREDECESSORS (v_node);
2802 	      bitmap_and (tmp, v_node_preds, scc);
2803 
2804 	      /* Don't consider the already ordered predecessors again.  */
2805 	      bitmap_and_compl (tmp, tmp, nodes_ordered);
2806 	      bitmap_ior (workset, workset, tmp);
2807 	      bitmap_clear_bit (workset, v);
2808 	      bitmap_set_bit (nodes_ordered, v);
2809 	    }
2810 	  dir = TOPDOWN;
2811 	  bitmap_clear (successors);
2812 	  find_successors (successors, g, nodes_ordered);
2813 	  bitmap_and (workset, successors, scc);
2814 	}
2815     }
2816   sbitmap_free (zero_bitmap);
2817   return pos;
2818 }
2819 
2820 
2821 /* This page contains functions for manipulating partial-schedules during
2822    modulo scheduling.  */
2823 
2824 /* Create a partial schedule and allocate a memory to hold II rows.  */
2825 
2826 static partial_schedule_ptr
create_partial_schedule(int ii,ddg_ptr g,int history)2827 create_partial_schedule (int ii, ddg_ptr g, int history)
2828 {
2829   partial_schedule_ptr ps = XNEW (struct partial_schedule);
2830   ps->rows = (ps_insn_ptr *) xcalloc (ii, sizeof (ps_insn_ptr));
2831   ps->rows_length = (int *) xcalloc (ii, sizeof (int));
2832   ps->reg_moves.create (0);
2833   ps->ii = ii;
2834   ps->history = history;
2835   ps->min_cycle = INT_MAX;
2836   ps->max_cycle = INT_MIN;
2837   ps->g = g;
2838 
2839   return ps;
2840 }
2841 
2842 /* Free the PS_INSNs in rows array of the given partial schedule.
2843    ??? Consider caching the PS_INSN's.  */
2844 static void
free_ps_insns(partial_schedule_ptr ps)2845 free_ps_insns (partial_schedule_ptr ps)
2846 {
2847   int i;
2848 
2849   for (i = 0; i < ps->ii; i++)
2850     {
2851       while (ps->rows[i])
2852 	{
2853 	  ps_insn_ptr ps_insn = ps->rows[i]->next_in_row;
2854 
2855 	  free (ps->rows[i]);
2856 	  ps->rows[i] = ps_insn;
2857 	}
2858       ps->rows[i] = NULL;
2859     }
2860 }
2861 
2862 /* Free all the memory allocated to the partial schedule.  */
2863 
2864 static void
free_partial_schedule(partial_schedule_ptr ps)2865 free_partial_schedule (partial_schedule_ptr ps)
2866 {
2867   ps_reg_move_info *move;
2868   unsigned int i;
2869 
2870   if (!ps)
2871     return;
2872 
2873   FOR_EACH_VEC_ELT (ps->reg_moves, i, move)
2874     sbitmap_free (move->uses);
2875   ps->reg_moves.release ();
2876 
2877   free_ps_insns (ps);
2878   free (ps->rows);
2879   free (ps->rows_length);
2880   free (ps);
2881 }
2882 
2883 /* Clear the rows array with its PS_INSNs, and create a new one with
2884    NEW_II rows.  */
2885 
2886 static void
reset_partial_schedule(partial_schedule_ptr ps,int new_ii)2887 reset_partial_schedule (partial_schedule_ptr ps, int new_ii)
2888 {
2889   if (!ps)
2890     return;
2891   free_ps_insns (ps);
2892   if (new_ii == ps->ii)
2893     return;
2894   ps->rows = (ps_insn_ptr *) xrealloc (ps->rows, new_ii
2895 						 * sizeof (ps_insn_ptr));
2896   memset (ps->rows, 0, new_ii * sizeof (ps_insn_ptr));
2897   ps->rows_length = (int *) xrealloc (ps->rows_length, new_ii * sizeof (int));
2898   memset (ps->rows_length, 0, new_ii * sizeof (int));
2899   ps->ii = new_ii;
2900   ps->min_cycle = INT_MAX;
2901   ps->max_cycle = INT_MIN;
2902 }
2903 
2904 /* Prints the partial schedule as an ii rows array, for each rows
2905    print the ids of the insns in it.  */
2906 void
print_partial_schedule(partial_schedule_ptr ps,FILE * dump)2907 print_partial_schedule (partial_schedule_ptr ps, FILE *dump)
2908 {
2909   int i;
2910 
2911   for (i = 0; i < ps->ii; i++)
2912     {
2913       ps_insn_ptr ps_i = ps->rows[i];
2914 
2915       fprintf (dump, "\n[ROW %d ]: ", i);
2916       while (ps_i)
2917 	{
2918 	  rtx_insn *insn = ps_rtl_insn (ps, ps_i->id);
2919 
2920 	  if (JUMP_P (insn))
2921 	    fprintf (dump, "%d (branch), ", INSN_UID (insn));
2922 	  else
2923 	    fprintf (dump, "%d, ", INSN_UID (insn));
2924 
2925 	  ps_i = ps_i->next_in_row;
2926 	}
2927     }
2928 }
2929 
2930 /* Creates an object of PS_INSN and initializes it to the given parameters.  */
2931 static ps_insn_ptr
create_ps_insn(int id,int cycle)2932 create_ps_insn (int id, int cycle)
2933 {
2934   ps_insn_ptr ps_i = XNEW (struct ps_insn);
2935 
2936   ps_i->id = id;
2937   ps_i->next_in_row = NULL;
2938   ps_i->prev_in_row = NULL;
2939   ps_i->cycle = cycle;
2940 
2941   return ps_i;
2942 }
2943 
2944 
2945 /* Removes the given PS_INSN from the partial schedule.  */
2946 static void
remove_node_from_ps(partial_schedule_ptr ps,ps_insn_ptr ps_i)2947 remove_node_from_ps (partial_schedule_ptr ps, ps_insn_ptr ps_i)
2948 {
2949   int row;
2950 
2951   gcc_assert (ps && ps_i);
2952 
2953   row = SMODULO (ps_i->cycle, ps->ii);
2954   if (! ps_i->prev_in_row)
2955     {
2956       gcc_assert (ps_i == ps->rows[row]);
2957       ps->rows[row] = ps_i->next_in_row;
2958       if (ps->rows[row])
2959 	ps->rows[row]->prev_in_row = NULL;
2960     }
2961   else
2962     {
2963       ps_i->prev_in_row->next_in_row = ps_i->next_in_row;
2964       if (ps_i->next_in_row)
2965 	ps_i->next_in_row->prev_in_row = ps_i->prev_in_row;
2966     }
2967 
2968   ps->rows_length[row] -= 1;
2969   free (ps_i);
2970   return;
2971 }
2972 
2973 /* Unlike what literature describes for modulo scheduling (which focuses
2974    on VLIW machines) the order of the instructions inside a cycle is
2975    important.  Given the bitmaps MUST_FOLLOW and MUST_PRECEDE we know
2976    where the current instruction should go relative to the already
2977    scheduled instructions in the given cycle.  Go over these
2978    instructions and find the first possible column to put it in.  */
2979 static bool
ps_insn_find_column(partial_schedule_ptr ps,ps_insn_ptr ps_i,sbitmap must_precede,sbitmap must_follow)2980 ps_insn_find_column (partial_schedule_ptr ps, ps_insn_ptr ps_i,
2981 		     sbitmap must_precede, sbitmap must_follow)
2982 {
2983   ps_insn_ptr next_ps_i;
2984   ps_insn_ptr first_must_follow = NULL;
2985   ps_insn_ptr last_must_precede = NULL;
2986   ps_insn_ptr last_in_row = NULL;
2987   int row;
2988 
2989   if (! ps_i)
2990     return false;
2991 
2992   row = SMODULO (ps_i->cycle, ps->ii);
2993 
2994   /* Find the first must follow and the last must precede
2995      and insert the node immediately after the must precede
2996      but make sure that it there is no must follow after it.  */
2997   for (next_ps_i = ps->rows[row];
2998        next_ps_i;
2999        next_ps_i = next_ps_i->next_in_row)
3000     {
3001       if (must_follow
3002 	  && bitmap_bit_p (must_follow, next_ps_i->id)
3003 	  && ! first_must_follow)
3004         first_must_follow = next_ps_i;
3005       if (must_precede && bitmap_bit_p (must_precede, next_ps_i->id))
3006         {
3007           /* If we have already met a node that must follow, then
3008 	     there is no possible column.  */
3009   	  if (first_must_follow)
3010             return false;
3011 	  else
3012             last_must_precede = next_ps_i;
3013         }
3014       /* The closing branch must be the last in the row.  */
3015       if (JUMP_P (ps_rtl_insn (ps, next_ps_i->id)))
3016 	return false;
3017 
3018        last_in_row = next_ps_i;
3019     }
3020 
3021   /* The closing branch is scheduled as well.  Make sure there is no
3022      dependent instruction after it as the branch should be the last
3023      instruction in the row.  */
3024   if (JUMP_P (ps_rtl_insn (ps, ps_i->id)))
3025     {
3026       if (first_must_follow)
3027 	return false;
3028       if (last_in_row)
3029 	{
3030 	  /* Make the branch the last in the row.  New instructions
3031 	     will be inserted at the beginning of the row or after the
3032 	     last must_precede instruction thus the branch is guaranteed
3033 	     to remain the last instruction in the row.  */
3034 	  last_in_row->next_in_row = ps_i;
3035 	  ps_i->prev_in_row = last_in_row;
3036 	  ps_i->next_in_row = NULL;
3037 	}
3038       else
3039 	ps->rows[row] = ps_i;
3040       return true;
3041     }
3042 
3043   /* Now insert the node after INSERT_AFTER_PSI.  */
3044 
3045   if (! last_must_precede)
3046     {
3047       ps_i->next_in_row = ps->rows[row];
3048       ps_i->prev_in_row = NULL;
3049       if (ps_i->next_in_row)
3050     	ps_i->next_in_row->prev_in_row = ps_i;
3051       ps->rows[row] = ps_i;
3052     }
3053   else
3054     {
3055       ps_i->next_in_row = last_must_precede->next_in_row;
3056       last_must_precede->next_in_row = ps_i;
3057       ps_i->prev_in_row = last_must_precede;
3058       if (ps_i->next_in_row)
3059         ps_i->next_in_row->prev_in_row = ps_i;
3060     }
3061 
3062   return true;
3063 }
3064 
3065 /* Advances the PS_INSN one column in its current row; returns false
3066    in failure and true in success.  Bit N is set in MUST_FOLLOW if
3067    the node with cuid N must be come after the node pointed to by
3068    PS_I when scheduled in the same cycle.  */
3069 static int
ps_insn_advance_column(partial_schedule_ptr ps,ps_insn_ptr ps_i,sbitmap must_follow)3070 ps_insn_advance_column (partial_schedule_ptr ps, ps_insn_ptr ps_i,
3071 			sbitmap must_follow)
3072 {
3073   ps_insn_ptr prev, next;
3074   int row;
3075 
3076   if (!ps || !ps_i)
3077     return false;
3078 
3079   row = SMODULO (ps_i->cycle, ps->ii);
3080 
3081   if (! ps_i->next_in_row)
3082     return false;
3083 
3084   /* Check if next_in_row is dependent on ps_i, both having same sched
3085      times (typically ANTI_DEP).  If so, ps_i cannot skip over it.  */
3086   if (must_follow && bitmap_bit_p (must_follow, ps_i->next_in_row->id))
3087     return false;
3088 
3089   /* Advance PS_I over its next_in_row in the doubly linked list.  */
3090   prev = ps_i->prev_in_row;
3091   next = ps_i->next_in_row;
3092 
3093   if (ps_i == ps->rows[row])
3094     ps->rows[row] = next;
3095 
3096   ps_i->next_in_row = next->next_in_row;
3097 
3098   if (next->next_in_row)
3099     next->next_in_row->prev_in_row = ps_i;
3100 
3101   next->next_in_row = ps_i;
3102   ps_i->prev_in_row = next;
3103 
3104   next->prev_in_row = prev;
3105   if (prev)
3106     prev->next_in_row = next;
3107 
3108   return true;
3109 }
3110 
3111 /* Inserts a DDG_NODE to the given partial schedule at the given cycle.
3112    Returns 0 if this is not possible and a PS_INSN otherwise.  Bit N is
3113    set in MUST_PRECEDE/MUST_FOLLOW if the node with cuid N must be come
3114    before/after (respectively) the node pointed to by PS_I when scheduled
3115    in the same cycle.  */
3116 static ps_insn_ptr
add_node_to_ps(partial_schedule_ptr ps,int id,int cycle,sbitmap must_precede,sbitmap must_follow)3117 add_node_to_ps (partial_schedule_ptr ps, int id, int cycle,
3118 		sbitmap must_precede, sbitmap must_follow)
3119 {
3120   ps_insn_ptr ps_i;
3121   int row = SMODULO (cycle, ps->ii);
3122 
3123   if (ps->rows_length[row] >= issue_rate)
3124     return NULL;
3125 
3126   ps_i = create_ps_insn (id, cycle);
3127 
3128   /* Finds and inserts PS_I according to MUST_FOLLOW and
3129      MUST_PRECEDE.  */
3130   if (! ps_insn_find_column (ps, ps_i, must_precede, must_follow))
3131     {
3132       free (ps_i);
3133       return NULL;
3134     }
3135 
3136   ps->rows_length[row] += 1;
3137   return ps_i;
3138 }
3139 
3140 /* Advance time one cycle.  Assumes DFA is being used.  */
3141 static void
advance_one_cycle(void)3142 advance_one_cycle (void)
3143 {
3144   if (targetm.sched.dfa_pre_cycle_insn)
3145     state_transition (curr_state,
3146 		      targetm.sched.dfa_pre_cycle_insn ());
3147 
3148   state_transition (curr_state, NULL);
3149 
3150   if (targetm.sched.dfa_post_cycle_insn)
3151     state_transition (curr_state,
3152 		      targetm.sched.dfa_post_cycle_insn ());
3153 }
3154 
3155 
3156 
3157 /* Checks if PS has resource conflicts according to DFA, starting from
3158    FROM cycle to TO cycle; returns true if there are conflicts and false
3159    if there are no conflicts.  Assumes DFA is being used.  */
3160 static int
ps_has_conflicts(partial_schedule_ptr ps,int from,int to)3161 ps_has_conflicts (partial_schedule_ptr ps, int from, int to)
3162 {
3163   int cycle;
3164 
3165   state_reset (curr_state);
3166 
3167   for (cycle = from; cycle <= to; cycle++)
3168     {
3169       ps_insn_ptr crr_insn;
3170       /* Holds the remaining issue slots in the current row.  */
3171       int can_issue_more = issue_rate;
3172 
3173       /* Walk through the DFA for the current row.  */
3174       for (crr_insn = ps->rows[SMODULO (cycle, ps->ii)];
3175 	   crr_insn;
3176 	   crr_insn = crr_insn->next_in_row)
3177 	{
3178 	  rtx_insn *insn = ps_rtl_insn (ps, crr_insn->id);
3179 
3180 	  /* Check if there is room for the current insn.  */
3181 	  if (!can_issue_more || state_dead_lock_p (curr_state))
3182 	    return true;
3183 
3184 	  /* Update the DFA state and return with failure if the DFA found
3185 	     resource conflicts.  */
3186 	  if (state_transition (curr_state, insn) >= 0)
3187 	    return true;
3188 
3189 	  if (targetm.sched.variable_issue)
3190 	    can_issue_more =
3191 	      targetm.sched.variable_issue (sched_dump, sched_verbose,
3192 					    insn, can_issue_more);
3193 	  /* A naked CLOBBER or USE generates no instruction, so don't
3194 	     let them consume issue slots.  */
3195 	  else if (GET_CODE (PATTERN (insn)) != USE
3196 		   && GET_CODE (PATTERN (insn)) != CLOBBER)
3197 	    can_issue_more--;
3198 	}
3199 
3200       /* Advance the DFA to the next cycle.  */
3201       advance_one_cycle ();
3202     }
3203   return false;
3204 }
3205 
3206 /* Checks if the given node causes resource conflicts when added to PS at
3207    cycle C.  If not the node is added to PS and returned; otherwise zero
3208    is returned.  Bit N is set in MUST_PRECEDE/MUST_FOLLOW if the node with
3209    cuid N must be come before/after (respectively) the node pointed to by
3210    PS_I when scheduled in the same cycle.  */
3211 ps_insn_ptr
ps_add_node_check_conflicts(partial_schedule_ptr ps,int n,int c,sbitmap must_precede,sbitmap must_follow)3212 ps_add_node_check_conflicts (partial_schedule_ptr ps, int n,
3213    			     int c, sbitmap must_precede,
3214 			     sbitmap must_follow)
3215 {
3216   int i, first, amount, has_conflicts = 0;
3217   ps_insn_ptr ps_i;
3218 
3219   /* First add the node to the PS, if this succeeds check for
3220      conflicts, trying different issue slots in the same row.  */
3221   if (! (ps_i = add_node_to_ps (ps, n, c, must_precede, must_follow)))
3222     return NULL; /* Failed to insert the node at the given cycle.  */
3223 
3224   while (1)
3225     {
3226       has_conflicts = ps_has_conflicts (ps, c, c);
3227       if (ps->history > 0 && !has_conflicts)
3228 	{
3229 	  /* Check all 2h+1 intervals, starting from c-2h..c up to c..2h,
3230 	     but not more than ii intervals.  */
3231 	  first = c - ps->history;
3232 	  amount = 2 * ps->history + 1;
3233 	  if (amount > ps->ii)
3234 	    amount = ps->ii;
3235 	  for (i = first; i < first + amount; i++)
3236 	    {
3237 	      has_conflicts = ps_has_conflicts (ps,
3238 						i - ps->history,
3239 						i + ps->history);
3240 	      if (has_conflicts)
3241 		break;
3242 	    }
3243 	}
3244       if (!has_conflicts)
3245 	break;
3246       /* Try different issue slots to find one that the given node can be
3247 	 scheduled in without conflicts.  */
3248       if (! ps_insn_advance_column (ps, ps_i, must_follow))
3249 	break;
3250     }
3251 
3252   if (has_conflicts)
3253     {
3254       remove_node_from_ps (ps, ps_i);
3255       return NULL;
3256     }
3257 
3258   ps->min_cycle = MIN (ps->min_cycle, c);
3259   ps->max_cycle = MAX (ps->max_cycle, c);
3260   return ps_i;
3261 }
3262 
3263 /* Calculate the stage count of the partial schedule PS.  The calculation
3264    takes into account the rotation amount passed in ROTATION_AMOUNT.  */
3265 int
calculate_stage_count(partial_schedule_ptr ps,int rotation_amount)3266 calculate_stage_count (partial_schedule_ptr ps, int rotation_amount)
3267 {
3268   int new_min_cycle = PS_MIN_CYCLE (ps) - rotation_amount;
3269   int new_max_cycle = PS_MAX_CYCLE (ps) - rotation_amount;
3270   int stage_count = CALC_STAGE_COUNT (-1, new_min_cycle, ps->ii);
3271 
3272   /* The calculation of stage count is done adding the number of stages
3273      before cycle zero and after cycle zero.  */
3274   stage_count += CALC_STAGE_COUNT (new_max_cycle, 0, ps->ii);
3275 
3276   return stage_count;
3277 }
3278 
3279 /* Rotate the rows of PS such that insns scheduled at time
3280    START_CYCLE will appear in row 0.  Updates max/min_cycles.  */
3281 void
rotate_partial_schedule(partial_schedule_ptr ps,int start_cycle)3282 rotate_partial_schedule (partial_schedule_ptr ps, int start_cycle)
3283 {
3284   int i, row, backward_rotates;
3285   int last_row = ps->ii - 1;
3286 
3287   if (start_cycle == 0)
3288     return;
3289 
3290   backward_rotates = SMODULO (start_cycle, ps->ii);
3291 
3292   /* Revisit later and optimize this into a single loop.  */
3293   for (i = 0; i < backward_rotates; i++)
3294     {
3295       ps_insn_ptr first_row = ps->rows[0];
3296       int first_row_length = ps->rows_length[0];
3297 
3298       for (row = 0; row < last_row; row++)
3299 	{
3300 	  ps->rows[row] = ps->rows[row + 1];
3301 	  ps->rows_length[row] = ps->rows_length[row + 1];
3302 	}
3303 
3304       ps->rows[last_row] = first_row;
3305       ps->rows_length[last_row] = first_row_length;
3306     }
3307 
3308   ps->max_cycle -= start_cycle;
3309   ps->min_cycle -= start_cycle;
3310 }
3311 
3312 #endif /* INSN_SCHEDULING */
3313 
3314 /* Run instruction scheduler.  */
3315 /* Perform SMS module scheduling.  */
3316 
3317 namespace {
3318 
3319 const pass_data pass_data_sms =
3320 {
3321   RTL_PASS, /* type */
3322   "sms", /* name */
3323   OPTGROUP_NONE, /* optinfo_flags */
3324   TV_SMS, /* tv_id */
3325   0, /* properties_required */
3326   0, /* properties_provided */
3327   0, /* properties_destroyed */
3328   0, /* todo_flags_start */
3329   TODO_df_finish, /* todo_flags_finish */
3330 };
3331 
3332 class pass_sms : public rtl_opt_pass
3333 {
3334 public:
pass_sms(gcc::context * ctxt)3335   pass_sms (gcc::context *ctxt)
3336     : rtl_opt_pass (pass_data_sms, ctxt)
3337   {}
3338 
3339   /* opt_pass methods: */
gate(function *)3340   virtual bool gate (function *)
3341 {
3342   return (optimize > 0 && flag_modulo_sched);
3343 }
3344 
3345   virtual unsigned int execute (function *);
3346 
3347 }; // class pass_sms
3348 
3349 unsigned int
execute(function * fun ATTRIBUTE_UNUSED)3350 pass_sms::execute (function *fun ATTRIBUTE_UNUSED)
3351 {
3352 #ifdef INSN_SCHEDULING
3353   basic_block bb;
3354 
3355   /* Collect loop information to be used in SMS.  */
3356   cfg_layout_initialize (0);
3357   sms_schedule ();
3358 
3359   /* Update the life information, because we add pseudos.  */
3360   max_regno = max_reg_num ();
3361 
3362   /* Finalize layout changes.  */
3363   FOR_EACH_BB_FN (bb, fun)
3364     if (bb->next_bb != EXIT_BLOCK_PTR_FOR_FN (fun))
3365       bb->aux = bb->next_bb;
3366   free_dominance_info (CDI_DOMINATORS);
3367   cfg_layout_finalize ();
3368 #endif /* INSN_SCHEDULING */
3369   return 0;
3370 }
3371 
3372 } // anon namespace
3373 
3374 rtl_opt_pass *
make_pass_sms(gcc::context * ctxt)3375 make_pass_sms (gcc::context *ctxt)
3376 {
3377   return new pass_sms (ctxt);
3378 }
3379