1*38fd1498Szrj /* Swing Modulo Scheduling implementation.
2*38fd1498Szrj Copyright (C) 2004-2018 Free Software Foundation, Inc.
3*38fd1498Szrj Contributed by Ayal Zaks and Mustafa Hagog <zaks,mustafa@il.ibm.com>
4*38fd1498Szrj
5*38fd1498Szrj This file is part of GCC.
6*38fd1498Szrj
7*38fd1498Szrj GCC is free software; you can redistribute it and/or modify it under
8*38fd1498Szrj the terms of the GNU General Public License as published by the Free
9*38fd1498Szrj Software Foundation; either version 3, or (at your option) any later
10*38fd1498Szrj version.
11*38fd1498Szrj
12*38fd1498Szrj GCC is distributed in the hope that it will be useful, but WITHOUT ANY
13*38fd1498Szrj WARRANTY; without even the implied warranty of MERCHANTABILITY or
14*38fd1498Szrj FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
15*38fd1498Szrj for more details.
16*38fd1498Szrj
17*38fd1498Szrj You should have received a copy of the GNU General Public License
18*38fd1498Szrj along with GCC; see the file COPYING3. If not see
19*38fd1498Szrj <http://www.gnu.org/licenses/>. */
20*38fd1498Szrj
21*38fd1498Szrj
22*38fd1498Szrj #include "config.h"
23*38fd1498Szrj #include "system.h"
24*38fd1498Szrj #include "coretypes.h"
25*38fd1498Szrj #include "backend.h"
26*38fd1498Szrj #include "target.h"
27*38fd1498Szrj #include "rtl.h"
28*38fd1498Szrj #include "tree.h"
29*38fd1498Szrj #include "cfghooks.h"
30*38fd1498Szrj #include "df.h"
31*38fd1498Szrj #include "memmodel.h"
32*38fd1498Szrj #include "optabs.h"
33*38fd1498Szrj #include "regs.h"
34*38fd1498Szrj #include "emit-rtl.h"
35*38fd1498Szrj #include "gcov-io.h"
36*38fd1498Szrj #include "profile.h"
37*38fd1498Szrj #include "insn-attr.h"
38*38fd1498Szrj #include "cfgrtl.h"
39*38fd1498Szrj #include "sched-int.h"
40*38fd1498Szrj #include "cfgloop.h"
41*38fd1498Szrj #include "expr.h"
42*38fd1498Szrj #include "params.h"
43*38fd1498Szrj #include "ddg.h"
44*38fd1498Szrj #include "tree-pass.h"
45*38fd1498Szrj #include "dbgcnt.h"
46*38fd1498Szrj #include "loop-unroll.h"
47*38fd1498Szrj
48*38fd1498Szrj #ifdef INSN_SCHEDULING
49*38fd1498Szrj
50*38fd1498Szrj /* This file contains the implementation of the Swing Modulo Scheduler,
51*38fd1498Szrj described in the following references:
52*38fd1498Szrj [1] J. Llosa, A. Gonzalez, E. Ayguade, M. Valero., and J. Eckhardt.
53*38fd1498Szrj Lifetime--sensitive modulo scheduling in a production environment.
54*38fd1498Szrj IEEE Trans. on Comps., 50(3), March 2001
55*38fd1498Szrj [2] J. Llosa, A. Gonzalez, E. Ayguade, and M. Valero.
56*38fd1498Szrj Swing Modulo Scheduling: A Lifetime Sensitive Approach.
57*38fd1498Szrj PACT '96 , pages 80-87, October 1996 (Boston - Massachusetts - USA).
58*38fd1498Szrj
59*38fd1498Szrj The basic structure is:
60*38fd1498Szrj 1. Build a data-dependence graph (DDG) for each loop.
61*38fd1498Szrj 2. Use the DDG to order the insns of a loop (not in topological order
62*38fd1498Szrj necessarily, but rather) trying to place each insn after all its
63*38fd1498Szrj predecessors _or_ after all its successors.
64*38fd1498Szrj 3. Compute MII: a lower bound on the number of cycles to schedule the loop.
65*38fd1498Szrj 4. Use the ordering to perform list-scheduling of the loop:
66*38fd1498Szrj 1. Set II = MII. We will try to schedule the loop within II cycles.
67*38fd1498Szrj 2. Try to schedule the insns one by one according to the ordering.
68*38fd1498Szrj For each insn compute an interval of cycles by considering already-
69*38fd1498Szrj scheduled preds and succs (and associated latencies); try to place
70*38fd1498Szrj the insn in the cycles of this window checking for potential
71*38fd1498Szrj resource conflicts (using the DFA interface).
72*38fd1498Szrj Note: this is different from the cycle-scheduling of schedule_insns;
73*38fd1498Szrj here the insns are not scheduled monotonically top-down (nor bottom-
74*38fd1498Szrj up).
75*38fd1498Szrj 3. If failed in scheduling all insns - bump II++ and try again, unless
76*38fd1498Szrj II reaches an upper bound MaxII, in which case report failure.
77*38fd1498Szrj 5. If we succeeded in scheduling the loop within II cycles, we now
78*38fd1498Szrj generate prolog and epilog, decrease the counter of the loop, and
79*38fd1498Szrj perform modulo variable expansion for live ranges that span more than
80*38fd1498Szrj II cycles (i.e. use register copies to prevent a def from overwriting
81*38fd1498Szrj itself before reaching the use).
82*38fd1498Szrj
83*38fd1498Szrj SMS works with countable loops (1) whose control part can be easily
84*38fd1498Szrj decoupled from the rest of the loop and (2) whose loop count can
85*38fd1498Szrj be easily adjusted. This is because we peel a constant number of
86*38fd1498Szrj iterations into a prologue and epilogue for which we want to avoid
87*38fd1498Szrj emitting the control part, and a kernel which is to iterate that
88*38fd1498Szrj constant number of iterations less than the original loop. So the
89*38fd1498Szrj control part should be a set of insns clearly identified and having
90*38fd1498Szrj its own iv, not otherwise used in the loop (at-least for now), which
91*38fd1498Szrj initializes a register before the loop to the number of iterations.
92*38fd1498Szrj Currently SMS relies on the do-loop pattern to recognize such loops,
93*38fd1498Szrj where (1) the control part comprises of all insns defining and/or
94*38fd1498Szrj using a certain 'count' register and (2) the loop count can be
95*38fd1498Szrj adjusted by modifying this register prior to the loop.
96*38fd1498Szrj TODO: Rely on cfgloop analysis instead. */
97*38fd1498Szrj
/* This page defines partial-schedule structures and functions for
   modulo scheduling.  */

typedef struct partial_schedule *partial_schedule_ptr;
typedef struct ps_insn *ps_insn_ptr;

/* The minimum (absolute) cycle that a node of ps was scheduled in.  */
#define PS_MIN_CYCLE(ps) (((partial_schedule_ptr)(ps))->min_cycle)

/* The maximum (absolute) cycle that a node of ps was scheduled in.  */
#define PS_MAX_CYCLE(ps) (((partial_schedule_ptr)(ps))->max_cycle)

/* Perform signed modulo, always returning a non-negative value.
   NOTE: X and Y are each evaluated more than once; do not pass
   arguments with side effects.  */
#define SMODULO(x,y) ((x) % (y) < 0 ? ((x) % (y) + (y)) : (x) % (y))

/* The number of different iterations the nodes in ps span, assuming
   the stage boundaries are placed efficiently.  This is the cycle
   span MAX_CYCLE - MIN_CYCLE + 1 divided by II, rounded up.  */
#define CALC_STAGE_COUNT(max_cycle,min_cycle,ii) ((max_cycle - min_cycle \
						   + 1 + ii - 1) / ii)
/* The stage count of ps.  */
#define PS_STAGE_COUNT(ps) (((partial_schedule_ptr)(ps))->stage_count)
119*38fd1498Szrj
/* A single instruction in the partial schedule.  */
struct ps_insn
{
  /* Identifies the instruction to be scheduled.  Values smaller than
     the ddg's num_nodes refer directly to ddg nodes.  A value of
     X - num_nodes refers to register move X.  */
  int id;

  /* The (absolute) cycle in which the PS instruction is scheduled.
     Same as SCHED_TIME (node).  */
  int cycle;

  /* The next/prev PS_INSN in the same row.  Rows are doubly-linked
     lists headed by partial_schedule::rows[].  */
  ps_insn_ptr next_in_row,
	      prev_in_row;

};
137*38fd1498Szrj
/* Information about a register move that has been added to a partial
   schedule.  */
struct ps_reg_move_info
{
  /* The source of the move is defined by the ps_insn with id DEF.
     The destination is used by the ps_insns with the ids in USES.  */
  int def;
  sbitmap uses;

  /* The original form of USES' instructions used OLD_REG, but they
     should now use NEW_REG.  */
  rtx old_reg;
  rtx new_reg;

  /* The number of consecutive stages that the move occupies.  */
  int num_consecutive_stages;

  /* An instruction that sets NEW_REG to the correct value.  The first
     move associated with DEF will have an rhs of OLD_REG; later moves
     use the result of the previous move.  */
  rtx_insn *insn;
};
160*38fd1498Szrj
/* Holds the partial schedule as an array of II rows.  Each entry of the
   array points to a linked list of PS_INSNs, which represents the
   instructions that are scheduled for that row.  */
struct partial_schedule
{
  int ii;	/* Number of rows in the partial schedule.  */
  int history;  /* Threshold for conflict checking using DFA.  */

  /* rows[i] points to linked list of insns scheduled in row i (0<=i<ii).  */
  ps_insn_ptr *rows;

  /* All the moves added for this partial schedule.  Index X has
     a ps_insn id of X + g->num_nodes.  */
  vec<ps_reg_move_info> reg_moves;

  /* rows_length[i] holds the number of instructions in the row.
     It is used only (as an optimization) to back off quickly from
     trying to schedule a node in a full row; that is, to avoid running
     through futile DFA state transitions.  */
  int *rows_length;

  /* The earliest absolute cycle of an insn in the partial schedule.  */
  int min_cycle;

  /* The latest absolute cycle of an insn in the partial schedule.  */
  int max_cycle;

  ddg_ptr g;	/* The DDG of the insns in the partial schedule.  */

  int stage_count;  /* The stage count of the partial schedule.  */
};
192*38fd1498Szrj
193*38fd1498Szrj
194*38fd1498Szrj static partial_schedule_ptr create_partial_schedule (int ii, ddg_ptr, int history);
195*38fd1498Szrj static void free_partial_schedule (partial_schedule_ptr);
196*38fd1498Szrj static void reset_partial_schedule (partial_schedule_ptr, int new_ii);
197*38fd1498Szrj void print_partial_schedule (partial_schedule_ptr, FILE *);
198*38fd1498Szrj static void verify_partial_schedule (partial_schedule_ptr, sbitmap);
199*38fd1498Szrj static ps_insn_ptr ps_add_node_check_conflicts (partial_schedule_ptr,
200*38fd1498Szrj int, int, sbitmap, sbitmap);
201*38fd1498Szrj static void rotate_partial_schedule (partial_schedule_ptr, int);
202*38fd1498Szrj void set_row_column_for_ps (partial_schedule_ptr);
203*38fd1498Szrj static void ps_insert_empty_row (partial_schedule_ptr, int, sbitmap);
204*38fd1498Szrj static int compute_split_row (sbitmap, int, int, int, ddg_node_ptr);
205*38fd1498Szrj
206*38fd1498Szrj
207*38fd1498Szrj /* This page defines constants and structures for the modulo scheduling
208*38fd1498Szrj driver. */
209*38fd1498Szrj
210*38fd1498Szrj static int sms_order_nodes (ddg_ptr, int, int *, int *);
211*38fd1498Szrj static void set_node_sched_params (ddg_ptr);
212*38fd1498Szrj static partial_schedule_ptr sms_schedule_by_order (ddg_ptr, int, int, int *);
213*38fd1498Szrj static void permute_partial_schedule (partial_schedule_ptr, rtx_insn *);
214*38fd1498Szrj static void generate_prolog_epilog (partial_schedule_ptr, struct loop *,
215*38fd1498Szrj rtx, rtx);
216*38fd1498Szrj static int calculate_stage_count (partial_schedule_ptr, int);
217*38fd1498Szrj static void calculate_must_precede_follow (ddg_node_ptr, int, int,
218*38fd1498Szrj int, int, sbitmap, sbitmap, sbitmap);
219*38fd1498Szrj static int get_sched_window (partial_schedule_ptr, ddg_node_ptr,
220*38fd1498Szrj sbitmap, int, int *, int *, int *);
221*38fd1498Szrj static bool try_scheduling_node_in_cycle (partial_schedule_ptr, int, int,
222*38fd1498Szrj sbitmap, int *, sbitmap, sbitmap);
223*38fd1498Szrj static void remove_node_from_ps (partial_schedule_ptr, ps_insn_ptr);
224*38fd1498Szrj
/* The ASAP value of NODE, kept in the node's aux.count field.  */
#define NODE_ASAP(node) ((node)->aux.count)

/* Accessors for the scheduling parameters of ps_insn id X, stored in
   node_sched_param_vec.  */
#define SCHED_PARAMS(x) (&node_sched_param_vec[x])
#define SCHED_TIME(x) (SCHED_PARAMS (x)->time)
#define SCHED_ROW(x) (SCHED_PARAMS (x)->row)
#define SCHED_STAGE(x) (SCHED_PARAMS (x)->stage)
#define SCHED_COLUMN(x) (SCHED_PARAMS (x)->column)

/* The scheduling parameters held for each node.  */
typedef struct node_sched_params
{
  int time;	/* The absolute scheduling cycle.  */

  int row;	/* Holds time % ii.  */
  int stage;	/* Holds time / ii.  */

  /* The column of a node inside the ps.  If nodes u, v are on the same row,
     u will precede v if column (u) < column (v).  */
  int column;
} *node_sched_params_ptr;
245*38fd1498Szrj
246*38fd1498Szrj /* The following three functions are copied from the current scheduler
247*38fd1498Szrj code in order to use sched_analyze() for computing the dependencies.
248*38fd1498Szrj They are used when initializing the sched_info structure. */
249*38fd1498Szrj static const char *
sms_print_insn(const rtx_insn * insn,int aligned ATTRIBUTE_UNUSED)250*38fd1498Szrj sms_print_insn (const rtx_insn *insn, int aligned ATTRIBUTE_UNUSED)
251*38fd1498Szrj {
252*38fd1498Szrj static char tmp[80];
253*38fd1498Szrj
254*38fd1498Szrj sprintf (tmp, "i%4d", INSN_UID (insn));
255*38fd1498Szrj return tmp;
256*38fd1498Szrj }
257*38fd1498Szrj
/* sched_deps_info callback.  SMS needs no extra register dependencies
   for jumps, so this is intentionally a no-op.  */
static void
compute_jump_reg_dependencies (rtx insn ATTRIBUTE_UNUSED,
			       regset used ATTRIBUTE_UNUSED)
{
}
263*38fd1498Szrj
/* Common scheduler hooks; fields are filled in when SMS sets up the
   scheduler infrastructure.  */
static struct common_sched_info_def sms_common_sched_info;

/* Dependence-analysis hooks.  Only the jump-dependencies callback is
   provided; the remaining (positional) hook slots are unused.  */
static struct sched_deps_info_def sms_sched_deps_info =
{
  compute_jump_reg_dependencies,
  NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
  NULL,
  0, 0, 0
};

/* Haifa scheduler hooks.  SMS only needs the insn printing callback;
   all other (positional) slots are left null/zero.  */
static struct haifa_sched_info sms_sched_info =
{
  NULL,
  NULL,
  NULL,
  NULL,
  NULL,
  sms_print_insn,
  NULL,
  NULL, /* insn_finishes_block_p */
  NULL, NULL,
  NULL, NULL,
  0, 0,

  NULL, NULL, NULL, NULL,
  NULL, NULL,
  0
};
292*38fd1498Szrj
293*38fd1498Szrj /* Partial schedule instruction ID in PS is a register move. Return
294*38fd1498Szrj information about it. */
295*38fd1498Szrj static struct ps_reg_move_info *
ps_reg_move(partial_schedule_ptr ps,int id)296*38fd1498Szrj ps_reg_move (partial_schedule_ptr ps, int id)
297*38fd1498Szrj {
298*38fd1498Szrj gcc_checking_assert (id >= ps->g->num_nodes);
299*38fd1498Szrj return &ps->reg_moves[id - ps->g->num_nodes];
300*38fd1498Szrj }
301*38fd1498Szrj
302*38fd1498Szrj /* Return the rtl instruction that is being scheduled by partial schedule
303*38fd1498Szrj instruction ID, which belongs to schedule PS. */
304*38fd1498Szrj static rtx_insn *
ps_rtl_insn(partial_schedule_ptr ps,int id)305*38fd1498Szrj ps_rtl_insn (partial_schedule_ptr ps, int id)
306*38fd1498Szrj {
307*38fd1498Szrj if (id < ps->g->num_nodes)
308*38fd1498Szrj return ps->g->nodes[id].insn;
309*38fd1498Szrj else
310*38fd1498Szrj return ps_reg_move (ps, id)->insn;
311*38fd1498Szrj }
312*38fd1498Szrj
313*38fd1498Szrj /* Partial schedule instruction ID, which belongs to PS, occurred in
314*38fd1498Szrj the original (unscheduled) loop. Return the first instruction
315*38fd1498Szrj in the loop that was associated with ps_rtl_insn (PS, ID).
316*38fd1498Szrj If the instruction had some notes before it, this is the first
317*38fd1498Szrj of those notes. */
318*38fd1498Szrj static rtx_insn *
ps_first_note(partial_schedule_ptr ps,int id)319*38fd1498Szrj ps_first_note (partial_schedule_ptr ps, int id)
320*38fd1498Szrj {
321*38fd1498Szrj gcc_assert (id < ps->g->num_nodes);
322*38fd1498Szrj return ps->g->nodes[id].first_note;
323*38fd1498Szrj }
324*38fd1498Szrj
325*38fd1498Szrj /* Return the number of consecutive stages that are occupied by
326*38fd1498Szrj partial schedule instruction ID in PS. */
327*38fd1498Szrj static int
ps_num_consecutive_stages(partial_schedule_ptr ps,int id)328*38fd1498Szrj ps_num_consecutive_stages (partial_schedule_ptr ps, int id)
329*38fd1498Szrj {
330*38fd1498Szrj if (id < ps->g->num_nodes)
331*38fd1498Szrj return 1;
332*38fd1498Szrj else
333*38fd1498Szrj return ps_reg_move (ps, id)->num_consecutive_stages;
334*38fd1498Szrj }
335*38fd1498Szrj
/* Given HEAD and TAIL which are the first and last insns in a loop;
   return the register which controls the loop.  Return zero if it has
   more than one occurrence in the loop besides the control part or the
   do-loop pattern is not of the form we expect.  */
static rtx
doloop_register_get (rtx_insn *head, rtx_insn *tail)
{
  rtx reg, condition;
  rtx_insn *insn, *first_insn_not_to_check;

  if (!JUMP_P (tail))
    return NULL_RTX;

  /* Without a doloop_end pattern the target has no recognizable
     do-loop form for us to work with.  */
  if (!targetm.code_for_doloop_end)
    return NULL_RTX;

  /* TODO: Free SMS's dependence on doloop_condition_get.  */
  condition = doloop_condition_get (tail);
  if (! condition)
    return NULL_RTX;

  /* Extract the count register: it is either the first operand of the
     condition, or the first operand of a PLUS inside it.  Any other
     shape from doloop_condition_get is unexpected.  */
  if (REG_P (XEXP (condition, 0)))
    reg = XEXP (condition, 0);
  else if (GET_CODE (XEXP (condition, 0)) == PLUS
	   && REG_P (XEXP (XEXP (condition, 0), 0)))
    reg = XEXP (XEXP (condition, 0), 0);
  else
    gcc_unreachable ();

  /* Check that the COUNT_REG has no other occurrences in the loop
     until the decrement.  We assume the control part consists of
     either a single (parallel) branch-on-count or a (non-parallel)
     branch immediately preceded by a single (decrement) insn.  */
  first_insn_not_to_check = (GET_CODE (PATTERN (tail)) == PARALLEL ? tail
			     : prev_nondebug_insn (tail));

  for (insn = head; insn != first_insn_not_to_check; insn = NEXT_INSN (insn))
    if (!DEBUG_INSN_P (insn) && reg_mentioned_p (reg, insn))
      {
	if (dump_file)
	  {
	    fprintf (dump_file, "SMS count_reg found ");
	    print_rtl_single (dump_file, reg);
	    fprintf (dump_file, " outside control in insn:\n");
	    print_rtl_single (dump_file, insn);
	  }

	return NULL_RTX;
      }

  return reg;
}
388*38fd1498Szrj
389*38fd1498Szrj /* Check if COUNT_REG is set to a constant in the PRE_HEADER block, so
390*38fd1498Szrj that the number of iterations is a compile-time constant. If so,
391*38fd1498Szrj return the rtx_insn that sets COUNT_REG to a constant, and set COUNT to
392*38fd1498Szrj this constant. Otherwise return 0. */
393*38fd1498Szrj static rtx_insn *
const_iteration_count(rtx count_reg,basic_block pre_header,int64_t * count)394*38fd1498Szrj const_iteration_count (rtx count_reg, basic_block pre_header,
395*38fd1498Szrj int64_t * count)
396*38fd1498Szrj {
397*38fd1498Szrj rtx_insn *insn;
398*38fd1498Szrj rtx_insn *head, *tail;
399*38fd1498Szrj
400*38fd1498Szrj if (! pre_header)
401*38fd1498Szrj return NULL;
402*38fd1498Szrj
403*38fd1498Szrj get_ebb_head_tail (pre_header, pre_header, &head, &tail);
404*38fd1498Szrj
405*38fd1498Szrj for (insn = tail; insn != PREV_INSN (head); insn = PREV_INSN (insn))
406*38fd1498Szrj if (NONDEBUG_INSN_P (insn) && single_set (insn) &&
407*38fd1498Szrj rtx_equal_p (count_reg, SET_DEST (single_set (insn))))
408*38fd1498Szrj {
409*38fd1498Szrj rtx pat = single_set (insn);
410*38fd1498Szrj
411*38fd1498Szrj if (CONST_INT_P (SET_SRC (pat)))
412*38fd1498Szrj {
413*38fd1498Szrj *count = INTVAL (SET_SRC (pat));
414*38fd1498Szrj return insn;
415*38fd1498Szrj }
416*38fd1498Szrj
417*38fd1498Szrj return NULL;
418*38fd1498Szrj }
419*38fd1498Szrj
420*38fd1498Szrj return NULL;
421*38fd1498Szrj }
422*38fd1498Szrj
423*38fd1498Szrj /* A very simple resource-based lower bound on the initiation interval.
424*38fd1498Szrj ??? Improve the accuracy of this bound by considering the
425*38fd1498Szrj utilization of various units. */
426*38fd1498Szrj static int
res_MII(ddg_ptr g)427*38fd1498Szrj res_MII (ddg_ptr g)
428*38fd1498Szrj {
429*38fd1498Szrj if (targetm.sched.sms_res_mii)
430*38fd1498Szrj return targetm.sched.sms_res_mii (g);
431*38fd1498Szrj
432*38fd1498Szrj return ((g->num_nodes - g->num_debug) / issue_rate);
433*38fd1498Szrj }
434*38fd1498Szrj
435*38fd1498Szrj
/* A vector that contains the sched data for each ps_insn.  */
static vec<node_sched_params> node_sched_param_vec;

/* Allocate sched_params for each node and initialize it.  */
static void
set_node_sched_params (ddg_ptr g)
{
  /* Discard any entries from a previous loop, then create one
     zero-initialized record per ddg node.  */
  node_sched_param_vec.truncate (0);
  node_sched_param_vec.safe_grow_cleared (g->num_nodes);
}
446*38fd1498Szrj
447*38fd1498Szrj /* Make sure that node_sched_param_vec has an entry for every move in PS. */
448*38fd1498Szrj static void
extend_node_sched_params(partial_schedule_ptr ps)449*38fd1498Szrj extend_node_sched_params (partial_schedule_ptr ps)
450*38fd1498Szrj {
451*38fd1498Szrj node_sched_param_vec.safe_grow_cleared (ps->g->num_nodes
452*38fd1498Szrj + ps->reg_moves.length ());
453*38fd1498Szrj }
454*38fd1498Szrj
/* Update the sched_params (time, row and stage) for node U using the II,
   the CYCLE of U and MIN_CYCLE.
   We're not simply taking the following
   SCHED_STAGE (u) = CALC_STAGE_COUNT (SCHED_TIME (u), min_cycle, ii);
   because the stages may not be aligned on cycle 0.  */
static void
update_node_sched_params (int u, int ii, int cycle, int min_cycle)
{
  int sc_until_cycle_zero;
  int stage;

  SCHED_TIME (u) = cycle;
  SCHED_ROW (u) = SMODULO (cycle, ii);

  /* The calculation of stage count is done adding the number
     of stages before cycle zero and after cycle zero.  */
  sc_until_cycle_zero = CALC_STAGE_COUNT (-1, min_cycle, ii);

  if (SCHED_TIME (u) < 0)
    {
      /* U lies before cycle zero: its stage is counted back from the
	 number of stages in [min_cycle, -1].  */
      stage = CALC_STAGE_COUNT (-1, SCHED_TIME (u), ii);
      SCHED_STAGE (u) = sc_until_cycle_zero - stage;
    }
  else
    {
      /* U lies at or after cycle zero: add the stages in [0, cycle],
	 minus one to keep stage numbering contiguous.  */
      stage = CALC_STAGE_COUNT (SCHED_TIME (u), 0, ii);
      SCHED_STAGE (u) = sc_until_cycle_zero + stage - 1;
    }
}
484*38fd1498Szrj
485*38fd1498Szrj static void
print_node_sched_params(FILE * file,int num_nodes,partial_schedule_ptr ps)486*38fd1498Szrj print_node_sched_params (FILE *file, int num_nodes, partial_schedule_ptr ps)
487*38fd1498Szrj {
488*38fd1498Szrj int i;
489*38fd1498Szrj
490*38fd1498Szrj if (! file)
491*38fd1498Szrj return;
492*38fd1498Szrj for (i = 0; i < num_nodes; i++)
493*38fd1498Szrj {
494*38fd1498Szrj node_sched_params_ptr nsp = SCHED_PARAMS (i);
495*38fd1498Szrj
496*38fd1498Szrj fprintf (file, "Node = %d; INSN = %d\n", i,
497*38fd1498Szrj INSN_UID (ps_rtl_insn (ps, i)));
498*38fd1498Szrj fprintf (file, " asap = %d:\n", NODE_ASAP (&ps->g->nodes[i]));
499*38fd1498Szrj fprintf (file, " time = %d:\n", nsp->time);
500*38fd1498Szrj fprintf (file, " stage = %d:\n", nsp->stage);
501*38fd1498Szrj }
502*38fd1498Szrj }
503*38fd1498Szrj
504*38fd1498Szrj /* Set SCHED_COLUMN for each instruction in row ROW of PS. */
505*38fd1498Szrj static void
set_columns_for_row(partial_schedule_ptr ps,int row)506*38fd1498Szrj set_columns_for_row (partial_schedule_ptr ps, int row)
507*38fd1498Szrj {
508*38fd1498Szrj ps_insn_ptr cur_insn;
509*38fd1498Szrj int column;
510*38fd1498Szrj
511*38fd1498Szrj column = 0;
512*38fd1498Szrj for (cur_insn = ps->rows[row]; cur_insn; cur_insn = cur_insn->next_in_row)
513*38fd1498Szrj SCHED_COLUMN (cur_insn->id) = column++;
514*38fd1498Szrj }
515*38fd1498Szrj
516*38fd1498Szrj /* Set SCHED_COLUMN for each instruction in PS. */
517*38fd1498Szrj static void
set_columns_for_ps(partial_schedule_ptr ps)518*38fd1498Szrj set_columns_for_ps (partial_schedule_ptr ps)
519*38fd1498Szrj {
520*38fd1498Szrj int row;
521*38fd1498Szrj
522*38fd1498Szrj for (row = 0; row < ps->ii; row++)
523*38fd1498Szrj set_columns_for_row (ps, row);
524*38fd1498Szrj }
525*38fd1498Szrj
/* Try to schedule the move with ps_insn identifier I_REG_MOVE in PS.
   Its single predecessor has already been scheduled, as has its
   ddg node successors.  (The move may have also another move as its
   successor, in which case that successor will be scheduled later.)

   The move is part of a chain that satisfies register dependencies
   between a producing ddg node and various consuming ddg nodes.
   If some of these dependencies have a distance of 1 (meaning that
   the use is upward-exposed) then DISTANCE1_USES is nonnull and
   contains the set of uses with distance-1 dependencies.
   DISTANCE1_USES is null otherwise.

   MUST_FOLLOW is a scratch bitmap that is big enough to hold
   all current ps_insn ids.

   Return true on success.  */
static bool
schedule_reg_move (partial_schedule_ptr ps, int i_reg_move,
		   sbitmap distance1_uses, sbitmap must_follow)
{
  unsigned int u;
  int this_time, this_distance, this_start, this_end, this_latency;
  int start, end, c, ii;
  sbitmap_iterator sbi;
  ps_reg_move_info *move;
  rtx_insn *this_insn;
  ps_insn_ptr psi;

  move = ps_reg_move (ps, i_reg_move);
  ii = ps->ii;
  if (dump_file)
    {
      fprintf (dump_file, "Scheduling register move INSN %d; ii = %d"
	       ", min cycle = %d\n\n", INSN_UID (move->insn), ii,
	       PS_MIN_CYCLE (ps));
      print_rtl_single (dump_file, move->insn);
      fprintf (dump_file, "\n%11s %11s %5s\n", "start", "end", "time");
      fprintf (dump_file, "=========== =========== =====\n");
    }

  /* The legal cycle window [START, END] starts fully open and is
     narrowed by every dependence examined below.  */
  start = INT_MIN;
  end = INT_MAX;

  /* For dependencies of distance 1 between a producer ddg node A
     and consumer ddg node B, we have a chain of dependencies:

        A --(T,L1,1)--> M1 --(T,L2,0)--> M2 ... --(T,Ln,0)--> B

     where Mi is the ith move.  For dependencies of distance 0 between
     a producer ddg node A and consumer ddg node C, we have a chain of
     dependencies:

        A --(T,L1',0)--> M1' --(T,L2',0)--> M2' ... --(T,Ln',0)--> C

     where Mi' occupies the same position as Mi but occurs a stage later.
     We can only schedule each move once, so if we have both types of
     chain, we model the second as:

        A --(T,L1',1)--> M1 --(T,L2',0)--> M2 ... --(T,Ln',-1)--> C

     First handle the dependencies between the previously-scheduled
     predecessor and the move.  */
  this_insn = ps_rtl_insn (ps, move->def);
  this_latency = insn_latency (this_insn, move->insn);
  /* Distance is 1 only for the first move in a distance-1 chain, i.e.
     when the producer is a ddg node rather than another move.  */
  this_distance = distance1_uses && move->def < ps->g->num_nodes ? 1 : 0;
  this_time = SCHED_TIME (move->def) - this_distance * ii;
  this_start = this_time + this_latency;
  this_end = this_time + ii;
  if (dump_file)
    fprintf (dump_file, "%11d %11d %5d %d --(T,%d,%d)--> %d\n",
	     this_start, this_end, SCHED_TIME (move->def),
	     INSN_UID (this_insn), this_latency, this_distance,
	     INSN_UID (move->insn));

  if (start < this_start)
    start = this_start;
  if (end > this_end)
    end = this_end;

  /* Handle the dependencies between the move and previously-scheduled
     successors.  */
  EXECUTE_IF_SET_IN_BITMAP (move->uses, 0, u, sbi)
    {
      this_insn = ps_rtl_insn (ps, u);
      this_latency = insn_latency (move->insn, this_insn);
      /* See the chain-modeling comment above: uses not in
	 DISTANCE1_USES get distance -1 when it is nonnull.  */
      if (distance1_uses && !bitmap_bit_p (distance1_uses, u))
	this_distance = -1;
      else
	this_distance = 0;
      this_time = SCHED_TIME (u) + this_distance * ii;
      this_start = this_time - ii;
      this_end = this_time - this_latency;
      if (dump_file)
	fprintf (dump_file, "%11d %11d %5d %d --(T,%d,%d)--> %d\n",
		 this_start, this_end, SCHED_TIME (u), INSN_UID (move->insn),
		 this_latency, this_distance, INSN_UID (this_insn));

      if (start < this_start)
	start = this_start;
      if (end > this_end)
	end = this_end;
    }

  if (dump_file)
    {
      fprintf (dump_file, "----------- ----------- -----\n");
      fprintf (dump_file, "%11d %11d %5s %s\n", start, end, "", "(max, min)");
    }

  /* The move must come after its definition if they share a row.  */
  bitmap_clear (must_follow);
  bitmap_set_bit (must_follow, move->def);

  /* Restrict the window to at most II cycles, then try cycles from
     latest to earliest.  */
  start = MAX (start, end - (ii - 1));
  for (c = end; c >= start; c--)
    {
      psi = ps_add_node_check_conflicts (ps, i_reg_move, c,
					 move->uses, must_follow);
      if (psi)
	{
	  update_node_sched_params (i_reg_move, ii, c, PS_MIN_CYCLE (ps));
	  if (dump_file)
	    fprintf (dump_file, "\nScheduled register move INSN %d at"
		     " time %d, row %d\n\n", INSN_UID (move->insn), c,
		     SCHED_ROW (i_reg_move));
	  return true;
	}
    }

  if (dump_file)
    fprintf (dump_file, "\nNo available slot\n\n");

  return false;
}
659*38fd1498Szrj
660*38fd1498Szrj /*
661*38fd1498Szrj Breaking intra-loop register anti-dependences:
662*38fd1498Szrj Each intra-loop register anti-dependence implies a cross-iteration true
663*38fd1498Szrj dependence of distance 1. Therefore, we can remove such false dependencies
664*38fd1498Szrj and figure out if the partial schedule broke them by checking if (for a
665*38fd1498Szrj true-dependence of distance 1): SCHED_TIME (def) < SCHED_TIME (use) and
666*38fd1498Szrj if so generate a register move. The number of such moves is equal to:
667*38fd1498Szrj SCHED_TIME (use) - SCHED_TIME (def) { 0 broken
668*38fd1498Szrj nreg_moves = ----------------------------------- + 1 - { dependence.
669*38fd1498Szrj ii { 1 if not.
670*38fd1498Szrj */
671*38fd1498Szrj static bool
schedule_reg_moves(partial_schedule_ptr ps)672*38fd1498Szrj schedule_reg_moves (partial_schedule_ptr ps)
673*38fd1498Szrj {
674*38fd1498Szrj ddg_ptr g = ps->g;
675*38fd1498Szrj int ii = ps->ii;
676*38fd1498Szrj int i;
677*38fd1498Szrj
678*38fd1498Szrj for (i = 0; i < g->num_nodes; i++)
679*38fd1498Szrj {
680*38fd1498Szrj ddg_node_ptr u = &g->nodes[i];
681*38fd1498Szrj ddg_edge_ptr e;
682*38fd1498Szrj int nreg_moves = 0, i_reg_move;
683*38fd1498Szrj rtx prev_reg, old_reg;
684*38fd1498Szrj int first_move;
685*38fd1498Szrj int distances[2];
686*38fd1498Szrj sbitmap distance1_uses;
687*38fd1498Szrj rtx set = single_set (u->insn);
688*38fd1498Szrj
689*38fd1498Szrj /* Skip instructions that do not set a register. */
690*38fd1498Szrj if (set && !REG_P (SET_DEST (set)))
691*38fd1498Szrj continue;
692*38fd1498Szrj
693*38fd1498Szrj /* Compute the number of reg_moves needed for u, by looking at life
694*38fd1498Szrj ranges started at u (excluding self-loops). */
695*38fd1498Szrj distances[0] = distances[1] = false;
696*38fd1498Szrj for (e = u->out; e; e = e->next_out)
697*38fd1498Szrj if (e->type == TRUE_DEP && e->dest != e->src)
698*38fd1498Szrj {
699*38fd1498Szrj int nreg_moves4e = (SCHED_TIME (e->dest->cuid)
700*38fd1498Szrj - SCHED_TIME (e->src->cuid)) / ii;
701*38fd1498Szrj
702*38fd1498Szrj if (e->distance == 1)
703*38fd1498Szrj nreg_moves4e = (SCHED_TIME (e->dest->cuid)
704*38fd1498Szrj - SCHED_TIME (e->src->cuid) + ii) / ii;
705*38fd1498Szrj
706*38fd1498Szrj /* If dest precedes src in the schedule of the kernel, then dest
707*38fd1498Szrj will read before src writes and we can save one reg_copy. */
708*38fd1498Szrj if (SCHED_ROW (e->dest->cuid) == SCHED_ROW (e->src->cuid)
709*38fd1498Szrj && SCHED_COLUMN (e->dest->cuid) < SCHED_COLUMN (e->src->cuid))
710*38fd1498Szrj nreg_moves4e--;
711*38fd1498Szrj
712*38fd1498Szrj if (nreg_moves4e >= 1)
713*38fd1498Szrj {
714*38fd1498Szrj /* !single_set instructions are not supported yet and
715*38fd1498Szrj thus we do not except to encounter them in the loop
716*38fd1498Szrj except from the doloop part. For the latter case
717*38fd1498Szrj we assume no regmoves are generated as the doloop
718*38fd1498Szrj instructions are tied to the branch with an edge. */
719*38fd1498Szrj gcc_assert (set);
720*38fd1498Szrj /* If the instruction contains auto-inc register then
721*38fd1498Szrj validate that the regmov is being generated for the
722*38fd1498Szrj target regsiter rather then the inc'ed register. */
723*38fd1498Szrj gcc_assert (!autoinc_var_is_used_p (u->insn, e->dest->insn));
724*38fd1498Szrj }
725*38fd1498Szrj
726*38fd1498Szrj if (nreg_moves4e)
727*38fd1498Szrj {
728*38fd1498Szrj gcc_assert (e->distance < 2);
729*38fd1498Szrj distances[e->distance] = true;
730*38fd1498Szrj }
731*38fd1498Szrj nreg_moves = MAX (nreg_moves, nreg_moves4e);
732*38fd1498Szrj }
733*38fd1498Szrj
734*38fd1498Szrj if (nreg_moves == 0)
735*38fd1498Szrj continue;
736*38fd1498Szrj
737*38fd1498Szrj /* Create NREG_MOVES register moves. */
738*38fd1498Szrj first_move = ps->reg_moves.length ();
739*38fd1498Szrj ps->reg_moves.safe_grow_cleared (first_move + nreg_moves);
740*38fd1498Szrj extend_node_sched_params (ps);
741*38fd1498Szrj
742*38fd1498Szrj /* Record the moves associated with this node. */
743*38fd1498Szrj first_move += ps->g->num_nodes;
744*38fd1498Szrj
745*38fd1498Szrj /* Generate each move. */
746*38fd1498Szrj old_reg = prev_reg = SET_DEST (set);
747*38fd1498Szrj if (HARD_REGISTER_P (old_reg))
748*38fd1498Szrj return false;
749*38fd1498Szrj
750*38fd1498Szrj for (i_reg_move = 0; i_reg_move < nreg_moves; i_reg_move++)
751*38fd1498Szrj {
752*38fd1498Szrj ps_reg_move_info *move = ps_reg_move (ps, first_move + i_reg_move);
753*38fd1498Szrj
754*38fd1498Szrj move->def = i_reg_move > 0 ? first_move + i_reg_move - 1 : i;
755*38fd1498Szrj move->uses = sbitmap_alloc (first_move + nreg_moves);
756*38fd1498Szrj move->old_reg = old_reg;
757*38fd1498Szrj move->new_reg = gen_reg_rtx (GET_MODE (prev_reg));
758*38fd1498Szrj move->num_consecutive_stages = distances[0] && distances[1] ? 2 : 1;
759*38fd1498Szrj move->insn = gen_move_insn (move->new_reg, copy_rtx (prev_reg));
760*38fd1498Szrj bitmap_clear (move->uses);
761*38fd1498Szrj
762*38fd1498Szrj prev_reg = move->new_reg;
763*38fd1498Szrj }
764*38fd1498Szrj
765*38fd1498Szrj distance1_uses = distances[1] ? sbitmap_alloc (g->num_nodes) : NULL;
766*38fd1498Szrj
767*38fd1498Szrj if (distance1_uses)
768*38fd1498Szrj bitmap_clear (distance1_uses);
769*38fd1498Szrj
770*38fd1498Szrj /* Every use of the register defined by node may require a different
771*38fd1498Szrj copy of this register, depending on the time the use is scheduled.
772*38fd1498Szrj Record which uses require which move results. */
773*38fd1498Szrj for (e = u->out; e; e = e->next_out)
774*38fd1498Szrj if (e->type == TRUE_DEP && e->dest != e->src)
775*38fd1498Szrj {
776*38fd1498Szrj int dest_copy = (SCHED_TIME (e->dest->cuid)
777*38fd1498Szrj - SCHED_TIME (e->src->cuid)) / ii;
778*38fd1498Szrj
779*38fd1498Szrj if (e->distance == 1)
780*38fd1498Szrj dest_copy = (SCHED_TIME (e->dest->cuid)
781*38fd1498Szrj - SCHED_TIME (e->src->cuid) + ii) / ii;
782*38fd1498Szrj
783*38fd1498Szrj if (SCHED_ROW (e->dest->cuid) == SCHED_ROW (e->src->cuid)
784*38fd1498Szrj && SCHED_COLUMN (e->dest->cuid) < SCHED_COLUMN (e->src->cuid))
785*38fd1498Szrj dest_copy--;
786*38fd1498Szrj
787*38fd1498Szrj if (dest_copy)
788*38fd1498Szrj {
789*38fd1498Szrj ps_reg_move_info *move;
790*38fd1498Szrj
791*38fd1498Szrj move = ps_reg_move (ps, first_move + dest_copy - 1);
792*38fd1498Szrj bitmap_set_bit (move->uses, e->dest->cuid);
793*38fd1498Szrj if (e->distance == 1)
794*38fd1498Szrj bitmap_set_bit (distance1_uses, e->dest->cuid);
795*38fd1498Szrj }
796*38fd1498Szrj }
797*38fd1498Szrj
798*38fd1498Szrj auto_sbitmap must_follow (first_move + nreg_moves);
799*38fd1498Szrj for (i_reg_move = 0; i_reg_move < nreg_moves; i_reg_move++)
800*38fd1498Szrj if (!schedule_reg_move (ps, first_move + i_reg_move,
801*38fd1498Szrj distance1_uses, must_follow))
802*38fd1498Szrj break;
803*38fd1498Szrj if (distance1_uses)
804*38fd1498Szrj sbitmap_free (distance1_uses);
805*38fd1498Szrj if (i_reg_move < nreg_moves)
806*38fd1498Szrj return false;
807*38fd1498Szrj }
808*38fd1498Szrj return true;
809*38fd1498Szrj }
810*38fd1498Szrj
811*38fd1498Szrj /* Emit the moves associated with PS. Apply the substitutions
812*38fd1498Szrj associated with them. */
813*38fd1498Szrj static void
apply_reg_moves(partial_schedule_ptr ps)814*38fd1498Szrj apply_reg_moves (partial_schedule_ptr ps)
815*38fd1498Szrj {
816*38fd1498Szrj ps_reg_move_info *move;
817*38fd1498Szrj int i;
818*38fd1498Szrj
819*38fd1498Szrj FOR_EACH_VEC_ELT (ps->reg_moves, i, move)
820*38fd1498Szrj {
821*38fd1498Szrj unsigned int i_use;
822*38fd1498Szrj sbitmap_iterator sbi;
823*38fd1498Szrj
824*38fd1498Szrj EXECUTE_IF_SET_IN_BITMAP (move->uses, 0, i_use, sbi)
825*38fd1498Szrj {
826*38fd1498Szrj replace_rtx (ps->g->nodes[i_use].insn, move->old_reg, move->new_reg);
827*38fd1498Szrj df_insn_rescan (ps->g->nodes[i_use].insn);
828*38fd1498Szrj }
829*38fd1498Szrj }
830*38fd1498Szrj }
831*38fd1498Szrj
832*38fd1498Szrj /* Bump the SCHED_TIMEs of all nodes by AMOUNT. Set the values of
833*38fd1498Szrj SCHED_ROW and SCHED_STAGE. Instruction scheduled on cycle AMOUNT
834*38fd1498Szrj will move to cycle zero. */
835*38fd1498Szrj static void
reset_sched_times(partial_schedule_ptr ps,int amount)836*38fd1498Szrj reset_sched_times (partial_schedule_ptr ps, int amount)
837*38fd1498Szrj {
838*38fd1498Szrj int row;
839*38fd1498Szrj int ii = ps->ii;
840*38fd1498Szrj ps_insn_ptr crr_insn;
841*38fd1498Szrj
842*38fd1498Szrj for (row = 0; row < ii; row++)
843*38fd1498Szrj for (crr_insn = ps->rows[row]; crr_insn; crr_insn = crr_insn->next_in_row)
844*38fd1498Szrj {
845*38fd1498Szrj int u = crr_insn->id;
846*38fd1498Szrj int normalized_time = SCHED_TIME (u) - amount;
847*38fd1498Szrj int new_min_cycle = PS_MIN_CYCLE (ps) - amount;
848*38fd1498Szrj
849*38fd1498Szrj if (dump_file)
850*38fd1498Szrj {
851*38fd1498Szrj /* Print the scheduling times after the rotation. */
852*38fd1498Szrj rtx_insn *insn = ps_rtl_insn (ps, u);
853*38fd1498Szrj
854*38fd1498Szrj fprintf (dump_file, "crr_insn->node=%d (insn id %d), "
855*38fd1498Szrj "crr_insn->cycle=%d, min_cycle=%d", u,
856*38fd1498Szrj INSN_UID (insn), normalized_time, new_min_cycle);
857*38fd1498Szrj if (JUMP_P (insn))
858*38fd1498Szrj fprintf (dump_file, " (branch)");
859*38fd1498Szrj fprintf (dump_file, "\n");
860*38fd1498Szrj }
861*38fd1498Szrj
862*38fd1498Szrj gcc_assert (SCHED_TIME (u) >= ps->min_cycle);
863*38fd1498Szrj gcc_assert (SCHED_TIME (u) <= ps->max_cycle);
864*38fd1498Szrj
865*38fd1498Szrj crr_insn->cycle = normalized_time;
866*38fd1498Szrj update_node_sched_params (u, ii, normalized_time, new_min_cycle);
867*38fd1498Szrj }
868*38fd1498Szrj }
869*38fd1498Szrj
870*38fd1498Szrj /* Permute the insns according to their order in PS, from row 0 to
871*38fd1498Szrj row ii-1, and position them right before LAST. This schedules
872*38fd1498Szrj the insns of the loop kernel. */
873*38fd1498Szrj static void
permute_partial_schedule(partial_schedule_ptr ps,rtx_insn * last)874*38fd1498Szrj permute_partial_schedule (partial_schedule_ptr ps, rtx_insn *last)
875*38fd1498Szrj {
876*38fd1498Szrj int ii = ps->ii;
877*38fd1498Szrj int row;
878*38fd1498Szrj ps_insn_ptr ps_ij;
879*38fd1498Szrj
880*38fd1498Szrj for (row = 0; row < ii ; row++)
881*38fd1498Szrj for (ps_ij = ps->rows[row]; ps_ij; ps_ij = ps_ij->next_in_row)
882*38fd1498Szrj {
883*38fd1498Szrj rtx_insn *insn = ps_rtl_insn (ps, ps_ij->id);
884*38fd1498Szrj
885*38fd1498Szrj if (PREV_INSN (last) != insn)
886*38fd1498Szrj {
887*38fd1498Szrj if (ps_ij->id < ps->g->num_nodes)
888*38fd1498Szrj reorder_insns_nobb (ps_first_note (ps, ps_ij->id), insn,
889*38fd1498Szrj PREV_INSN (last));
890*38fd1498Szrj else
891*38fd1498Szrj add_insn_before (insn, last, NULL);
892*38fd1498Szrj }
893*38fd1498Szrj }
894*38fd1498Szrj }
895*38fd1498Szrj
896*38fd1498Szrj /* Set bitmaps TMP_FOLLOW and TMP_PRECEDE to MUST_FOLLOW and MUST_PRECEDE
897*38fd1498Szrj respectively only if cycle C falls on the border of the scheduling
898*38fd1498Szrj window boundaries marked by START and END cycles. STEP is the
899*38fd1498Szrj direction of the window. */
900*38fd1498Szrj static inline void
set_must_precede_follow(sbitmap * tmp_follow,sbitmap must_follow,sbitmap * tmp_precede,sbitmap must_precede,int c,int start,int end,int step)901*38fd1498Szrj set_must_precede_follow (sbitmap *tmp_follow, sbitmap must_follow,
902*38fd1498Szrj sbitmap *tmp_precede, sbitmap must_precede, int c,
903*38fd1498Szrj int start, int end, int step)
904*38fd1498Szrj {
905*38fd1498Szrj *tmp_precede = NULL;
906*38fd1498Szrj *tmp_follow = NULL;
907*38fd1498Szrj
908*38fd1498Szrj if (c == start)
909*38fd1498Szrj {
910*38fd1498Szrj if (step == 1)
911*38fd1498Szrj *tmp_precede = must_precede;
912*38fd1498Szrj else /* step == -1. */
913*38fd1498Szrj *tmp_follow = must_follow;
914*38fd1498Szrj }
915*38fd1498Szrj if (c == end - step)
916*38fd1498Szrj {
917*38fd1498Szrj if (step == 1)
918*38fd1498Szrj *tmp_follow = must_follow;
919*38fd1498Szrj else /* step == -1. */
920*38fd1498Szrj *tmp_precede = must_precede;
921*38fd1498Szrj }
922*38fd1498Szrj
923*38fd1498Szrj }
924*38fd1498Szrj
925*38fd1498Szrj /* Return True if the branch can be moved to row ii-1 while
926*38fd1498Szrj normalizing the partial schedule PS to start from cycle zero and thus
927*38fd1498Szrj optimize the SC. Otherwise return False. */
928*38fd1498Szrj static bool
optimize_sc(partial_schedule_ptr ps,ddg_ptr g)929*38fd1498Szrj optimize_sc (partial_schedule_ptr ps, ddg_ptr g)
930*38fd1498Szrj {
931*38fd1498Szrj int amount = PS_MIN_CYCLE (ps);
932*38fd1498Szrj int start, end, step;
933*38fd1498Szrj int ii = ps->ii;
934*38fd1498Szrj bool ok = false;
935*38fd1498Szrj int stage_count, stage_count_curr;
936*38fd1498Szrj
937*38fd1498Szrj /* Compare the SC after normalization and SC after bringing the branch
938*38fd1498Szrj to row ii-1. If they are equal just bail out. */
939*38fd1498Szrj stage_count = calculate_stage_count (ps, amount);
940*38fd1498Szrj stage_count_curr =
941*38fd1498Szrj calculate_stage_count (ps, SCHED_TIME (g->closing_branch->cuid) - (ii - 1));
942*38fd1498Szrj
943*38fd1498Szrj if (stage_count == stage_count_curr)
944*38fd1498Szrj {
945*38fd1498Szrj if (dump_file)
946*38fd1498Szrj fprintf (dump_file, "SMS SC already optimized.\n");
947*38fd1498Szrj
948*38fd1498Szrj return false;
949*38fd1498Szrj }
950*38fd1498Szrj
951*38fd1498Szrj if (dump_file)
952*38fd1498Szrj {
953*38fd1498Szrj fprintf (dump_file, "SMS Trying to optimize branch location\n");
954*38fd1498Szrj fprintf (dump_file, "SMS partial schedule before trial:\n");
955*38fd1498Szrj print_partial_schedule (ps, dump_file);
956*38fd1498Szrj }
957*38fd1498Szrj
958*38fd1498Szrj /* First, normalize the partial scheduling. */
959*38fd1498Szrj reset_sched_times (ps, amount);
960*38fd1498Szrj rotate_partial_schedule (ps, amount);
961*38fd1498Szrj if (dump_file)
962*38fd1498Szrj {
963*38fd1498Szrj fprintf (dump_file,
964*38fd1498Szrj "SMS partial schedule after normalization (ii, %d, SC %d):\n",
965*38fd1498Szrj ii, stage_count);
966*38fd1498Szrj print_partial_schedule (ps, dump_file);
967*38fd1498Szrj }
968*38fd1498Szrj
969*38fd1498Szrj if (SMODULO (SCHED_TIME (g->closing_branch->cuid), ii) == ii - 1)
970*38fd1498Szrj return true;
971*38fd1498Szrj
972*38fd1498Szrj auto_sbitmap sched_nodes (g->num_nodes);
973*38fd1498Szrj bitmap_ones (sched_nodes);
974*38fd1498Szrj
975*38fd1498Szrj /* Calculate the new placement of the branch. It should be in row
976*38fd1498Szrj ii-1 and fall into it's scheduling window. */
977*38fd1498Szrj if (get_sched_window (ps, g->closing_branch, sched_nodes, ii, &start,
978*38fd1498Szrj &step, &end) == 0)
979*38fd1498Szrj {
980*38fd1498Szrj bool success;
981*38fd1498Szrj ps_insn_ptr next_ps_i;
982*38fd1498Szrj int branch_cycle = SCHED_TIME (g->closing_branch->cuid);
983*38fd1498Szrj int row = SMODULO (branch_cycle, ps->ii);
984*38fd1498Szrj int num_splits = 0;
985*38fd1498Szrj sbitmap tmp_precede, tmp_follow;
986*38fd1498Szrj int min_cycle, c;
987*38fd1498Szrj
988*38fd1498Szrj if (dump_file)
989*38fd1498Szrj fprintf (dump_file, "\nTrying to schedule node %d "
990*38fd1498Szrj "INSN = %d in (%d .. %d) step %d\n",
991*38fd1498Szrj g->closing_branch->cuid,
992*38fd1498Szrj (INSN_UID (g->closing_branch->insn)), start, end, step);
993*38fd1498Szrj
994*38fd1498Szrj gcc_assert ((step > 0 && start < end) || (step < 0 && start > end));
995*38fd1498Szrj if (step == 1)
996*38fd1498Szrj {
997*38fd1498Szrj c = start + ii - SMODULO (start, ii) - 1;
998*38fd1498Szrj gcc_assert (c >= start);
999*38fd1498Szrj if (c >= end)
1000*38fd1498Szrj {
1001*38fd1498Szrj if (dump_file)
1002*38fd1498Szrj fprintf (dump_file,
1003*38fd1498Szrj "SMS failed to schedule branch at cycle: %d\n", c);
1004*38fd1498Szrj return false;
1005*38fd1498Szrj }
1006*38fd1498Szrj }
1007*38fd1498Szrj else
1008*38fd1498Szrj {
1009*38fd1498Szrj c = start - SMODULO (start, ii) - 1;
1010*38fd1498Szrj gcc_assert (c <= start);
1011*38fd1498Szrj
1012*38fd1498Szrj if (c <= end)
1013*38fd1498Szrj {
1014*38fd1498Szrj if (dump_file)
1015*38fd1498Szrj fprintf (dump_file,
1016*38fd1498Szrj "SMS failed to schedule branch at cycle: %d\n", c);
1017*38fd1498Szrj return false;
1018*38fd1498Szrj }
1019*38fd1498Szrj }
1020*38fd1498Szrj
1021*38fd1498Szrj auto_sbitmap must_precede (g->num_nodes);
1022*38fd1498Szrj auto_sbitmap must_follow (g->num_nodes);
1023*38fd1498Szrj
1024*38fd1498Szrj /* Try to schedule the branch is it's new cycle. */
1025*38fd1498Szrj calculate_must_precede_follow (g->closing_branch, start, end,
1026*38fd1498Szrj step, ii, sched_nodes,
1027*38fd1498Szrj must_precede, must_follow);
1028*38fd1498Szrj
1029*38fd1498Szrj set_must_precede_follow (&tmp_follow, must_follow, &tmp_precede,
1030*38fd1498Szrj must_precede, c, start, end, step);
1031*38fd1498Szrj
1032*38fd1498Szrj /* Find the element in the partial schedule related to the closing
1033*38fd1498Szrj branch so we can remove it from it's current cycle. */
1034*38fd1498Szrj for (next_ps_i = ps->rows[row];
1035*38fd1498Szrj next_ps_i; next_ps_i = next_ps_i->next_in_row)
1036*38fd1498Szrj if (next_ps_i->id == g->closing_branch->cuid)
1037*38fd1498Szrj break;
1038*38fd1498Szrj
1039*38fd1498Szrj min_cycle = PS_MIN_CYCLE (ps) - SMODULO (PS_MIN_CYCLE (ps), ps->ii);
1040*38fd1498Szrj remove_node_from_ps (ps, next_ps_i);
1041*38fd1498Szrj success =
1042*38fd1498Szrj try_scheduling_node_in_cycle (ps, g->closing_branch->cuid, c,
1043*38fd1498Szrj sched_nodes, &num_splits,
1044*38fd1498Szrj tmp_precede, tmp_follow);
1045*38fd1498Szrj gcc_assert (num_splits == 0);
1046*38fd1498Szrj if (!success)
1047*38fd1498Szrj {
1048*38fd1498Szrj if (dump_file)
1049*38fd1498Szrj fprintf (dump_file,
1050*38fd1498Szrj "SMS failed to schedule branch at cycle: %d, "
1051*38fd1498Szrj "bringing it back to cycle %d\n", c, branch_cycle);
1052*38fd1498Szrj
1053*38fd1498Szrj /* The branch was failed to be placed in row ii - 1.
1054*38fd1498Szrj Put it back in it's original place in the partial
1055*38fd1498Szrj schedualing. */
1056*38fd1498Szrj set_must_precede_follow (&tmp_follow, must_follow, &tmp_precede,
1057*38fd1498Szrj must_precede, branch_cycle, start, end,
1058*38fd1498Szrj step);
1059*38fd1498Szrj success =
1060*38fd1498Szrj try_scheduling_node_in_cycle (ps, g->closing_branch->cuid,
1061*38fd1498Szrj branch_cycle, sched_nodes,
1062*38fd1498Szrj &num_splits, tmp_precede,
1063*38fd1498Szrj tmp_follow);
1064*38fd1498Szrj gcc_assert (success && (num_splits == 0));
1065*38fd1498Szrj ok = false;
1066*38fd1498Szrj }
1067*38fd1498Szrj else
1068*38fd1498Szrj {
1069*38fd1498Szrj /* The branch is placed in row ii - 1. */
1070*38fd1498Szrj if (dump_file)
1071*38fd1498Szrj fprintf (dump_file,
1072*38fd1498Szrj "SMS success in moving branch to cycle %d\n", c);
1073*38fd1498Szrj
1074*38fd1498Szrj update_node_sched_params (g->closing_branch->cuid, ii, c,
1075*38fd1498Szrj PS_MIN_CYCLE (ps));
1076*38fd1498Szrj ok = true;
1077*38fd1498Szrj }
1078*38fd1498Szrj
1079*38fd1498Szrj /* This might have been added to a new first stage. */
1080*38fd1498Szrj if (PS_MIN_CYCLE (ps) < min_cycle)
1081*38fd1498Szrj reset_sched_times (ps, 0);
1082*38fd1498Szrj }
1083*38fd1498Szrj
1084*38fd1498Szrj return ok;
1085*38fd1498Szrj }
1086*38fd1498Szrj
1087*38fd1498Szrj static void
duplicate_insns_of_cycles(partial_schedule_ptr ps,int from_stage,int to_stage,rtx count_reg)1088*38fd1498Szrj duplicate_insns_of_cycles (partial_schedule_ptr ps, int from_stage,
1089*38fd1498Szrj int to_stage, rtx count_reg)
1090*38fd1498Szrj {
1091*38fd1498Szrj int row;
1092*38fd1498Szrj ps_insn_ptr ps_ij;
1093*38fd1498Szrj
1094*38fd1498Szrj for (row = 0; row < ps->ii; row++)
1095*38fd1498Szrj for (ps_ij = ps->rows[row]; ps_ij; ps_ij = ps_ij->next_in_row)
1096*38fd1498Szrj {
1097*38fd1498Szrj int u = ps_ij->id;
1098*38fd1498Szrj int first_u, last_u;
1099*38fd1498Szrj rtx_insn *u_insn;
1100*38fd1498Szrj
1101*38fd1498Szrj /* Do not duplicate any insn which refers to count_reg as it
1102*38fd1498Szrj belongs to the control part.
1103*38fd1498Szrj The closing branch is scheduled as well and thus should
1104*38fd1498Szrj be ignored.
1105*38fd1498Szrj TODO: This should be done by analyzing the control part of
1106*38fd1498Szrj the loop. */
1107*38fd1498Szrj u_insn = ps_rtl_insn (ps, u);
1108*38fd1498Szrj if (reg_mentioned_p (count_reg, u_insn)
1109*38fd1498Szrj || JUMP_P (u_insn))
1110*38fd1498Szrj continue;
1111*38fd1498Szrj
1112*38fd1498Szrj first_u = SCHED_STAGE (u);
1113*38fd1498Szrj last_u = first_u + ps_num_consecutive_stages (ps, u) - 1;
1114*38fd1498Szrj if (from_stage <= last_u && to_stage >= first_u)
1115*38fd1498Szrj {
1116*38fd1498Szrj if (u < ps->g->num_nodes)
1117*38fd1498Szrj duplicate_insn_chain (ps_first_note (ps, u), u_insn);
1118*38fd1498Szrj else
1119*38fd1498Szrj emit_insn (copy_rtx (PATTERN (u_insn)));
1120*38fd1498Szrj }
1121*38fd1498Szrj }
1122*38fd1498Szrj }
1123*38fd1498Szrj
1124*38fd1498Szrj
1125*38fd1498Szrj /* Generate the instructions (including reg_moves) for prolog & epilog. */
1126*38fd1498Szrj static void
generate_prolog_epilog(partial_schedule_ptr ps,struct loop * loop,rtx count_reg,rtx count_init)1127*38fd1498Szrj generate_prolog_epilog (partial_schedule_ptr ps, struct loop *loop,
1128*38fd1498Szrj rtx count_reg, rtx count_init)
1129*38fd1498Szrj {
1130*38fd1498Szrj int i;
1131*38fd1498Szrj int last_stage = PS_STAGE_COUNT (ps) - 1;
1132*38fd1498Szrj edge e;
1133*38fd1498Szrj
1134*38fd1498Szrj /* Generate the prolog, inserting its insns on the loop-entry edge. */
1135*38fd1498Szrj start_sequence ();
1136*38fd1498Szrj
1137*38fd1498Szrj if (!count_init)
1138*38fd1498Szrj {
1139*38fd1498Szrj /* Generate instructions at the beginning of the prolog to
1140*38fd1498Szrj adjust the loop count by STAGE_COUNT. If loop count is constant
1141*38fd1498Szrj (count_init), this constant is adjusted by STAGE_COUNT in
1142*38fd1498Szrj generate_prolog_epilog function. */
1143*38fd1498Szrj rtx sub_reg = NULL_RTX;
1144*38fd1498Szrj
1145*38fd1498Szrj sub_reg = expand_simple_binop (GET_MODE (count_reg), MINUS, count_reg,
1146*38fd1498Szrj gen_int_mode (last_stage,
1147*38fd1498Szrj GET_MODE (count_reg)),
1148*38fd1498Szrj count_reg, 1, OPTAB_DIRECT);
1149*38fd1498Szrj gcc_assert (REG_P (sub_reg));
1150*38fd1498Szrj if (REGNO (sub_reg) != REGNO (count_reg))
1151*38fd1498Szrj emit_move_insn (count_reg, sub_reg);
1152*38fd1498Szrj }
1153*38fd1498Szrj
1154*38fd1498Szrj for (i = 0; i < last_stage; i++)
1155*38fd1498Szrj duplicate_insns_of_cycles (ps, 0, i, count_reg);
1156*38fd1498Szrj
1157*38fd1498Szrj /* Put the prolog on the entry edge. */
1158*38fd1498Szrj e = loop_preheader_edge (loop);
1159*38fd1498Szrj split_edge_and_insert (e, get_insns ());
1160*38fd1498Szrj if (!flag_resched_modulo_sched)
1161*38fd1498Szrj e->dest->flags |= BB_DISABLE_SCHEDULE;
1162*38fd1498Szrj
1163*38fd1498Szrj end_sequence ();
1164*38fd1498Szrj
1165*38fd1498Szrj /* Generate the epilog, inserting its insns on the loop-exit edge. */
1166*38fd1498Szrj start_sequence ();
1167*38fd1498Szrj
1168*38fd1498Szrj for (i = 0; i < last_stage; i++)
1169*38fd1498Szrj duplicate_insns_of_cycles (ps, i + 1, last_stage, count_reg);
1170*38fd1498Szrj
1171*38fd1498Szrj /* Put the epilogue on the exit edge. */
1172*38fd1498Szrj gcc_assert (single_exit (loop));
1173*38fd1498Szrj e = single_exit (loop);
1174*38fd1498Szrj split_edge_and_insert (e, get_insns ());
1175*38fd1498Szrj if (!flag_resched_modulo_sched)
1176*38fd1498Szrj e->dest->flags |= BB_DISABLE_SCHEDULE;
1177*38fd1498Szrj
1178*38fd1498Szrj end_sequence ();
1179*38fd1498Szrj }
1180*38fd1498Szrj
1181*38fd1498Szrj /* Mark LOOP as software pipelined so the later
1182*38fd1498Szrj scheduling passes don't touch it. */
1183*38fd1498Szrj static void
mark_loop_unsched(struct loop * loop)1184*38fd1498Szrj mark_loop_unsched (struct loop *loop)
1185*38fd1498Szrj {
1186*38fd1498Szrj unsigned i;
1187*38fd1498Szrj basic_block *bbs = get_loop_body (loop);
1188*38fd1498Szrj
1189*38fd1498Szrj for (i = 0; i < loop->num_nodes; i++)
1190*38fd1498Szrj bbs[i]->flags |= BB_DISABLE_SCHEDULE;
1191*38fd1498Szrj
1192*38fd1498Szrj free (bbs);
1193*38fd1498Szrj }
1194*38fd1498Szrj
1195*38fd1498Szrj /* Return true if all the BBs of the loop are empty except the
1196*38fd1498Szrj loop header. */
1197*38fd1498Szrj static bool
loop_single_full_bb_p(struct loop * loop)1198*38fd1498Szrj loop_single_full_bb_p (struct loop *loop)
1199*38fd1498Szrj {
1200*38fd1498Szrj unsigned i;
1201*38fd1498Szrj basic_block *bbs = get_loop_body (loop);
1202*38fd1498Szrj
1203*38fd1498Szrj for (i = 0; i < loop->num_nodes ; i++)
1204*38fd1498Szrj {
1205*38fd1498Szrj rtx_insn *head, *tail;
1206*38fd1498Szrj bool empty_bb = true;
1207*38fd1498Szrj
1208*38fd1498Szrj if (bbs[i] == loop->header)
1209*38fd1498Szrj continue;
1210*38fd1498Szrj
1211*38fd1498Szrj /* Make sure that basic blocks other than the header
1212*38fd1498Szrj have only notes labels or jumps. */
1213*38fd1498Szrj get_ebb_head_tail (bbs[i], bbs[i], &head, &tail);
1214*38fd1498Szrj for (; head != NEXT_INSN (tail); head = NEXT_INSN (head))
1215*38fd1498Szrj {
1216*38fd1498Szrj if (NOTE_P (head) || LABEL_P (head)
1217*38fd1498Szrj || (INSN_P (head) && (DEBUG_INSN_P (head) || JUMP_P (head))))
1218*38fd1498Szrj continue;
1219*38fd1498Szrj empty_bb = false;
1220*38fd1498Szrj break;
1221*38fd1498Szrj }
1222*38fd1498Szrj
1223*38fd1498Szrj if (! empty_bb)
1224*38fd1498Szrj {
1225*38fd1498Szrj free (bbs);
1226*38fd1498Szrj return false;
1227*38fd1498Szrj }
1228*38fd1498Szrj }
1229*38fd1498Szrj free (bbs);
1230*38fd1498Szrj return true;
1231*38fd1498Szrj }
1232*38fd1498Szrj
1233*38fd1498Szrj /* Dump file:line from INSN's location info to dump_file. */
1234*38fd1498Szrj
1235*38fd1498Szrj static void
dump_insn_location(rtx_insn * insn)1236*38fd1498Szrj dump_insn_location (rtx_insn *insn)
1237*38fd1498Szrj {
1238*38fd1498Szrj if (dump_file && INSN_HAS_LOCATION (insn))
1239*38fd1498Szrj {
1240*38fd1498Szrj expanded_location xloc = insn_location (insn);
1241*38fd1498Szrj fprintf (dump_file, " %s:%i", xloc.file, xloc.line);
1242*38fd1498Szrj }
1243*38fd1498Szrj }
1244*38fd1498Szrj
1245*38fd1498Szrj /* A simple loop from SMS point of view; it is a loop that is composed of
1246*38fd1498Szrj either a single basic block or two BBs - a header and a latch. */
1247*38fd1498Szrj #define SIMPLE_SMS_LOOP_P(loop) ((loop->num_nodes < 3 ) \
1248*38fd1498Szrj && (EDGE_COUNT (loop->latch->preds) == 1) \
1249*38fd1498Szrj && (EDGE_COUNT (loop->latch->succs) == 1))
1250*38fd1498Szrj
1251*38fd1498Szrj /* Return true if the loop is in its canonical form and false if not.
1252*38fd1498Szrj i.e. SIMPLE_SMS_LOOP_P and have one preheader block, and single exit. */
1253*38fd1498Szrj static bool
loop_canon_p(struct loop * loop)1254*38fd1498Szrj loop_canon_p (struct loop *loop)
1255*38fd1498Szrj {
1256*38fd1498Szrj
1257*38fd1498Szrj if (loop->inner || !loop_outer (loop))
1258*38fd1498Szrj {
1259*38fd1498Szrj if (dump_file)
1260*38fd1498Szrj fprintf (dump_file, "SMS loop inner or !loop_outer\n");
1261*38fd1498Szrj return false;
1262*38fd1498Szrj }
1263*38fd1498Szrj
1264*38fd1498Szrj if (!single_exit (loop))
1265*38fd1498Szrj {
1266*38fd1498Szrj if (dump_file)
1267*38fd1498Szrj {
1268*38fd1498Szrj rtx_insn *insn = BB_END (loop->header);
1269*38fd1498Szrj
1270*38fd1498Szrj fprintf (dump_file, "SMS loop many exits");
1271*38fd1498Szrj dump_insn_location (insn);
1272*38fd1498Szrj fprintf (dump_file, "\n");
1273*38fd1498Szrj }
1274*38fd1498Szrj return false;
1275*38fd1498Szrj }
1276*38fd1498Szrj
1277*38fd1498Szrj if (! SIMPLE_SMS_LOOP_P (loop) && ! loop_single_full_bb_p (loop))
1278*38fd1498Szrj {
1279*38fd1498Szrj if (dump_file)
1280*38fd1498Szrj {
1281*38fd1498Szrj rtx_insn *insn = BB_END (loop->header);
1282*38fd1498Szrj
1283*38fd1498Szrj fprintf (dump_file, "SMS loop many BBs.");
1284*38fd1498Szrj dump_insn_location (insn);
1285*38fd1498Szrj fprintf (dump_file, "\n");
1286*38fd1498Szrj }
1287*38fd1498Szrj return false;
1288*38fd1498Szrj }
1289*38fd1498Szrj
1290*38fd1498Szrj return true;
1291*38fd1498Szrj }
1292*38fd1498Szrj
1293*38fd1498Szrj /* If there are more than one entry for the loop,
1294*38fd1498Szrj make it one by splitting the first entry edge and
1295*38fd1498Szrj redirecting the others to the new BB. */
1296*38fd1498Szrj static void
canon_loop(struct loop * loop)1297*38fd1498Szrj canon_loop (struct loop *loop)
1298*38fd1498Szrj {
1299*38fd1498Szrj edge e;
1300*38fd1498Szrj edge_iterator i;
1301*38fd1498Szrj
1302*38fd1498Szrj /* Avoid annoying special cases of edges going to exit
1303*38fd1498Szrj block. */
1304*38fd1498Szrj FOR_EACH_EDGE (e, i, EXIT_BLOCK_PTR_FOR_FN (cfun)->preds)
1305*38fd1498Szrj if ((e->flags & EDGE_FALLTHRU) && (EDGE_COUNT (e->src->succs) > 1))
1306*38fd1498Szrj split_edge (e);
1307*38fd1498Szrj
1308*38fd1498Szrj if (loop->latch == loop->header
1309*38fd1498Szrj || EDGE_COUNT (loop->latch->succs) > 1)
1310*38fd1498Szrj {
1311*38fd1498Szrj FOR_EACH_EDGE (e, i, loop->header->preds)
1312*38fd1498Szrj if (e->src == loop->latch)
1313*38fd1498Szrj break;
1314*38fd1498Szrj split_edge (e);
1315*38fd1498Szrj }
1316*38fd1498Szrj }
1317*38fd1498Szrj
1318*38fd1498Szrj /* Setup infos. */
1319*38fd1498Szrj static void
setup_sched_infos(void)1320*38fd1498Szrj setup_sched_infos (void)
1321*38fd1498Szrj {
1322*38fd1498Szrj memcpy (&sms_common_sched_info, &haifa_common_sched_info,
1323*38fd1498Szrj sizeof (sms_common_sched_info));
1324*38fd1498Szrj sms_common_sched_info.sched_pass_id = SCHED_SMS_PASS;
1325*38fd1498Szrj common_sched_info = &sms_common_sched_info;
1326*38fd1498Szrj
1327*38fd1498Szrj sched_deps_info = &sms_sched_deps_info;
1328*38fd1498Szrj current_sched_info = &sms_sched_info;
1329*38fd1498Szrj }
1330*38fd1498Szrj
1331*38fd1498Szrj /* Probability in % that the sms-ed loop rolls enough so that optimized
1332*38fd1498Szrj version may be entered. Just a guess. */
1333*38fd1498Szrj #define PROB_SMS_ENOUGH_ITERATIONS 80
1334*38fd1498Szrj
1335*38fd1498Szrj /* Used to calculate the upper bound of ii. */
1336*38fd1498Szrj #define MAXII_FACTOR 2
1337*38fd1498Szrj
1338*38fd1498Szrj /* Main entry point, perform SMS scheduling on the loops of the function
1339*38fd1498Szrj that consist of single basic blocks. */
1340*38fd1498Szrj static void
sms_schedule(void)1341*38fd1498Szrj sms_schedule (void)
1342*38fd1498Szrj {
1343*38fd1498Szrj rtx_insn *insn;
1344*38fd1498Szrj ddg_ptr *g_arr, g;
1345*38fd1498Szrj int * node_order;
1346*38fd1498Szrj int maxii, max_asap;
1347*38fd1498Szrj partial_schedule_ptr ps;
1348*38fd1498Szrj basic_block bb = NULL;
1349*38fd1498Szrj struct loop *loop;
1350*38fd1498Szrj basic_block condition_bb = NULL;
1351*38fd1498Szrj edge latch_edge;
1352*38fd1498Szrj HOST_WIDE_INT trip_count, max_trip_count;
1353*38fd1498Szrj
1354*38fd1498Szrj loop_optimizer_init (LOOPS_HAVE_PREHEADERS
1355*38fd1498Szrj | LOOPS_HAVE_RECORDED_EXITS);
1356*38fd1498Szrj if (number_of_loops (cfun) <= 1)
1357*38fd1498Szrj {
1358*38fd1498Szrj loop_optimizer_finalize ();
1359*38fd1498Szrj return; /* There are no loops to schedule. */
1360*38fd1498Szrj }
1361*38fd1498Szrj
1362*38fd1498Szrj /* Initialize issue_rate. */
1363*38fd1498Szrj if (targetm.sched.issue_rate)
1364*38fd1498Szrj {
/* Some targets' issue_rate hooks inspect reload_completed; temporarily
   pretend reload has finished (restored just below) so the hook reports
   the post-reload issue rate.  */
1365*38fd1498Szrj int temp = reload_completed;
1366*38fd1498Szrj
1367*38fd1498Szrj reload_completed = 1;
1368*38fd1498Szrj issue_rate = targetm.sched.issue_rate ();
1369*38fd1498Szrj reload_completed = temp;
1370*38fd1498Szrj }
1371*38fd1498Szrj else
1372*38fd1498Szrj issue_rate = 1;
1373*38fd1498Szrj
1374*38fd1498Szrj /* Initialize the scheduler. */
1375*38fd1498Szrj setup_sched_infos ();
1376*38fd1498Szrj haifa_sched_init ();
1377*38fd1498Szrj
1378*38fd1498Szrj /* Allocate memory to hold the DDG array one entry for each loop.
1379*38fd1498Szrj We use loop->num as index into this array. */
1380*38fd1498Szrj g_arr = XCNEWVEC (ddg_ptr, number_of_loops (cfun));
1381*38fd1498Szrj
1382*38fd1498Szrj if (dump_file)
1383*38fd1498Szrj {
1384*38fd1498Szrj fprintf (dump_file, "\n\nSMS analysis phase\n");
1385*38fd1498Szrj fprintf (dump_file, "===================\n\n");
1386*38fd1498Szrj }
1387*38fd1498Szrj
1388*38fd1498Szrj /* Build DDGs for all the relevant loops and hold them in G_ARR
1389*38fd1498Szrj indexed by the loop index. */
1390*38fd1498Szrj FOR_EACH_LOOP (loop, 0)
1391*38fd1498Szrj {
1392*38fd1498Szrj rtx_insn *head, *tail;
1393*38fd1498Szrj rtx count_reg;
1394*38fd1498Szrj
1395*38fd1498Szrj /* For debugging. */
1396*38fd1498Szrj if (dbg_cnt (sms_sched_loop) == false)
1397*38fd1498Szrj {
1398*38fd1498Szrj if (dump_file)
1399*38fd1498Szrj fprintf (dump_file, "SMS reached max limit... \n");
1400*38fd1498Szrj
1401*38fd1498Szrj break;
1402*38fd1498Szrj }
1403*38fd1498Szrj
1404*38fd1498Szrj if (dump_file)
1405*38fd1498Szrj {
1406*38fd1498Szrj rtx_insn *insn = BB_END (loop->header);
1407*38fd1498Szrj
1408*38fd1498Szrj fprintf (dump_file, "SMS loop num: %d", loop->num);
1409*38fd1498Szrj dump_insn_location (insn);
1410*38fd1498Szrj fprintf (dump_file, "\n");
1411*38fd1498Szrj }
1412*38fd1498Szrj
1413*38fd1498Szrj if (! loop_canon_p (loop))
1414*38fd1498Szrj continue;
1415*38fd1498Szrj
1416*38fd1498Szrj if (! loop_single_full_bb_p (loop))
1417*38fd1498Szrj {
1418*38fd1498Szrj if (dump_file)
1419*38fd1498Szrj fprintf (dump_file, "SMS not loop_single_full_bb_p\n");
1420*38fd1498Szrj continue;
1421*38fd1498Szrj }
1422*38fd1498Szrj
1423*38fd1498Szrj bb = loop->header;
1424*38fd1498Szrj
1425*38fd1498Szrj get_ebb_head_tail (bb, bb, &head, &tail);
1426*38fd1498Szrj latch_edge = loop_latch_edge (loop);
1427*38fd1498Szrj gcc_assert (single_exit (loop));
1428*38fd1498Szrj trip_count = get_estimated_loop_iterations_int (loop);
1429*38fd1498Szrj max_trip_count = get_max_loop_iterations_int (loop);
1430*38fd1498Szrj
1431*38fd1498Szrj /* Perform SMS only on loops whose average iteration count is above the threshold. */
1432*38fd1498Szrj
/* Compare the latch count against exit count * threshold: skip loops
   that, on average, iterate fewer than SMS_LOOP_AVERAGE_COUNT_THRESHOLD
   times per entry.  */
1433*38fd1498Szrj if ( latch_edge->count () > profile_count::zero ()
1434*38fd1498Szrj && (latch_edge->count()
1435*38fd1498Szrj < single_exit (loop)->count ().apply_scale
1436*38fd1498Szrj (SMS_LOOP_AVERAGE_COUNT_THRESHOLD, 1)))
1437*38fd1498Szrj {
1438*38fd1498Szrj if (dump_file)
1439*38fd1498Szrj {
1440*38fd1498Szrj dump_insn_location (tail);
1441*38fd1498Szrj fprintf (dump_file, "\nSMS single-bb-loop\n");
1442*38fd1498Szrj if (profile_info && flag_branch_probabilities)
1443*38fd1498Szrj {
1444*38fd1498Szrj fprintf (dump_file, "SMS loop-count ");
1445*38fd1498Szrj fprintf (dump_file, "%" PRId64,
1446*38fd1498Szrj (int64_t) bb->count.to_gcov_type ());
1447*38fd1498Szrj fprintf (dump_file, "\n");
1448*38fd1498Szrj fprintf (dump_file, "SMS trip-count ");
1449*38fd1498Szrj fprintf (dump_file, "%" PRId64 "max %" PRId64,
1450*38fd1498Szrj (int64_t) trip_count, (int64_t) max_trip_count);
1451*38fd1498Szrj fprintf (dump_file, "\n");
1452*38fd1498Szrj fprintf (dump_file, "SMS profile-sum-max ");
1453*38fd1498Szrj fprintf (dump_file, "%" PRId64,
1454*38fd1498Szrj (int64_t) profile_info->sum_max);
1455*38fd1498Szrj fprintf (dump_file, "\n");
1456*38fd1498Szrj }
1457*38fd1498Szrj }
1458*38fd1498Szrj continue;
1459*38fd1498Szrj }
1460*38fd1498Szrj
1461*38fd1498Szrj /* Make sure this is a doloop. */
1462*38fd1498Szrj if ( !(count_reg = doloop_register_get (head, tail)))
1463*38fd1498Szrj {
1464*38fd1498Szrj if (dump_file)
1465*38fd1498Szrj fprintf (dump_file, "SMS doloop_register_get failed\n");
1466*38fd1498Szrj continue;
1467*38fd1498Szrj }
1468*38fd1498Szrj
1469*38fd1498Szrj /* Don't handle BBs with calls or barriers
1470*38fd1498Szrj or !single_set with the exception of instructions that include
1471*38fd1498Szrj count_reg---these instructions are part of the control part
1472*38fd1498Szrj that do-loop recognizes.
1473*38fd1498Szrj ??? Should handle insns defining subregs. */
1474*38fd1498Szrj for (insn = head; insn != NEXT_INSN (tail); insn = NEXT_INSN (insn))
1475*38fd1498Szrj {
1476*38fd1498Szrj rtx set;
1477*38fd1498Szrj
1478*38fd1498Szrj if (CALL_P (insn)
1479*38fd1498Szrj || BARRIER_P (insn)
1480*38fd1498Szrj || (NONDEBUG_INSN_P (insn) && !JUMP_P (insn)
1481*38fd1498Szrj && !single_set (insn) && GET_CODE (PATTERN (insn)) != USE
1482*38fd1498Szrj && !reg_mentioned_p (count_reg, insn))
1483*38fd1498Szrj || (INSN_P (insn) && (set = single_set (insn))
1484*38fd1498Szrj && GET_CODE (SET_DEST (set)) == SUBREG))
1485*38fd1498Szrj break;
1486*38fd1498Szrj }
1487*38fd1498Szrj
/* INSN stopped before the end of the block iff some insn above was
   unsuitable; dump the reason and reject the loop.  */
1488*38fd1498Szrj if (insn != NEXT_INSN (tail))
1489*38fd1498Szrj {
1490*38fd1498Szrj if (dump_file)
1491*38fd1498Szrj {
1492*38fd1498Szrj if (CALL_P (insn))
1493*38fd1498Szrj fprintf (dump_file, "SMS loop-with-call\n");
1494*38fd1498Szrj else if (BARRIER_P (insn))
1495*38fd1498Szrj fprintf (dump_file, "SMS loop-with-barrier\n");
1496*38fd1498Szrj else if ((NONDEBUG_INSN_P (insn) && !JUMP_P (insn)
1497*38fd1498Szrj && !single_set (insn) && GET_CODE (PATTERN (insn)) != USE))
1498*38fd1498Szrj fprintf (dump_file, "SMS loop-with-not-single-set\n");
1499*38fd1498Szrj else
1500*38fd1498Szrj fprintf (dump_file, "SMS loop with subreg in lhs\n");
1501*38fd1498Szrj print_rtl_single (dump_file, insn);
1502*38fd1498Szrj }
1503*38fd1498Szrj
1504*38fd1498Szrj continue;
1505*38fd1498Szrj }
1506*38fd1498Szrj
1507*38fd1498Szrj /* Always schedule the closing branch with the rest of the
1508*38fd1498Szrj instructions. The branch is rotated to be in row ii-1 at the
1509*38fd1498Szrj end of the scheduling procedure to make sure it's the last
1510*38fd1498Szrj instruction in the iteration. */
1511*38fd1498Szrj if (! (g = create_ddg (bb, 1)))
1512*38fd1498Szrj {
1513*38fd1498Szrj if (dump_file)
1514*38fd1498Szrj fprintf (dump_file, "SMS create_ddg failed\n");
1515*38fd1498Szrj continue;
1516*38fd1498Szrj }
1517*38fd1498Szrj
1518*38fd1498Szrj g_arr[loop->num] = g;
1519*38fd1498Szrj if (dump_file)
1520*38fd1498Szrj fprintf (dump_file, "...OK\n");
1521*38fd1498Szrj
1522*38fd1498Szrj }
1523*38fd1498Szrj if (dump_file)
1524*38fd1498Szrj {
1525*38fd1498Szrj fprintf (dump_file, "\nSMS transformation phase\n");
1526*38fd1498Szrj fprintf (dump_file, "=========================\n\n");
1527*38fd1498Szrj }
1528*38fd1498Szrj
1529*38fd1498Szrj /* We don't want to perform SMS on new loops - created by versioning. */
1530*38fd1498Szrj FOR_EACH_LOOP (loop, 0)
1531*38fd1498Szrj {
1532*38fd1498Szrj rtx_insn *head, *tail;
1533*38fd1498Szrj rtx count_reg;
1534*38fd1498Szrj rtx_insn *count_init;
1535*38fd1498Szrj int mii, rec_mii, stage_count, min_cycle;
1536*38fd1498Szrj int64_t loop_count = 0;
1537*38fd1498Szrj bool opt_sc_p;
1538*38fd1498Szrj
/* Only loops that survived the analysis phase have a DDG.  */
1539*38fd1498Szrj if (! (g = g_arr[loop->num]))
1540*38fd1498Szrj continue;
1541*38fd1498Szrj
1542*38fd1498Szrj if (dump_file)
1543*38fd1498Szrj {
1544*38fd1498Szrj rtx_insn *insn = BB_END (loop->header);
1545*38fd1498Szrj
1546*38fd1498Szrj fprintf (dump_file, "SMS loop num: %d", loop->num);
1547*38fd1498Szrj dump_insn_location (insn);
1548*38fd1498Szrj fprintf (dump_file, "\n");
1549*38fd1498Szrj
1550*38fd1498Szrj print_ddg (dump_file, g);
1551*38fd1498Szrj }
1552*38fd1498Szrj
1553*38fd1498Szrj get_ebb_head_tail (loop->header, loop->header, &head, &tail);
1554*38fd1498Szrj
1555*38fd1498Szrj latch_edge = loop_latch_edge (loop);
1556*38fd1498Szrj gcc_assert (single_exit (loop));
1557*38fd1498Szrj trip_count = get_estimated_loop_iterations_int (loop);
1558*38fd1498Szrj max_trip_count = get_max_loop_iterations_int (loop);
1559*38fd1498Szrj
1560*38fd1498Szrj if (dump_file)
1561*38fd1498Szrj {
1562*38fd1498Szrj dump_insn_location (tail);
1563*38fd1498Szrj fprintf (dump_file, "\nSMS single-bb-loop\n");
1564*38fd1498Szrj if (profile_info && flag_branch_probabilities)
1565*38fd1498Szrj {
1566*38fd1498Szrj fprintf (dump_file, "SMS loop-count ");
1567*38fd1498Szrj fprintf (dump_file, "%" PRId64,
1568*38fd1498Szrj (int64_t) bb->count.to_gcov_type ());
1569*38fd1498Szrj fprintf (dump_file, "\n");
1570*38fd1498Szrj fprintf (dump_file, "SMS profile-sum-max ");
1571*38fd1498Szrj fprintf (dump_file, "%" PRId64,
1572*38fd1498Szrj (int64_t) profile_info->sum_max);
1573*38fd1498Szrj fprintf (dump_file, "\n");
1574*38fd1498Szrj }
1575*38fd1498Szrj fprintf (dump_file, "SMS doloop\n");
1576*38fd1498Szrj fprintf (dump_file, "SMS built-ddg %d\n", g->num_nodes);
1577*38fd1498Szrj fprintf (dump_file, "SMS num-loads %d\n", g->num_loads);
1578*38fd1498Szrj fprintf (dump_file, "SMS num-stores %d\n", g->num_stores);
1579*38fd1498Szrj }
1580*38fd1498Szrj
1581*38fd1498Szrj
1582*38fd1498Szrj /* In case the loop has a doloop register, it gets special
1583*38fd1498Szrj handling.  A constant iteration count, if known, goes to LOOP_COUNT. */
1584*38fd1498Szrj count_init = NULL;
1585*38fd1498Szrj if ((count_reg = doloop_register_get (head, tail)))
1586*38fd1498Szrj {
1587*38fd1498Szrj basic_block pre_header;
1588*38fd1498Szrj
1589*38fd1498Szrj pre_header = loop_preheader_edge (loop)->src;
1590*38fd1498Szrj count_init = const_iteration_count (count_reg, pre_header,
1591*38fd1498Szrj &loop_count);
1592*38fd1498Szrj }
/* The analysis phase only stored DDGs for loops that passed
   doloop_register_get, so the register must still be present.  */
1593*38fd1498Szrj gcc_assert (count_reg);
1594*38fd1498Szrj
1595*38fd1498Szrj if (dump_file && count_init)
1596*38fd1498Szrj {
1597*38fd1498Szrj fprintf (dump_file, "SMS const-doloop ");
1598*38fd1498Szrj fprintf (dump_file, "%" PRId64,
1599*38fd1498Szrj loop_count);
1600*38fd1498Szrj fprintf (dump_file, "\n");
1601*38fd1498Szrj }
1602*38fd1498Szrj
1603*38fd1498Szrj node_order = XNEWVEC (int, g->num_nodes);
1604*38fd1498Szrj
1605*38fd1498Szrj mii = 1; /* Need to pass some estimate of mii. */
1606*38fd1498Szrj rec_mii = sms_order_nodes (g, mii, node_order, &max_asap);
1607*38fd1498Szrj mii = MAX (res_MII (g), rec_mii);
1608*38fd1498Szrj maxii = MAX (max_asap, MAXII_FACTOR * mii);
1609*38fd1498Szrj
1610*38fd1498Szrj if (dump_file)
1611*38fd1498Szrj fprintf (dump_file, "SMS iis %d %d %d (rec_mii, mii, maxii)\n",
1612*38fd1498Szrj rec_mii, mii, maxii);
1613*38fd1498Szrj
/* Retry loop: if register moves cannot be scheduled for the found
   schedule, bump the minimum II and try again; otherwise we either
   succeed (break at the end) or give up (break on failure checks).  */
1614*38fd1498Szrj for (;;)
1615*38fd1498Szrj {
1616*38fd1498Szrj set_node_sched_params (g);
1617*38fd1498Szrj
1618*38fd1498Szrj stage_count = 0;
1619*38fd1498Szrj opt_sc_p = false;
1620*38fd1498Szrj ps = sms_schedule_by_order (g, mii, maxii, node_order);
1621*38fd1498Szrj
1622*38fd1498Szrj if (ps)
1623*38fd1498Szrj {
1624*38fd1498Szrj /* Try to achieve optimized SC by normalizing the partial
1625*38fd1498Szrj schedule (having the cycles start from cycle zero).
1626*38fd1498Szrj The branch location must be placed in row ii-1 in the
1627*38fd1498Szrj final scheduling. If failed, shift all instructions to
1628*38fd1498Szrj position the branch in row ii-1. */
1629*38fd1498Szrj opt_sc_p = optimize_sc (ps, g);
1630*38fd1498Szrj if (opt_sc_p)
1631*38fd1498Szrj stage_count = calculate_stage_count (ps, 0);
1632*38fd1498Szrj else
1633*38fd1498Szrj {
1634*38fd1498Szrj /* Bring the branch to cycle ii-1. */
1635*38fd1498Szrj int amount = (SCHED_TIME (g->closing_branch->cuid)
1636*38fd1498Szrj - (ps->ii - 1));
1637*38fd1498Szrj
1638*38fd1498Szrj if (dump_file)
1639*38fd1498Szrj fprintf (dump_file, "SMS schedule branch at cycle ii-1\n");
1640*38fd1498Szrj
1641*38fd1498Szrj stage_count = calculate_stage_count (ps, amount);
1642*38fd1498Szrj }
1643*38fd1498Szrj
1644*38fd1498Szrj gcc_assert (stage_count >= 1);
1645*38fd1498Szrj }
1646*38fd1498Szrj
1647*38fd1498Szrj /* The default value of PARAM_SMS_MIN_SC is 2 as stage count of
1648*38fd1498Szrj 1 means that there is no interleaving between iterations thus
1649*38fd1498Szrj we let the scheduling passes do the job in this case. */
1650*38fd1498Szrj if (stage_count < PARAM_VALUE (PARAM_SMS_MIN_SC)
1651*38fd1498Szrj || (count_init && (loop_count <= stage_count))
1652*38fd1498Szrj || (max_trip_count >= 0 && max_trip_count <= stage_count)
1653*38fd1498Szrj || (trip_count >= 0 && trip_count <= stage_count))
1654*38fd1498Szrj {
1655*38fd1498Szrj if (dump_file)
1656*38fd1498Szrj {
1657*38fd1498Szrj fprintf (dump_file, "SMS failed... \n");
1658*38fd1498Szrj fprintf (dump_file, "SMS sched-failed (stage-count=%d,"
1659*38fd1498Szrj " loop-count=", stage_count);
1660*38fd1498Szrj fprintf (dump_file, "%" PRId64, loop_count);
1661*38fd1498Szrj fprintf (dump_file, ", trip-count=");
1662*38fd1498Szrj fprintf (dump_file, "%" PRId64 "max %" PRId64,
1663*38fd1498Szrj (int64_t) trip_count, (int64_t) max_trip_count);
1664*38fd1498Szrj fprintf (dump_file, ")\n");
1665*38fd1498Szrj }
1666*38fd1498Szrj break;
1667*38fd1498Szrj }
1668*38fd1498Szrj
1669*38fd1498Szrj if (!opt_sc_p)
1670*38fd1498Szrj {
1671*38fd1498Szrj /* Rotate the partial schedule to have the branch in row ii-1. */
1672*38fd1498Szrj int amount = SCHED_TIME (g->closing_branch->cuid) - (ps->ii - 1);
1673*38fd1498Szrj
1674*38fd1498Szrj reset_sched_times (ps, amount);
1675*38fd1498Szrj rotate_partial_schedule (ps, amount);
1676*38fd1498Szrj }
1677*38fd1498Szrj
1678*38fd1498Szrj set_columns_for_ps (ps);
1679*38fd1498Szrj
/* Snapshot the first cycle (rounded down to a stage boundary) so we
   can detect below whether reg-move scheduling added a new stage.  */
1680*38fd1498Szrj min_cycle = PS_MIN_CYCLE (ps) - SMODULO (PS_MIN_CYCLE (ps), ps->ii);
1681*38fd1498Szrj if (!schedule_reg_moves (ps))
1682*38fd1498Szrj {
1683*38fd1498Szrj mii = ps->ii + 1;
1684*38fd1498Szrj free_partial_schedule (ps);
1685*38fd1498Szrj continue;
1686*38fd1498Szrj }
1687*38fd1498Szrj
1688*38fd1498Szrj /* Moves that handle incoming values might have been added
1689*38fd1498Szrj to a new first stage. Bump the stage count if so.
1690*38fd1498Szrj
1691*38fd1498Szrj ??? Perhaps we could consider rotating the schedule here
1692*38fd1498Szrj instead? */
1693*38fd1498Szrj if (PS_MIN_CYCLE (ps) < min_cycle)
1694*38fd1498Szrj {
1695*38fd1498Szrj reset_sched_times (ps, 0);
1696*38fd1498Szrj stage_count++;
1697*38fd1498Szrj }
1698*38fd1498Szrj
1699*38fd1498Szrj /* The stage count should now be correct without rotation. */
1700*38fd1498Szrj gcc_checking_assert (stage_count == calculate_stage_count (ps, 0));
1701*38fd1498Szrj PS_STAGE_COUNT (ps) = stage_count;
1702*38fd1498Szrj
1703*38fd1498Szrj canon_loop (loop);
1704*38fd1498Szrj
1705*38fd1498Szrj if (dump_file)
1706*38fd1498Szrj {
1707*38fd1498Szrj dump_insn_location (tail);
1708*38fd1498Szrj fprintf (dump_file, " SMS succeeded %d %d (with ii, sc)\n",
1709*38fd1498Szrj ps->ii, stage_count);
1710*38fd1498Szrj print_partial_schedule (ps, dump_file);
1711*38fd1498Szrj }
1712*38fd1498Szrj
1713*38fd1498Szrj /* In case the BCT count is not known, do loop-versioning: keep the
   original loop for runs that iterate fewer than stage_count times. */
1714*38fd1498Szrj if (count_reg && ! count_init)
1715*38fd1498Szrj {
1716*38fd1498Szrj rtx comp_rtx = gen_rtx_GT (VOIDmode, count_reg,
1717*38fd1498Szrj gen_int_mode (stage_count,
1718*38fd1498Szrj GET_MODE (count_reg)));
1719*38fd1498Szrj profile_probability prob = profile_probability::guessed_always ()
1720*38fd1498Szrj .apply_scale (PROB_SMS_ENOUGH_ITERATIONS, 100);
1721*38fd1498Szrj
1722*38fd1498Szrj loop_version (loop, comp_rtx, &condition_bb,
1723*38fd1498Szrj prob, prob.invert (),
1724*38fd1498Szrj prob, prob.invert (), true);
1725*38fd1498Szrj }
1726*38fd1498Szrj
1727*38fd1498Szrj /* Set new iteration count of loop kernel. */
1728*38fd1498Szrj if (count_reg && count_init)
1729*38fd1498Szrj SET_SRC (single_set (count_init)) = GEN_INT (loop_count
1730*38fd1498Szrj - stage_count + 1);
1731*38fd1498Szrj
1732*38fd1498Szrj /* Now apply the scheduled kernel to the RTL of the loop. */
1733*38fd1498Szrj permute_partial_schedule (ps, g->closing_branch->first_note);
1734*38fd1498Szrj
1735*38fd1498Szrj /* Mark this loop as software pipelined so the later
1736*38fd1498Szrj scheduling passes don't touch it. */
1737*38fd1498Szrj if (! flag_resched_modulo_sched)
1738*38fd1498Szrj mark_loop_unsched (loop);
1739*38fd1498Szrj
1740*38fd1498Szrj /* The life-info is not valid any more. */
1741*38fd1498Szrj df_set_bb_dirty (g->bb);
1742*38fd1498Szrj
1743*38fd1498Szrj apply_reg_moves (ps);
1744*38fd1498Szrj if (dump_file)
1745*38fd1498Szrj print_node_sched_params (dump_file, g->num_nodes, ps);
1746*38fd1498Szrj /* Generate prolog and epilog. */
1747*38fd1498Szrj generate_prolog_epilog (ps, loop, count_reg, count_init);
1748*38fd1498Szrj break;
1749*38fd1498Szrj }
1750*38fd1498Szrj
1751*38fd1498Szrj free_partial_schedule (ps);
1752*38fd1498Szrj node_sched_param_vec.release ();
1753*38fd1498Szrj free (node_order);
1754*38fd1498Szrj free_ddg (g);
1755*38fd1498Szrj }
1756*38fd1498Szrj
1757*38fd1498Szrj free (g_arr);
1758*38fd1498Szrj
1759*38fd1498Szrj /* Release scheduler data, needed until now because of DFA. */
1760*38fd1498Szrj haifa_sched_finish ();
1761*38fd1498Szrj loop_optimizer_finalize ();
1762*38fd1498Szrj }
1763*38fd1498Szrj
1764*38fd1498Szrj /* The SMS scheduling algorithm itself
1765*38fd1498Szrj -----------------------------------
1766*38fd1498Szrj Input: 'O' an ordered list of insns of a loop.
1767*38fd1498Szrj Output: A scheduling of the loop - kernel, prolog, and epilogue.
1768*38fd1498Szrj
1769*38fd1498Szrj 'Q' is the empty Set
1770*38fd1498Szrj 'PS' is the partial schedule; it holds the currently scheduled nodes with
1771*38fd1498Szrj their cycle/slot.
1772*38fd1498Szrj 'PSP' previously scheduled predecessors.
1773*38fd1498Szrj 'PSS' previously scheduled successors.
1774*38fd1498Szrj 't(u)' the cycle where u is scheduled.
1775*38fd1498Szrj 'l(u)' is the latency of u.
1776*38fd1498Szrj 'd(v,u)' is the dependence distance from v to u.
1777*38fd1498Szrj 'ASAP(u)' the earliest time at which u could be scheduled as computed in
1778*38fd1498Szrj the node ordering phase.
1779*38fd1498Szrj 'check_hardware_resources_conflicts(u, PS, c)'
1780*38fd1498Szrj run a trace around cycle/slot through DFA model
1781*38fd1498Szrj to check resource conflicts involving instruction u
1782*38fd1498Szrj at cycle c given the partial schedule PS.
1783*38fd1498Szrj 'add_to_partial_schedule_at_time(u, PS, c)'
1784*38fd1498Szrj Add the node/instruction u to the partial schedule
1785*38fd1498Szrj PS at time c.
1786*38fd1498Szrj 'calculate_register_pressure(PS)'
1787*38fd1498Szrj Given a schedule of instructions, calculate the register
1788*38fd1498Szrj pressure it implies. One implementation could be the
1789*38fd1498Szrj maximum number of overlapping live ranges.
1790*38fd1498Szrj 'maxRP' The maximum allowed register pressure; it is usually derived from the number
1791*38fd1498Szrj of registers available in the hardware.
1792*38fd1498Szrj
1793*38fd1498Szrj 1. II = MII.
1794*38fd1498Szrj 2. PS = empty list
1795*38fd1498Szrj 3. for each node u in O in pre-computed order
1796*38fd1498Szrj 4. if (PSP(u) != Q && PSS(u) == Q) then
1797*38fd1498Szrj 5. Early_start(u) = max ( t(v) + l(v) - d(v,u)*II ) over all v in PSP(u).
1798*38fd1498Szrj 6. start = Early_start; end = Early_start + II - 1; step = 1
1799*38fd1498Szrj 11. else if (PSP(u) == Q && PSS(u) != Q) then
1800*38fd1498Szrj 12. Late_start(u) = min ( t(v) - l(v) + d(v,u)*II ) over all v in PSS(u).
1801*38fd1498Szrj 13. start = Late_start; end = Late_start - II + 1; step = -1
1802*38fd1498Szrj 14. else if (PSP(u) != Q && PSS(u) != Q) then
1803*38fd1498Szrj 15. Early_start(u) = max ( t(v) + l(v) - d(v,u)*II ) over all v in PSP(u).
1804*38fd1498Szrj 16. Late_start(u) = min ( t(v) - l(v) + d(v,u)*II ) over all v in PSS(u).
1805*38fd1498Szrj 17. start = Early_start;
1806*38fd1498Szrj 18. end = min(Early_start + II - 1 , Late_start);
1807*38fd1498Szrj 19. step = 1
1808*38fd1498Szrj 20. else "if (PSP(u) == Q && PSS(u) == Q)"
1809*38fd1498Szrj 21. start = ASAP(u); end = start + II - 1; step = 1
1810*38fd1498Szrj 22. endif
1811*38fd1498Szrj
1812*38fd1498Szrj 23. success = false
1813*38fd1498Szrj 24. for (c = start ; c != end ; c += step)
1814*38fd1498Szrj 25. if check_hardware_resources_conflicts(u, PS, c) then
1815*38fd1498Szrj 26. add_to_partial_schedule_at_time(u, PS, c)
1816*38fd1498Szrj 27. success = true
1817*38fd1498Szrj 28. break
1818*38fd1498Szrj 29. endif
1819*38fd1498Szrj 30. endfor
1820*38fd1498Szrj 31. if (success == false) then
1821*38fd1498Szrj 32. II = II + 1
1822*38fd1498Szrj 33. if (II > maxII) then
1823*38fd1498Szrj 34. finish - failed to schedule
1824*38fd1498Szrj 35. endif
1825*38fd1498Szrj 36. goto 2.
1826*38fd1498Szrj 37. endif
1827*38fd1498Szrj 38. endfor
1828*38fd1498Szrj 39. if (calculate_register_pressure(PS) > maxRP) then
1829*38fd1498Szrj 40. goto 32.
1830*38fd1498Szrj 41. endif
1831*38fd1498Szrj 42. compute epilogue & prologue
1832*38fd1498Szrj 43. finish - succeeded to schedule
1833*38fd1498Szrj
1834*38fd1498Szrj ??? The algorithm restricts the scheduling window to II cycles.
1835*38fd1498Szrj In rare cases, it may be better to allow windows of II+1 cycles.
1836*38fd1498Szrj The window would then start and end on the same row, but with
1837*38fd1498Szrj different "must precede" and "must follow" requirements. */
1838*38fd1498Szrj
1839*38fd1498Szrj /* A limit on the number of cycles that resource conflicts can span. ??? Should
1840*38fd1498Szrj be provided by DFA, and be dependent on the type of insn scheduled. Currently
1841*38fd1498Szrj set to 0 to save compile time. */
1842*38fd1498Szrj #define DFA_HISTORY SMS_DFA_HISTORY
1843*38fd1498Szrj
1844*38fd1498Szrj /* A threshold for the number of repeated unsuccessful attempts to insert
1845*38fd1498Szrj an empty row, before we flush the partial schedule and start over. */
1846*38fd1498Szrj #define MAX_SPLIT_NUM 10
1847*38fd1498Szrj /* Given the partial schedule PS, this function calculates and returns the
1848*38fd1498Szrj cycles in which we can schedule the node with the given index I.
1849*38fd1498Szrj NOTE: Here we do the backtracking in SMS, in some special cases. We have
1850*38fd1498Szrj noticed that there are several cases in which we fail to SMS the loop
1851*38fd1498Szrj because the sched window of a node is empty due to tight data-deps. In
1852*38fd1498Szrj such cases we want to unschedule some of the predecessors/successors
1853*38fd1498Szrj until we get non-empty scheduling window. It returns -1 if the
1854*38fd1498Szrj scheduling window is empty and zero otherwise. */
1855*38fd1498Szrj
1856*38fd1498Szrj static int
get_sched_window(partial_schedule_ptr ps,ddg_node_ptr u_node,sbitmap sched_nodes,int ii,int * start_p,int * step_p,int * end_p)1857*38fd1498Szrj get_sched_window (partial_schedule_ptr ps, ddg_node_ptr u_node,
1858*38fd1498Szrj sbitmap sched_nodes, int ii, int *start_p, int *step_p,
1859*38fd1498Szrj int *end_p)
1860*38fd1498Szrj {
1861*38fd1498Szrj int start, step, end;
1862*38fd1498Szrj int early_start, late_start;
1863*38fd1498Szrj ddg_edge_ptr e;
1864*38fd1498Szrj auto_sbitmap psp (ps->g->num_nodes);
1865*38fd1498Szrj auto_sbitmap pss (ps->g->num_nodes);
1866*38fd1498Szrj sbitmap u_node_preds = NODE_PREDECESSORS (u_node);
1867*38fd1498Szrj sbitmap u_node_succs = NODE_SUCCESSORS (u_node);
1868*38fd1498Szrj int psp_not_empty;
1869*38fd1498Szrj int pss_not_empty;
1870*38fd1498Szrj int count_preds;
1871*38fd1498Szrj int count_succs;
1872*38fd1498Szrj
1873*38fd1498Szrj /* 1. compute sched window for u (start, end, step). */
1874*38fd1498Szrj bitmap_clear (psp);
1875*38fd1498Szrj bitmap_clear (pss);
/* PSP/PSS = the already-scheduled predecessors/successors of U_NODE.  */
1876*38fd1498Szrj psp_not_empty = bitmap_and (psp, u_node_preds, sched_nodes);
1877*38fd1498Szrj pss_not_empty = bitmap_and (pss, u_node_succs, sched_nodes);
1878*38fd1498Szrj
1879*38fd1498Szrj /* We first compute a forward range (start <= end), then decide whether
1880*38fd1498Szrj to reverse it. */
/* INT_MIN/INT_MAX act as "unconstrained" sentinels until a scheduled
   predecessor/successor tightens the corresponding bound below.  */
1881*38fd1498Szrj early_start = INT_MIN;
1882*38fd1498Szrj late_start = INT_MAX;
1883*38fd1498Szrj start = INT_MIN;
1884*38fd1498Szrj end = INT_MAX;
1885*38fd1498Szrj step = 1;
1886*38fd1498Szrj
1887*38fd1498Szrj count_preds = 0;
1888*38fd1498Szrj count_succs = 0;
1889*38fd1498Szrj
1890*38fd1498Szrj if (dump_file && (psp_not_empty || pss_not_empty))
1891*38fd1498Szrj {
1892*38fd1498Szrj fprintf (dump_file, "\nAnalyzing dependencies for node %d (INSN %d)"
1893*38fd1498Szrj "; ii = %d\n\n", u_node->cuid, INSN_UID (u_node->insn), ii);
1894*38fd1498Szrj fprintf (dump_file, "%11s %11s %11s %11s %5s\n",
1895*38fd1498Szrj "start", "early start", "late start", "end", "time");
1896*38fd1498Szrj fprintf (dump_file, "=========== =========== =========== ==========="
1897*38fd1498Szrj " =====\n");
1898*38fd1498Szrj }
1899*38fd1498Szrj /* Calculate early_start and limit end. Both bounds are inclusive. */
1900*38fd1498Szrj if (psp_not_empty)
1901*38fd1498Szrj for (e = u_node->in; e != 0; e = e->next_in)
1902*38fd1498Szrj {
1903*38fd1498Szrj int v = e->src->cuid;
1904*38fd1498Szrj
1905*38fd1498Szrj if (bitmap_bit_p (sched_nodes, v))
1906*38fd1498Szrj {
1907*38fd1498Szrj int p_st = SCHED_TIME (v);
/* Earliest cycle satisfying the dependence from V; a MEM_DEP also
   imposes a latest cycle (within one iteration of V's slot).  */
1908*38fd1498Szrj int earliest = p_st + e->latency - (e->distance * ii);
1909*38fd1498Szrj int latest = (e->data_type == MEM_DEP ? p_st + ii - 1 : INT_MAX);
1910*38fd1498Szrj
1911*38fd1498Szrj if (dump_file)
1912*38fd1498Szrj {
1913*38fd1498Szrj fprintf (dump_file, "%11s %11d %11s %11d %5d",
1914*38fd1498Szrj "", earliest, "", latest, p_st);
1915*38fd1498Szrj print_ddg_edge (dump_file, e);
1916*38fd1498Szrj fprintf (dump_file, "\n");
1917*38fd1498Szrj }
1918*38fd1498Szrj
1919*38fd1498Szrj early_start = MAX (early_start, earliest);
1920*38fd1498Szrj end = MIN (end, latest);
1921*38fd1498Szrj
1922*38fd1498Szrj if (e->type == TRUE_DEP && e->data_type == REG_DEP)
1923*38fd1498Szrj count_preds++;
1924*38fd1498Szrj }
1925*38fd1498Szrj }
1926*38fd1498Szrj
1927*38fd1498Szrj /* Calculate late_start and limit start. Both bounds are inclusive. */
1928*38fd1498Szrj if (pss_not_empty)
1929*38fd1498Szrj for (e = u_node->out; e != 0; e = e->next_out)
1930*38fd1498Szrj {
1931*38fd1498Szrj int v = e->dest->cuid;
1932*38fd1498Szrj
1933*38fd1498Szrj if (bitmap_bit_p (sched_nodes, v))
1934*38fd1498Szrj {
1935*38fd1498Szrj int s_st = SCHED_TIME (v);
1936*38fd1498Szrj int earliest = (e->data_type == MEM_DEP ? s_st - ii + 1 : INT_MIN);
1937*38fd1498Szrj int latest = s_st - e->latency + (e->distance * ii);
1938*38fd1498Szrj
1939*38fd1498Szrj if (dump_file)
1940*38fd1498Szrj {
1941*38fd1498Szrj fprintf (dump_file, "%11d %11s %11d %11s %5d",
1942*38fd1498Szrj earliest, "", latest, "", s_st);
1943*38fd1498Szrj print_ddg_edge (dump_file, e);
1944*38fd1498Szrj fprintf (dump_file, "\n");
1945*38fd1498Szrj }
1946*38fd1498Szrj
1947*38fd1498Szrj start = MAX (start, earliest);
1948*38fd1498Szrj late_start = MIN (late_start, latest);
1949*38fd1498Szrj
1950*38fd1498Szrj if (e->type == TRUE_DEP && e->data_type == REG_DEP)
1951*38fd1498Szrj count_succs++;
1952*38fd1498Szrj }
1953*38fd1498Szrj }
1954*38fd1498Szrj
1955*38fd1498Szrj if (dump_file && (psp_not_empty || pss_not_empty))
1956*38fd1498Szrj {
1957*38fd1498Szrj fprintf (dump_file, "----------- ----------- ----------- -----------"
1958*38fd1498Szrj " -----\n");
1959*38fd1498Szrj fprintf (dump_file, "%11d %11d %11d %11d %5s %s\n",
1960*38fd1498Szrj start, early_start, late_start, end, "",
1961*38fd1498Szrj "(max, max, min, min)");
1962*38fd1498Szrj }
1963*38fd1498Szrj
1964*38fd1498Szrj /* Get a target scheduling window no bigger than ii. */
/* No constraints at all: anchor the window at the node's ASAP time.
   Otherwise derive the missing bound so the window spans ii cycles.  */
1965*38fd1498Szrj if (early_start == INT_MIN && late_start == INT_MAX)
1966*38fd1498Szrj early_start = NODE_ASAP (u_node);
1967*38fd1498Szrj else if (early_start == INT_MIN)
1968*38fd1498Szrj early_start = late_start - (ii - 1);
1969*38fd1498Szrj late_start = MIN (late_start, early_start + (ii - 1));
1970*38fd1498Szrj
1971*38fd1498Szrj /* Apply memory dependence limits. */
1972*38fd1498Szrj start = MAX (start, early_start);
1973*38fd1498Szrj end = MIN (end, late_start);
1974*38fd1498Szrj
1975*38fd1498Szrj if (dump_file && (psp_not_empty || pss_not_empty))
1976*38fd1498Szrj fprintf (dump_file, "%11s %11d %11d %11s %5s final window\n",
1977*38fd1498Szrj "", start, end, "", "");
1978*38fd1498Szrj
1979*38fd1498Szrj /* If there are at least as many successors as predecessors, schedule the
1980*38fd1498Szrj node close to its successors. */
1981*38fd1498Szrj if (pss_not_empty && count_succs >= count_preds)
1982*38fd1498Szrj {
1983*38fd1498Szrj std::swap (start, end);
1984*38fd1498Szrj step = -1;
1985*38fd1498Szrj }
1986*38fd1498Szrj
1987*38fd1498Szrj /* Now that we've finalized the window, make END an exclusive rather
1988*38fd1498Szrj than an inclusive bound. */
1989*38fd1498Szrj end += step;
1990*38fd1498Szrj
1991*38fd1498Szrj *start_p = start;
1992*38fd1498Szrj *step_p = step;
1993*38fd1498Szrj *end_p = end;
1994*38fd1498Szrj
/* With END now exclusive, an empty traversal range (in either
   direction) means no cycle is available for this node.  */
1995*38fd1498Szrj if ((start >= end && step == 1) || (start <= end && step == -1))
1996*38fd1498Szrj {
1997*38fd1498Szrj if (dump_file)
1998*38fd1498Szrj fprintf (dump_file, "\nEmpty window: start=%d, end=%d, step=%d\n",
1999*38fd1498Szrj start, end, step);
2000*38fd1498Szrj return -1;
2001*38fd1498Szrj }
2002*38fd1498Szrj
2003*38fd1498Szrj return 0;
2004*38fd1498Szrj }
2005*38fd1498Szrj
2006*38fd1498Szrj /* Calculate MUST_PRECEDE/MUST_FOLLOW bitmaps of U_NODE; which is the
2007*38fd1498Szrj node currently been scheduled. At the end of the calculation
2008*38fd1498Szrj MUST_PRECEDE/MUST_FOLLOW contains all predecessors/successors of
2009*38fd1498Szrj U_NODE which are (1) already scheduled in the first/last row of
2010*38fd1498Szrj U_NODE's scheduling window, (2) whose dependence inequality with U
2011*38fd1498Szrj becomes an equality when U is scheduled in this same row, and (3)
2012*38fd1498Szrj whose dependence latency is zero.
2013*38fd1498Szrj
2014*38fd1498Szrj The first and last rows are calculated using the following parameters:
2015*38fd1498Szrj START/END rows - The cycles that begins/ends the traversal on the window;
2016*38fd1498Szrj searching for an empty cycle to schedule U_NODE.
2017*38fd1498Szrj STEP - The direction in which we traverse the window.
2018*38fd1498Szrj II - The initiation interval. */
2019*38fd1498Szrj
2020*38fd1498Szrj static void
calculate_must_precede_follow(ddg_node_ptr u_node,int start,int end,int step,int ii,sbitmap sched_nodes,sbitmap must_precede,sbitmap must_follow)2021*38fd1498Szrj calculate_must_precede_follow (ddg_node_ptr u_node, int start, int end,
2022*38fd1498Szrj int step, int ii, sbitmap sched_nodes,
2023*38fd1498Szrj sbitmap must_precede, sbitmap must_follow)
2024*38fd1498Szrj {
2025*38fd1498Szrj ddg_edge_ptr e;
2026*38fd1498Szrj int first_cycle_in_window, last_cycle_in_window;
2027*38fd1498Szrj
2028*38fd1498Szrj gcc_assert (must_precede && must_follow);
2029*38fd1498Szrj
2030*38fd1498Szrj /* Consider the following scheduling window:
2031*38fd1498Szrj {first_cycle_in_window, first_cycle_in_window+1, ...,
2032*38fd1498Szrj last_cycle_in_window}. If step is 1 then the following will be
2033*38fd1498Szrj the order we traverse the window: {start=first_cycle_in_window,
2034*38fd1498Szrj first_cycle_in_window+1, ..., end=last_cycle_in_window+1},
2035*38fd1498Szrj or {start=last_cycle_in_window, last_cycle_in_window-1, ...,
2036*38fd1498Szrj end=first_cycle_in_window-1} if step is -1. */
/* END is exclusive (see get_sched_window), hence the -step adjustment.  */
2037*38fd1498Szrj first_cycle_in_window = (step == 1) ? start : end - step;
2038*38fd1498Szrj last_cycle_in_window = (step == 1) ? end - step : start;
2039*38fd1498Szrj
/* Start from empty sets; bits are set below only for edges whose
   dependence becomes tight at the window boundary.  */
2040*38fd1498Szrj bitmap_clear (must_precede);
2041*38fd1498Szrj bitmap_clear (must_follow);
2042*38fd1498Szrj
2043*38fd1498Szrj if (dump_file)
2044*38fd1498Szrj fprintf (dump_file, "\nmust_precede: ");
2045*38fd1498Szrj
2046*38fd1498Szrj /* Instead of checking if:
2047*38fd1498Szrj (SMODULO (SCHED_TIME (e->src), ii) == first_row_in_window)
2048*38fd1498Szrj && ((SCHED_TIME (e->src) + e->latency - (e->distance * ii)) ==
2049*38fd1498Szrj first_cycle_in_window)
2050*38fd1498Szrj && e->latency == 0
2051*38fd1498Szrj we use the fact that latency is non-negative:
2052*38fd1498Szrj SCHED_TIME (e->src) - (e->distance * ii) <=
2053*38fd1498Szrj SCHED_TIME (e->src) + e->latency - (e->distance * ii)) <=
2054*38fd1498Szrj first_cycle_in_window
2055*38fd1498Szrj and check only if
2056*38fd1498Szrj SCHED_TIME (e->src) - (e->distance * ii) == first_cycle_in_window */
2057*38fd1498Szrj for (e = u_node->in; e != 0; e = e->next_in)
2058*38fd1498Szrj if (bitmap_bit_p (sched_nodes, e->src->cuid)
2059*38fd1498Szrj && ((SCHED_TIME (e->src->cuid) - (e->distance * ii)) ==
2060*38fd1498Szrj first_cycle_in_window))
2061*38fd1498Szrj {
2062*38fd1498Szrj if (dump_file)
2063*38fd1498Szrj fprintf (dump_file, "%d ", e->src->cuid);
2064*38fd1498Szrj
2065*38fd1498Szrj bitmap_set_bit (must_precede, e->src->cuid);
2066*38fd1498Szrj }
2067*38fd1498Szrj
2068*38fd1498Szrj if (dump_file)
2069*38fd1498Szrj fprintf (dump_file, "\nmust_follow: ");
2070*38fd1498Szrj
2071*38fd1498Szrj /* Instead of checking if:
2072*38fd1498Szrj (SMODULO (SCHED_TIME (e->dest), ii) == last_row_in_window)
2073*38fd1498Szrj && ((SCHED_TIME (e->dest) - e->latency + (e->distance * ii)) ==
2074*38fd1498Szrj last_cycle_in_window)
2075*38fd1498Szrj && e->latency == 0
2076*38fd1498Szrj we use the fact that latency is non-negative:
2077*38fd1498Szrj SCHED_TIME (e->dest) + (e->distance * ii) >=
2078*38fd1498Szrj SCHED_TIME (e->dest) - e->latency + (e->distance * ii)) >=
2079*38fd1498Szrj last_cycle_in_window
2080*38fd1498Szrj and check only if
2081*38fd1498Szrj SCHED_TIME (e->dest) + (e->distance * ii) == last_cycle_in_window */
2082*38fd1498Szrj for (e = u_node->out; e != 0; e = e->next_out)
2083*38fd1498Szrj if (bitmap_bit_p (sched_nodes, e->dest->cuid)
2084*38fd1498Szrj && ((SCHED_TIME (e->dest->cuid) + (e->distance * ii)) ==
2085*38fd1498Szrj last_cycle_in_window))
2086*38fd1498Szrj {
2087*38fd1498Szrj if (dump_file)
2088*38fd1498Szrj fprintf (dump_file, "%d ", e->dest->cuid);
2089*38fd1498Szrj
2090*38fd1498Szrj bitmap_set_bit (must_follow, e->dest->cuid);
2091*38fd1498Szrj }
2092*38fd1498Szrj
2093*38fd1498Szrj if (dump_file)
2094*38fd1498Szrj fprintf (dump_file, "\n");
2095*38fd1498Szrj }
2096*38fd1498Szrj
2097*38fd1498Szrj /* Return 1 if U_NODE can be scheduled in CYCLE. Use the following
2098*38fd1498Szrj parameters to decide if that's possible:
2099*38fd1498Szrj PS - The partial schedule.
2100*38fd1498Szrj U - The serial number of U_NODE.
2101*38fd1498Szrj NUM_SPLITS - The number of row splits made so far.
2102*38fd1498Szrj MUST_PRECEDE - The nodes that must precede U_NODE. (only valid at
2103*38fd1498Szrj the first row of the scheduling window)
2104*38fd1498Szrj MUST_FOLLOW - The nodes that must follow U_NODE. (only valid at the
2105*38fd1498Szrj last row of the scheduling window) */
2106*38fd1498Szrj
2107*38fd1498Szrj static bool
try_scheduling_node_in_cycle(partial_schedule_ptr ps,int u,int cycle,sbitmap sched_nodes,int * num_splits,sbitmap must_precede,sbitmap must_follow)2108*38fd1498Szrj try_scheduling_node_in_cycle (partial_schedule_ptr ps,
2109*38fd1498Szrj int u, int cycle, sbitmap sched_nodes,
2110*38fd1498Szrj int *num_splits, sbitmap must_precede,
2111*38fd1498Szrj sbitmap must_follow)
2112*38fd1498Szrj {
2113*38fd1498Szrj ps_insn_ptr psi;
2114*38fd1498Szrj bool success = 0;
2115*38fd1498Szrj
2116*38fd1498Szrj verify_partial_schedule (ps, sched_nodes);
2117*38fd1498Szrj psi = ps_add_node_check_conflicts (ps, u, cycle, must_precede, must_follow);
2118*38fd1498Szrj if (psi)
2119*38fd1498Szrj {
2120*38fd1498Szrj SCHED_TIME (u) = cycle;
2121*38fd1498Szrj bitmap_set_bit (sched_nodes, u);
2122*38fd1498Szrj success = 1;
2123*38fd1498Szrj *num_splits = 0;
2124*38fd1498Szrj if (dump_file)
2125*38fd1498Szrj fprintf (dump_file, "Scheduled w/o split in %d\n", cycle);
2126*38fd1498Szrj
2127*38fd1498Szrj }
2128*38fd1498Szrj
2129*38fd1498Szrj return success;
2130*38fd1498Szrj }
2131*38fd1498Szrj
/* This function implements the scheduling algorithm for SMS according to the
   above algorithm.

   G is the data-dependence graph of the loop, MII the lower bound on the
   initiation interval, MAXII the largest ii worth trying, and NODES_ORDER
   the precomputed order in which nodes are attempted.

   Returns the resulting partial schedule, or NULL if no schedule was
   found before ii reached MAXII.  */
static partial_schedule_ptr
sms_schedule_by_order (ddg_ptr g, int mii, int maxii, int *nodes_order)
{
  int ii = mii;
  int i, c, success, num_splits = 0;
  int flush_and_start_over = true;
  int num_nodes = g->num_nodes;
  int start, end, step; /* Place together into one struct?  */
  auto_sbitmap sched_nodes (num_nodes);
  auto_sbitmap must_precede (num_nodes);
  auto_sbitmap must_follow (num_nodes);
  auto_sbitmap tobe_scheduled (num_nodes);

  partial_schedule_ptr ps = create_partial_schedule (ii, g, DFA_HISTORY);

  bitmap_ones (tobe_scheduled);
  bitmap_clear (sched_nodes);

  /* Keep retrying from scratch with a larger ii whenever too many row
     splits were needed (flush_and_start_over), until ii hits MAXII.  */
  while (flush_and_start_over && (ii < maxii))
    {

      if (dump_file)
	fprintf (dump_file, "Starting with ii=%d\n", ii);
      flush_and_start_over = false;
      bitmap_clear (sched_nodes);

      for (i = 0; i < num_nodes; i++)
	{
	  int u = nodes_order[i];
	  ddg_node_ptr u_node = &ps->g->nodes[u];
	  rtx_insn *insn = u_node->insn;

	  /* Debug insns are never scheduled; drop them from the set the
	     final consistency check compares against.  */
	  if (!NONDEBUG_INSN_P (insn))
	    {
	      bitmap_clear_bit (tobe_scheduled, u);
	      continue;
	    }

	  if (bitmap_bit_p (sched_nodes, u))
	    continue;

	  /* Try to get non-empty scheduling window.  */
	  success = 0;
	  if (get_sched_window (ps, u_node, sched_nodes, ii, &start,
				&step, &end) == 0)
	    {
	      if (dump_file)
		fprintf (dump_file, "\nTrying to schedule node %d "
			 "INSN = %d  in (%d .. %d) step %d\n", u, (INSN_UID
			 (g->nodes[u].insn)), start, end, step);

	      gcc_assert ((step > 0 && start < end)
			  || (step < 0 && start > end));

	      calculate_must_precede_follow (u_node, start, end, step, ii,
					     sched_nodes, must_precede,
					     must_follow);

	      /* Walk the window cycle by cycle; must_precede/must_follow
		 constrain only the first/last row respectively.  */
	      for (c = start; c != end; c += step)
		{
		  sbitmap tmp_precede, tmp_follow;

		  set_must_precede_follow (&tmp_follow, must_follow,
					   &tmp_precede, must_precede,
					   c, start, end, step);
		  success =
		    try_scheduling_node_in_cycle (ps, u, c,
						  sched_nodes,
						  &num_splits, tmp_precede,
						  tmp_follow);
		  if (success)
		    break;
		}

	      verify_partial_schedule (ps, sched_nodes);
	    }
	  /* NOTE(review): if get_sched_window failed above, START/END/STEP
	     below retain values from a previous iteration (or are
	     uninitialized on the very first attempt) — presumably
	     get_sched_window sets them even on failure; verify before
	     relying on the split-row computation.  */
	  if (!success)
	    {
	      int split_row;

	      /* No larger ii left to try — give up entirely.  */
	      if (ii++ == maxii)
		break;

	      /* Too many splits at this ii: discard the partial schedule
		 and restart the outer loop with the incremented ii.  */
	      if (num_splits >= MAX_SPLIT_NUM)
		{
		  num_splits = 0;
		  flush_and_start_over = true;
		  verify_partial_schedule (ps, sched_nodes);
		  reset_partial_schedule (ps, ii);
		  verify_partial_schedule (ps, sched_nodes);
		  break;
		}

	      num_splits++;
	      /* The scheduling window is exclusive of 'end'
		 whereas compute_split_window() expects an inclusive,
		 ordered range.  */
	      if (step == 1)
		split_row = compute_split_row (sched_nodes, start, end - 1,
					       ps->ii, u_node);
	      else
		split_row = compute_split_row (sched_nodes, end + 1, start,
					       ps->ii, u_node);

	      ps_insert_empty_row (ps, split_row, sched_nodes);
	      i--;	     /* Go back and retry node i.  */

	      if (dump_file)
		fprintf (dump_file, "num_splits=%d\n", num_splits);
	    }

	  /* ??? If (success), check register pressure estimates.  */
	}			/* Continue with next node.  */
    }				/* While flush_and_start_over.  */
  if (ii >= maxii)
    {
      free_partial_schedule (ps);
      ps = NULL;
    }
  else
    gcc_assert (bitmap_equal_p (tobe_scheduled, sched_nodes));

  return ps;
}
2258*38fd1498Szrj
/* This function inserts a new empty row into PS at the position
   according to SPLITROW, keeping all already scheduled instructions
   intact and updating their SCHED_TIME and cycle accordingly.  */
static void
ps_insert_empty_row (partial_schedule_ptr ps, int split_row,
		     sbitmap sched_nodes)
{
  ps_insn_ptr crr_insn;
  ps_insn_ptr *rows_new;
  int ii = ps->ii;
  int new_ii = ii + 1;
  int row;
  int *rows_length_new;

  verify_partial_schedule (ps, sched_nodes);

  /* We normalize sched_time and rotate ps to have only non-negative sched
     times, for simplicity of updating cycles after inserting new row.  */
  split_row -= ps->min_cycle;
  split_row = SMODULO (split_row, ii);
  if (dump_file)
    fprintf (dump_file, "split_row=%d\n", split_row);

  reset_sched_times (ps, PS_MIN_CYCLE (ps));
  rotate_partial_schedule (ps, PS_MIN_CYCLE (ps));

  /* Allocate row arrays for the enlarged (ii + 1) schedule.  */
  rows_new = (ps_insn_ptr *) xcalloc (new_ii, sizeof (ps_insn_ptr));
  rows_length_new = (int *) xcalloc (new_ii, sizeof (int));
  /* Rows before the split keep their row index.  Each insn gains one
     extra cycle per full ii-period already elapsed (SCHED_TIME / ii),
     because every period is now one cycle longer.  */
  for (row = 0; row < split_row; row++)
    {
      rows_new[row] = ps->rows[row];
      rows_length_new[row] = ps->rows_length[row];
      ps->rows[row] = NULL;
      for (crr_insn = rows_new[row];
	   crr_insn; crr_insn = crr_insn->next_in_row)
	{
	  int u = crr_insn->id;
	  int new_time = SCHED_TIME (u) + (SCHED_TIME (u) / ii);

	  SCHED_TIME (u) = new_time;
	  crr_insn->cycle = new_time;
	  SCHED_ROW (u) = new_time % new_ii;
	  SCHED_STAGE (u) = new_time / new_ii;
	}

    }

  /* The inserted row itself starts out empty.  */
  rows_new[split_row] = NULL;

  /* Rows at or after the split additionally shift one row down past the
     new empty row, hence the extra "+ 1" on the cycle.  */
  for (row = split_row; row < ii; row++)
    {
      rows_new[row + 1] = ps->rows[row];
      rows_length_new[row + 1] = ps->rows_length[row];
      ps->rows[row] = NULL;
      for (crr_insn = rows_new[row + 1];
	   crr_insn; crr_insn = crr_insn->next_in_row)
	{
	  int u = crr_insn->id;
	  int new_time = SCHED_TIME (u) + (SCHED_TIME (u) / ii) + 1;

	  SCHED_TIME (u) = new_time;
	  crr_insn->cycle = new_time;
	  SCHED_ROW (u) = new_time % new_ii;
	  SCHED_STAGE (u) = new_time / new_ii;
	}
    }

  /* Updating ps.  min/max_cycle get the same per-period adjustment,
     plus one if their row falls at or after the split point.  */
  ps->min_cycle = ps->min_cycle + ps->min_cycle / ii
    + (SMODULO (ps->min_cycle, ii) >= split_row ? 1 : 0);
  ps->max_cycle = ps->max_cycle + ps->max_cycle / ii
    + (SMODULO (ps->max_cycle, ii) >= split_row ? 1 : 0);
  free (ps->rows);
  ps->rows = rows_new;
  free (ps->rows_length);
  ps->rows_length = rows_length_new;
  ps->ii = new_ii;
  gcc_assert (ps->min_cycle >= 0);

  verify_partial_schedule (ps, sched_nodes);

  if (dump_file)
    fprintf (dump_file, "min_cycle=%d, max_cycle=%d\n", ps->min_cycle,
	     ps->max_cycle);
}
2344*38fd1498Szrj
2345*38fd1498Szrj /* Given U_NODE which is the node that failed to be scheduled; LOW and
2346*38fd1498Szrj UP which are the boundaries of it's scheduling window; compute using
2347*38fd1498Szrj SCHED_NODES and II a row in the partial schedule that can be split
2348*38fd1498Szrj which will separate a critical predecessor from a critical successor
2349*38fd1498Szrj thereby expanding the window, and return it. */
2350*38fd1498Szrj static int
compute_split_row(sbitmap sched_nodes,int low,int up,int ii,ddg_node_ptr u_node)2351*38fd1498Szrj compute_split_row (sbitmap sched_nodes, int low, int up, int ii,
2352*38fd1498Szrj ddg_node_ptr u_node)
2353*38fd1498Szrj {
2354*38fd1498Szrj ddg_edge_ptr e;
2355*38fd1498Szrj int lower = INT_MIN, upper = INT_MAX;
2356*38fd1498Szrj int crit_pred = -1;
2357*38fd1498Szrj int crit_succ = -1;
2358*38fd1498Szrj int crit_cycle;
2359*38fd1498Szrj
2360*38fd1498Szrj for (e = u_node->in; e != 0; e = e->next_in)
2361*38fd1498Szrj {
2362*38fd1498Szrj int v = e->src->cuid;
2363*38fd1498Szrj
2364*38fd1498Szrj if (bitmap_bit_p (sched_nodes, v)
2365*38fd1498Szrj && (low == SCHED_TIME (v) + e->latency - (e->distance * ii)))
2366*38fd1498Szrj if (SCHED_TIME (v) > lower)
2367*38fd1498Szrj {
2368*38fd1498Szrj crit_pred = v;
2369*38fd1498Szrj lower = SCHED_TIME (v);
2370*38fd1498Szrj }
2371*38fd1498Szrj }
2372*38fd1498Szrj
2373*38fd1498Szrj if (crit_pred >= 0)
2374*38fd1498Szrj {
2375*38fd1498Szrj crit_cycle = SCHED_TIME (crit_pred) + 1;
2376*38fd1498Szrj return SMODULO (crit_cycle, ii);
2377*38fd1498Szrj }
2378*38fd1498Szrj
2379*38fd1498Szrj for (e = u_node->out; e != 0; e = e->next_out)
2380*38fd1498Szrj {
2381*38fd1498Szrj int v = e->dest->cuid;
2382*38fd1498Szrj
2383*38fd1498Szrj if (bitmap_bit_p (sched_nodes, v)
2384*38fd1498Szrj && (up == SCHED_TIME (v) - e->latency + (e->distance * ii)))
2385*38fd1498Szrj if (SCHED_TIME (v) < upper)
2386*38fd1498Szrj {
2387*38fd1498Szrj crit_succ = v;
2388*38fd1498Szrj upper = SCHED_TIME (v);
2389*38fd1498Szrj }
2390*38fd1498Szrj }
2391*38fd1498Szrj
2392*38fd1498Szrj if (crit_succ >= 0)
2393*38fd1498Szrj {
2394*38fd1498Szrj crit_cycle = SCHED_TIME (crit_succ);
2395*38fd1498Szrj return SMODULO (crit_cycle, ii);
2396*38fd1498Szrj }
2397*38fd1498Szrj
2398*38fd1498Szrj if (dump_file)
2399*38fd1498Szrj fprintf (dump_file, "Both crit_pred and crit_succ are NULL\n");
2400*38fd1498Szrj
2401*38fd1498Szrj return SMODULO ((low + up + 1) / 2, ii);
2402*38fd1498Szrj }
2403*38fd1498Szrj
2404*38fd1498Szrj static void
verify_partial_schedule(partial_schedule_ptr ps,sbitmap sched_nodes)2405*38fd1498Szrj verify_partial_schedule (partial_schedule_ptr ps, sbitmap sched_nodes)
2406*38fd1498Szrj {
2407*38fd1498Szrj int row;
2408*38fd1498Szrj ps_insn_ptr crr_insn;
2409*38fd1498Szrj
2410*38fd1498Szrj for (row = 0; row < ps->ii; row++)
2411*38fd1498Szrj {
2412*38fd1498Szrj int length = 0;
2413*38fd1498Szrj
2414*38fd1498Szrj for (crr_insn = ps->rows[row]; crr_insn; crr_insn = crr_insn->next_in_row)
2415*38fd1498Szrj {
2416*38fd1498Szrj int u = crr_insn->id;
2417*38fd1498Szrj
2418*38fd1498Szrj length++;
2419*38fd1498Szrj gcc_assert (bitmap_bit_p (sched_nodes, u));
2420*38fd1498Szrj /* ??? Test also that all nodes of sched_nodes are in ps, perhaps by
2421*38fd1498Szrj popcount (sched_nodes) == number of insns in ps. */
2422*38fd1498Szrj gcc_assert (SCHED_TIME (u) >= ps->min_cycle);
2423*38fd1498Szrj gcc_assert (SCHED_TIME (u) <= ps->max_cycle);
2424*38fd1498Szrj }
2425*38fd1498Szrj
2426*38fd1498Szrj gcc_assert (ps->rows_length[row] == length);
2427*38fd1498Szrj }
2428*38fd1498Szrj }
2429*38fd1498Szrj
2430*38fd1498Szrj
/* This page implements the algorithm for ordering the nodes of a DDG
   for modulo scheduling, activated through the
   "int sms_order_nodes (ddg_ptr, int mii, int * result)" API.  */

/* Accessors for the per-node ordering parameters, stored in each DDG
   node's aux.info field (set up by calculate_order_params).  */
#define ORDER_PARAMS(x) ((struct node_order_params *) (x)->aux.info)
#define ASAP(x) (ORDER_PARAMS ((x))->asap)
#define ALAP(x) (ORDER_PARAMS ((x))->alap)
#define HEIGHT(x) (ORDER_PARAMS ((x))->height)
/* Mobility: slack between the latest and earliest feasible cycle.  */
#define MOB(x) (ALAP ((x)) - ASAP ((x)))
#define DEPTH(x) (ASAP ((x)))

typedef struct node_order_params * nopa;

static void order_nodes_of_sccs (ddg_all_sccs_ptr, int * result);
static int order_nodes_in_scc (ddg_ptr, sbitmap, sbitmap, int*, int);
static nopa  calculate_order_params (ddg_ptr, int, int *);
static int find_max_asap (ddg_ptr, sbitmap);
static int find_max_hv_min_mob (ddg_ptr, sbitmap);
static int find_max_dv_min_mob (ddg_ptr, sbitmap);

/* Direction of the sweep when ordering an SCC's nodes.  */
enum sms_direction {BOTTOMUP, TOPDOWN};

struct node_order_params
{
  int asap;   /* Earliest cycle, over distance-0 (intra-iteration) arcs.  */
  int alap;   /* Latest cycle, over distance-0 arcs.  */
  int height; /* Longest latency path to any successor (distance-0 arcs).  */
};
2459*38fd1498Szrj
2460*38fd1498Szrj /* Check if NODE_ORDER contains a permutation of 0 .. NUM_NODES-1. */
2461*38fd1498Szrj static void
check_nodes_order(int * node_order,int num_nodes)2462*38fd1498Szrj check_nodes_order (int *node_order, int num_nodes)
2463*38fd1498Szrj {
2464*38fd1498Szrj int i;
2465*38fd1498Szrj auto_sbitmap tmp (num_nodes);
2466*38fd1498Szrj
2467*38fd1498Szrj bitmap_clear (tmp);
2468*38fd1498Szrj
2469*38fd1498Szrj if (dump_file)
2470*38fd1498Szrj fprintf (dump_file, "SMS final nodes order: \n");
2471*38fd1498Szrj
2472*38fd1498Szrj for (i = 0; i < num_nodes; i++)
2473*38fd1498Szrj {
2474*38fd1498Szrj int u = node_order[i];
2475*38fd1498Szrj
2476*38fd1498Szrj if (dump_file)
2477*38fd1498Szrj fprintf (dump_file, "%d ", u);
2478*38fd1498Szrj gcc_assert (u < num_nodes && u >= 0 && !bitmap_bit_p (tmp, u));
2479*38fd1498Szrj
2480*38fd1498Szrj bitmap_set_bit (tmp, u);
2481*38fd1498Szrj }
2482*38fd1498Szrj
2483*38fd1498Szrj if (dump_file)
2484*38fd1498Szrj fprintf (dump_file, "\n");
2485*38fd1498Szrj }
2486*38fd1498Szrj
2487*38fd1498Szrj /* Order the nodes of G for scheduling and pass the result in
2488*38fd1498Szrj NODE_ORDER. Also set aux.count of each node to ASAP.
2489*38fd1498Szrj Put maximal ASAP to PMAX_ASAP. Return the recMII for the given DDG. */
2490*38fd1498Szrj static int
sms_order_nodes(ddg_ptr g,int mii,int * node_order,int * pmax_asap)2491*38fd1498Szrj sms_order_nodes (ddg_ptr g, int mii, int * node_order, int *pmax_asap)
2492*38fd1498Szrj {
2493*38fd1498Szrj int i;
2494*38fd1498Szrj int rec_mii = 0;
2495*38fd1498Szrj ddg_all_sccs_ptr sccs = create_ddg_all_sccs (g);
2496*38fd1498Szrj
2497*38fd1498Szrj nopa nops = calculate_order_params (g, mii, pmax_asap);
2498*38fd1498Szrj
2499*38fd1498Szrj if (dump_file)
2500*38fd1498Szrj print_sccs (dump_file, sccs, g);
2501*38fd1498Szrj
2502*38fd1498Szrj order_nodes_of_sccs (sccs, node_order);
2503*38fd1498Szrj
2504*38fd1498Szrj if (sccs->num_sccs > 0)
2505*38fd1498Szrj /* First SCC has the largest recurrence_length. */
2506*38fd1498Szrj rec_mii = sccs->sccs[0]->recurrence_length;
2507*38fd1498Szrj
2508*38fd1498Szrj /* Save ASAP before destroying node_order_params. */
2509*38fd1498Szrj for (i = 0; i < g->num_nodes; i++)
2510*38fd1498Szrj {
2511*38fd1498Szrj ddg_node_ptr v = &g->nodes[i];
2512*38fd1498Szrj v->aux.count = ASAP (v);
2513*38fd1498Szrj }
2514*38fd1498Szrj
2515*38fd1498Szrj free (nops);
2516*38fd1498Szrj free_ddg_all_sccs (sccs);
2517*38fd1498Szrj check_nodes_order (node_order, g->num_nodes);
2518*38fd1498Szrj
2519*38fd1498Szrj return rec_mii;
2520*38fd1498Szrj }
2521*38fd1498Szrj
2522*38fd1498Szrj static void
order_nodes_of_sccs(ddg_all_sccs_ptr all_sccs,int * node_order)2523*38fd1498Szrj order_nodes_of_sccs (ddg_all_sccs_ptr all_sccs, int * node_order)
2524*38fd1498Szrj {
2525*38fd1498Szrj int i, pos = 0;
2526*38fd1498Szrj ddg_ptr g = all_sccs->ddg;
2527*38fd1498Szrj int num_nodes = g->num_nodes;
2528*38fd1498Szrj auto_sbitmap prev_sccs (num_nodes);
2529*38fd1498Szrj auto_sbitmap on_path (num_nodes);
2530*38fd1498Szrj auto_sbitmap tmp (num_nodes);
2531*38fd1498Szrj auto_sbitmap ones (num_nodes);
2532*38fd1498Szrj
2533*38fd1498Szrj bitmap_clear (prev_sccs);
2534*38fd1498Szrj bitmap_ones (ones);
2535*38fd1498Szrj
2536*38fd1498Szrj /* Perform the node ordering starting from the SCC with the highest recMII.
2537*38fd1498Szrj For each SCC order the nodes according to their ASAP/ALAP/HEIGHT etc. */
2538*38fd1498Szrj for (i = 0; i < all_sccs->num_sccs; i++)
2539*38fd1498Szrj {
2540*38fd1498Szrj ddg_scc_ptr scc = all_sccs->sccs[i];
2541*38fd1498Szrj
2542*38fd1498Szrj /* Add nodes on paths from previous SCCs to the current SCC. */
2543*38fd1498Szrj find_nodes_on_paths (on_path, g, prev_sccs, scc->nodes);
2544*38fd1498Szrj bitmap_ior (tmp, scc->nodes, on_path);
2545*38fd1498Szrj
2546*38fd1498Szrj /* Add nodes on paths from the current SCC to previous SCCs. */
2547*38fd1498Szrj find_nodes_on_paths (on_path, g, scc->nodes, prev_sccs);
2548*38fd1498Szrj bitmap_ior (tmp, tmp, on_path);
2549*38fd1498Szrj
2550*38fd1498Szrj /* Remove nodes of previous SCCs from current extended SCC. */
2551*38fd1498Szrj bitmap_and_compl (tmp, tmp, prev_sccs);
2552*38fd1498Szrj
2553*38fd1498Szrj pos = order_nodes_in_scc (g, prev_sccs, tmp, node_order, pos);
2554*38fd1498Szrj /* Above call to order_nodes_in_scc updated prev_sccs |= tmp. */
2555*38fd1498Szrj }
2556*38fd1498Szrj
2557*38fd1498Szrj /* Handle the remaining nodes that do not belong to any scc. Each call
2558*38fd1498Szrj to order_nodes_in_scc handles a single connected component. */
2559*38fd1498Szrj while (pos < g->num_nodes)
2560*38fd1498Szrj {
2561*38fd1498Szrj bitmap_and_compl (tmp, ones, prev_sccs);
2562*38fd1498Szrj pos = order_nodes_in_scc (g, prev_sccs, tmp, node_order, pos);
2563*38fd1498Szrj }
2564*38fd1498Szrj }
2565*38fd1498Szrj
2566*38fd1498Szrj /* MII is needed if we consider backarcs (that do not close recursive cycles). */
2567*38fd1498Szrj static struct node_order_params *
calculate_order_params(ddg_ptr g,int mii ATTRIBUTE_UNUSED,int * pmax_asap)2568*38fd1498Szrj calculate_order_params (ddg_ptr g, int mii ATTRIBUTE_UNUSED, int *pmax_asap)
2569*38fd1498Szrj {
2570*38fd1498Szrj int u;
2571*38fd1498Szrj int max_asap;
2572*38fd1498Szrj int num_nodes = g->num_nodes;
2573*38fd1498Szrj ddg_edge_ptr e;
2574*38fd1498Szrj /* Allocate a place to hold ordering params for each node in the DDG. */
2575*38fd1498Szrj nopa node_order_params_arr;
2576*38fd1498Szrj
2577*38fd1498Szrj /* Initialize of ASAP/ALAP/HEIGHT to zero. */
2578*38fd1498Szrj node_order_params_arr = (nopa) xcalloc (num_nodes,
2579*38fd1498Szrj sizeof (struct node_order_params));
2580*38fd1498Szrj
2581*38fd1498Szrj /* Set the aux pointer of each node to point to its order_params structure. */
2582*38fd1498Szrj for (u = 0; u < num_nodes; u++)
2583*38fd1498Szrj g->nodes[u].aux.info = &node_order_params_arr[u];
2584*38fd1498Szrj
2585*38fd1498Szrj /* Disregarding a backarc from each recursive cycle to obtain a DAG,
2586*38fd1498Szrj calculate ASAP, ALAP, mobility, distance, and height for each node
2587*38fd1498Szrj in the dependence (direct acyclic) graph. */
2588*38fd1498Szrj
2589*38fd1498Szrj /* We assume that the nodes in the array are in topological order. */
2590*38fd1498Szrj
2591*38fd1498Szrj max_asap = 0;
2592*38fd1498Szrj for (u = 0; u < num_nodes; u++)
2593*38fd1498Szrj {
2594*38fd1498Szrj ddg_node_ptr u_node = &g->nodes[u];
2595*38fd1498Szrj
2596*38fd1498Szrj ASAP (u_node) = 0;
2597*38fd1498Szrj for (e = u_node->in; e; e = e->next_in)
2598*38fd1498Szrj if (e->distance == 0)
2599*38fd1498Szrj ASAP (u_node) = MAX (ASAP (u_node),
2600*38fd1498Szrj ASAP (e->src) + e->latency);
2601*38fd1498Szrj max_asap = MAX (max_asap, ASAP (u_node));
2602*38fd1498Szrj }
2603*38fd1498Szrj
2604*38fd1498Szrj for (u = num_nodes - 1; u > -1; u--)
2605*38fd1498Szrj {
2606*38fd1498Szrj ddg_node_ptr u_node = &g->nodes[u];
2607*38fd1498Szrj
2608*38fd1498Szrj ALAP (u_node) = max_asap;
2609*38fd1498Szrj HEIGHT (u_node) = 0;
2610*38fd1498Szrj for (e = u_node->out; e; e = e->next_out)
2611*38fd1498Szrj if (e->distance == 0)
2612*38fd1498Szrj {
2613*38fd1498Szrj ALAP (u_node) = MIN (ALAP (u_node),
2614*38fd1498Szrj ALAP (e->dest) - e->latency);
2615*38fd1498Szrj HEIGHT (u_node) = MAX (HEIGHT (u_node),
2616*38fd1498Szrj HEIGHT (e->dest) + e->latency);
2617*38fd1498Szrj }
2618*38fd1498Szrj }
2619*38fd1498Szrj if (dump_file)
2620*38fd1498Szrj {
2621*38fd1498Szrj fprintf (dump_file, "\nOrder params\n");
2622*38fd1498Szrj for (u = 0; u < num_nodes; u++)
2623*38fd1498Szrj {
2624*38fd1498Szrj ddg_node_ptr u_node = &g->nodes[u];
2625*38fd1498Szrj
2626*38fd1498Szrj fprintf (dump_file, "node %d, ASAP: %d, ALAP: %d, HEIGHT: %d\n", u,
2627*38fd1498Szrj ASAP (u_node), ALAP (u_node), HEIGHT (u_node));
2628*38fd1498Szrj }
2629*38fd1498Szrj }
2630*38fd1498Szrj
2631*38fd1498Szrj *pmax_asap = max_asap;
2632*38fd1498Szrj return node_order_params_arr;
2633*38fd1498Szrj }
2634*38fd1498Szrj
2635*38fd1498Szrj static int
find_max_asap(ddg_ptr g,sbitmap nodes)2636*38fd1498Szrj find_max_asap (ddg_ptr g, sbitmap nodes)
2637*38fd1498Szrj {
2638*38fd1498Szrj unsigned int u = 0;
2639*38fd1498Szrj int max_asap = -1;
2640*38fd1498Szrj int result = -1;
2641*38fd1498Szrj sbitmap_iterator sbi;
2642*38fd1498Szrj
2643*38fd1498Szrj EXECUTE_IF_SET_IN_BITMAP (nodes, 0, u, sbi)
2644*38fd1498Szrj {
2645*38fd1498Szrj ddg_node_ptr u_node = &g->nodes[u];
2646*38fd1498Szrj
2647*38fd1498Szrj if (max_asap < ASAP (u_node))
2648*38fd1498Szrj {
2649*38fd1498Szrj max_asap = ASAP (u_node);
2650*38fd1498Szrj result = u;
2651*38fd1498Szrj }
2652*38fd1498Szrj }
2653*38fd1498Szrj return result;
2654*38fd1498Szrj }
2655*38fd1498Szrj
2656*38fd1498Szrj static int
find_max_hv_min_mob(ddg_ptr g,sbitmap nodes)2657*38fd1498Szrj find_max_hv_min_mob (ddg_ptr g, sbitmap nodes)
2658*38fd1498Szrj {
2659*38fd1498Szrj unsigned int u = 0;
2660*38fd1498Szrj int max_hv = -1;
2661*38fd1498Szrj int min_mob = INT_MAX;
2662*38fd1498Szrj int result = -1;
2663*38fd1498Szrj sbitmap_iterator sbi;
2664*38fd1498Szrj
2665*38fd1498Szrj EXECUTE_IF_SET_IN_BITMAP (nodes, 0, u, sbi)
2666*38fd1498Szrj {
2667*38fd1498Szrj ddg_node_ptr u_node = &g->nodes[u];
2668*38fd1498Szrj
2669*38fd1498Szrj if (max_hv < HEIGHT (u_node))
2670*38fd1498Szrj {
2671*38fd1498Szrj max_hv = HEIGHT (u_node);
2672*38fd1498Szrj min_mob = MOB (u_node);
2673*38fd1498Szrj result = u;
2674*38fd1498Szrj }
2675*38fd1498Szrj else if ((max_hv == HEIGHT (u_node))
2676*38fd1498Szrj && (min_mob > MOB (u_node)))
2677*38fd1498Szrj {
2678*38fd1498Szrj min_mob = MOB (u_node);
2679*38fd1498Szrj result = u;
2680*38fd1498Szrj }
2681*38fd1498Szrj }
2682*38fd1498Szrj return result;
2683*38fd1498Szrj }
2684*38fd1498Szrj
2685*38fd1498Szrj static int
find_max_dv_min_mob(ddg_ptr g,sbitmap nodes)2686*38fd1498Szrj find_max_dv_min_mob (ddg_ptr g, sbitmap nodes)
2687*38fd1498Szrj {
2688*38fd1498Szrj unsigned int u = 0;
2689*38fd1498Szrj int max_dv = -1;
2690*38fd1498Szrj int min_mob = INT_MAX;
2691*38fd1498Szrj int result = -1;
2692*38fd1498Szrj sbitmap_iterator sbi;
2693*38fd1498Szrj
2694*38fd1498Szrj EXECUTE_IF_SET_IN_BITMAP (nodes, 0, u, sbi)
2695*38fd1498Szrj {
2696*38fd1498Szrj ddg_node_ptr u_node = &g->nodes[u];
2697*38fd1498Szrj
2698*38fd1498Szrj if (max_dv < DEPTH (u_node))
2699*38fd1498Szrj {
2700*38fd1498Szrj max_dv = DEPTH (u_node);
2701*38fd1498Szrj min_mob = MOB (u_node);
2702*38fd1498Szrj result = u;
2703*38fd1498Szrj }
2704*38fd1498Szrj else if ((max_dv == DEPTH (u_node))
2705*38fd1498Szrj && (min_mob > MOB (u_node)))
2706*38fd1498Szrj {
2707*38fd1498Szrj min_mob = MOB (u_node);
2708*38fd1498Szrj result = u;
2709*38fd1498Szrj }
2710*38fd1498Szrj }
2711*38fd1498Szrj return result;
2712*38fd1498Szrj }
2713*38fd1498Szrj
2714*38fd1498Szrj /* Places the nodes of SCC into the NODE_ORDER array starting
2715*38fd1498Szrj at position POS, according to the SMS ordering algorithm.
2716*38fd1498Szrj NODES_ORDERED (in&out parameter) holds the bitset of all nodes in
2717*38fd1498Szrj the NODE_ORDER array, starting from position zero. */
2718*38fd1498Szrj static int
order_nodes_in_scc(ddg_ptr g,sbitmap nodes_ordered,sbitmap scc,int * node_order,int pos)2719*38fd1498Szrj order_nodes_in_scc (ddg_ptr g, sbitmap nodes_ordered, sbitmap scc,
2720*38fd1498Szrj int * node_order, int pos)
2721*38fd1498Szrj {
2722*38fd1498Szrj enum sms_direction dir;
2723*38fd1498Szrj int num_nodes = g->num_nodes;
2724*38fd1498Szrj auto_sbitmap workset (num_nodes);
2725*38fd1498Szrj auto_sbitmap tmp (num_nodes);
2726*38fd1498Szrj sbitmap zero_bitmap = sbitmap_alloc (num_nodes);
2727*38fd1498Szrj auto_sbitmap predecessors (num_nodes);
2728*38fd1498Szrj auto_sbitmap successors (num_nodes);
2729*38fd1498Szrj
2730*38fd1498Szrj bitmap_clear (predecessors);
2731*38fd1498Szrj find_predecessors (predecessors, g, nodes_ordered);
2732*38fd1498Szrj
2733*38fd1498Szrj bitmap_clear (successors);
2734*38fd1498Szrj find_successors (successors, g, nodes_ordered);
2735*38fd1498Szrj
2736*38fd1498Szrj bitmap_clear (tmp);
2737*38fd1498Szrj if (bitmap_and (tmp, predecessors, scc))
2738*38fd1498Szrj {
2739*38fd1498Szrj bitmap_copy (workset, tmp);
2740*38fd1498Szrj dir = BOTTOMUP;
2741*38fd1498Szrj }
2742*38fd1498Szrj else if (bitmap_and (tmp, successors, scc))
2743*38fd1498Szrj {
2744*38fd1498Szrj bitmap_copy (workset, tmp);
2745*38fd1498Szrj dir = TOPDOWN;
2746*38fd1498Szrj }
2747*38fd1498Szrj else
2748*38fd1498Szrj {
2749*38fd1498Szrj int u;
2750*38fd1498Szrj
2751*38fd1498Szrj bitmap_clear (workset);
2752*38fd1498Szrj if ((u = find_max_asap (g, scc)) >= 0)
2753*38fd1498Szrj bitmap_set_bit (workset, u);
2754*38fd1498Szrj dir = BOTTOMUP;
2755*38fd1498Szrj }
2756*38fd1498Szrj
2757*38fd1498Szrj bitmap_clear (zero_bitmap);
2758*38fd1498Szrj while (!bitmap_equal_p (workset, zero_bitmap))
2759*38fd1498Szrj {
2760*38fd1498Szrj int v;
2761*38fd1498Szrj ddg_node_ptr v_node;
2762*38fd1498Szrj sbitmap v_node_preds;
2763*38fd1498Szrj sbitmap v_node_succs;
2764*38fd1498Szrj
2765*38fd1498Szrj if (dir == TOPDOWN)
2766*38fd1498Szrj {
2767*38fd1498Szrj while (!bitmap_equal_p (workset, zero_bitmap))
2768*38fd1498Szrj {
2769*38fd1498Szrj v = find_max_hv_min_mob (g, workset);
2770*38fd1498Szrj v_node = &g->nodes[v];
2771*38fd1498Szrj node_order[pos++] = v;
2772*38fd1498Szrj v_node_succs = NODE_SUCCESSORS (v_node);
2773*38fd1498Szrj bitmap_and (tmp, v_node_succs, scc);
2774*38fd1498Szrj
2775*38fd1498Szrj /* Don't consider the already ordered successors again. */
2776*38fd1498Szrj bitmap_and_compl (tmp, tmp, nodes_ordered);
2777*38fd1498Szrj bitmap_ior (workset, workset, tmp);
2778*38fd1498Szrj bitmap_clear_bit (workset, v);
2779*38fd1498Szrj bitmap_set_bit (nodes_ordered, v);
2780*38fd1498Szrj }
2781*38fd1498Szrj dir = BOTTOMUP;
2782*38fd1498Szrj bitmap_clear (predecessors);
2783*38fd1498Szrj find_predecessors (predecessors, g, nodes_ordered);
2784*38fd1498Szrj bitmap_and (workset, predecessors, scc);
2785*38fd1498Szrj }
2786*38fd1498Szrj else
2787*38fd1498Szrj {
2788*38fd1498Szrj while (!bitmap_equal_p (workset, zero_bitmap))
2789*38fd1498Szrj {
2790*38fd1498Szrj v = find_max_dv_min_mob (g, workset);
2791*38fd1498Szrj v_node = &g->nodes[v];
2792*38fd1498Szrj node_order[pos++] = v;
2793*38fd1498Szrj v_node_preds = NODE_PREDECESSORS (v_node);
2794*38fd1498Szrj bitmap_and (tmp, v_node_preds, scc);
2795*38fd1498Szrj
2796*38fd1498Szrj /* Don't consider the already ordered predecessors again. */
2797*38fd1498Szrj bitmap_and_compl (tmp, tmp, nodes_ordered);
2798*38fd1498Szrj bitmap_ior (workset, workset, tmp);
2799*38fd1498Szrj bitmap_clear_bit (workset, v);
2800*38fd1498Szrj bitmap_set_bit (nodes_ordered, v);
2801*38fd1498Szrj }
2802*38fd1498Szrj dir = TOPDOWN;
2803*38fd1498Szrj bitmap_clear (successors);
2804*38fd1498Szrj find_successors (successors, g, nodes_ordered);
2805*38fd1498Szrj bitmap_and (workset, successors, scc);
2806*38fd1498Szrj }
2807*38fd1498Szrj }
2808*38fd1498Szrj sbitmap_free (zero_bitmap);
2809*38fd1498Szrj return pos;
2810*38fd1498Szrj }
2811*38fd1498Szrj
2812*38fd1498Szrj
2813*38fd1498Szrj /* This page contains functions for manipulating partial-schedules during
2814*38fd1498Szrj modulo scheduling. */
2815*38fd1498Szrj
2816*38fd1498Szrj /* Create a partial schedule and allocate a memory to hold II rows. */
2817*38fd1498Szrj
2818*38fd1498Szrj static partial_schedule_ptr
create_partial_schedule(int ii,ddg_ptr g,int history)2819*38fd1498Szrj create_partial_schedule (int ii, ddg_ptr g, int history)
2820*38fd1498Szrj {
2821*38fd1498Szrj partial_schedule_ptr ps = XNEW (struct partial_schedule);
2822*38fd1498Szrj ps->rows = (ps_insn_ptr *) xcalloc (ii, sizeof (ps_insn_ptr));
2823*38fd1498Szrj ps->rows_length = (int *) xcalloc (ii, sizeof (int));
2824*38fd1498Szrj ps->reg_moves.create (0);
2825*38fd1498Szrj ps->ii = ii;
2826*38fd1498Szrj ps->history = history;
2827*38fd1498Szrj ps->min_cycle = INT_MAX;
2828*38fd1498Szrj ps->max_cycle = INT_MIN;
2829*38fd1498Szrj ps->g = g;
2830*38fd1498Szrj
2831*38fd1498Szrj return ps;
2832*38fd1498Szrj }
2833*38fd1498Szrj
2834*38fd1498Szrj /* Free the PS_INSNs in rows array of the given partial schedule.
2835*38fd1498Szrj ??? Consider caching the PS_INSN's. */
2836*38fd1498Szrj static void
free_ps_insns(partial_schedule_ptr ps)2837*38fd1498Szrj free_ps_insns (partial_schedule_ptr ps)
2838*38fd1498Szrj {
2839*38fd1498Szrj int i;
2840*38fd1498Szrj
2841*38fd1498Szrj for (i = 0; i < ps->ii; i++)
2842*38fd1498Szrj {
2843*38fd1498Szrj while (ps->rows[i])
2844*38fd1498Szrj {
2845*38fd1498Szrj ps_insn_ptr ps_insn = ps->rows[i]->next_in_row;
2846*38fd1498Szrj
2847*38fd1498Szrj free (ps->rows[i]);
2848*38fd1498Szrj ps->rows[i] = ps_insn;
2849*38fd1498Szrj }
2850*38fd1498Szrj ps->rows[i] = NULL;
2851*38fd1498Szrj }
2852*38fd1498Szrj }
2853*38fd1498Szrj
2854*38fd1498Szrj /* Free all the memory allocated to the partial schedule. */
2855*38fd1498Szrj
2856*38fd1498Szrj static void
free_partial_schedule(partial_schedule_ptr ps)2857*38fd1498Szrj free_partial_schedule (partial_schedule_ptr ps)
2858*38fd1498Szrj {
2859*38fd1498Szrj ps_reg_move_info *move;
2860*38fd1498Szrj unsigned int i;
2861*38fd1498Szrj
2862*38fd1498Szrj if (!ps)
2863*38fd1498Szrj return;
2864*38fd1498Szrj
2865*38fd1498Szrj FOR_EACH_VEC_ELT (ps->reg_moves, i, move)
2866*38fd1498Szrj sbitmap_free (move->uses);
2867*38fd1498Szrj ps->reg_moves.release ();
2868*38fd1498Szrj
2869*38fd1498Szrj free_ps_insns (ps);
2870*38fd1498Szrj free (ps->rows);
2871*38fd1498Szrj free (ps->rows_length);
2872*38fd1498Szrj free (ps);
2873*38fd1498Szrj }
2874*38fd1498Szrj
2875*38fd1498Szrj /* Clear the rows array with its PS_INSNs, and create a new one with
2876*38fd1498Szrj NEW_II rows. */
2877*38fd1498Szrj
2878*38fd1498Szrj static void
reset_partial_schedule(partial_schedule_ptr ps,int new_ii)2879*38fd1498Szrj reset_partial_schedule (partial_schedule_ptr ps, int new_ii)
2880*38fd1498Szrj {
2881*38fd1498Szrj if (!ps)
2882*38fd1498Szrj return;
2883*38fd1498Szrj free_ps_insns (ps);
2884*38fd1498Szrj if (new_ii == ps->ii)
2885*38fd1498Szrj return;
2886*38fd1498Szrj ps->rows = (ps_insn_ptr *) xrealloc (ps->rows, new_ii
2887*38fd1498Szrj * sizeof (ps_insn_ptr));
2888*38fd1498Szrj memset (ps->rows, 0, new_ii * sizeof (ps_insn_ptr));
2889*38fd1498Szrj ps->rows_length = (int *) xrealloc (ps->rows_length, new_ii * sizeof (int));
2890*38fd1498Szrj memset (ps->rows_length, 0, new_ii * sizeof (int));
2891*38fd1498Szrj ps->ii = new_ii;
2892*38fd1498Szrj ps->min_cycle = INT_MAX;
2893*38fd1498Szrj ps->max_cycle = INT_MIN;
2894*38fd1498Szrj }
2895*38fd1498Szrj
2896*38fd1498Szrj /* Prints the partial schedule as an ii rows array, for each rows
2897*38fd1498Szrj print the ids of the insns in it. */
2898*38fd1498Szrj void
print_partial_schedule(partial_schedule_ptr ps,FILE * dump)2899*38fd1498Szrj print_partial_schedule (partial_schedule_ptr ps, FILE *dump)
2900*38fd1498Szrj {
2901*38fd1498Szrj int i;
2902*38fd1498Szrj
2903*38fd1498Szrj for (i = 0; i < ps->ii; i++)
2904*38fd1498Szrj {
2905*38fd1498Szrj ps_insn_ptr ps_i = ps->rows[i];
2906*38fd1498Szrj
2907*38fd1498Szrj fprintf (dump, "\n[ROW %d ]: ", i);
2908*38fd1498Szrj while (ps_i)
2909*38fd1498Szrj {
2910*38fd1498Szrj rtx_insn *insn = ps_rtl_insn (ps, ps_i->id);
2911*38fd1498Szrj
2912*38fd1498Szrj if (JUMP_P (insn))
2913*38fd1498Szrj fprintf (dump, "%d (branch), ", INSN_UID (insn));
2914*38fd1498Szrj else
2915*38fd1498Szrj fprintf (dump, "%d, ", INSN_UID (insn));
2916*38fd1498Szrj
2917*38fd1498Szrj ps_i = ps_i->next_in_row;
2918*38fd1498Szrj }
2919*38fd1498Szrj }
2920*38fd1498Szrj }
2921*38fd1498Szrj
2922*38fd1498Szrj /* Creates an object of PS_INSN and initializes it to the given parameters. */
2923*38fd1498Szrj static ps_insn_ptr
create_ps_insn(int id,int cycle)2924*38fd1498Szrj create_ps_insn (int id, int cycle)
2925*38fd1498Szrj {
2926*38fd1498Szrj ps_insn_ptr ps_i = XNEW (struct ps_insn);
2927*38fd1498Szrj
2928*38fd1498Szrj ps_i->id = id;
2929*38fd1498Szrj ps_i->next_in_row = NULL;
2930*38fd1498Szrj ps_i->prev_in_row = NULL;
2931*38fd1498Szrj ps_i->cycle = cycle;
2932*38fd1498Szrj
2933*38fd1498Szrj return ps_i;
2934*38fd1498Szrj }
2935*38fd1498Szrj
2936*38fd1498Szrj
2937*38fd1498Szrj /* Removes the given PS_INSN from the partial schedule. */
2938*38fd1498Szrj static void
remove_node_from_ps(partial_schedule_ptr ps,ps_insn_ptr ps_i)2939*38fd1498Szrj remove_node_from_ps (partial_schedule_ptr ps, ps_insn_ptr ps_i)
2940*38fd1498Szrj {
2941*38fd1498Szrj int row;
2942*38fd1498Szrj
2943*38fd1498Szrj gcc_assert (ps && ps_i);
2944*38fd1498Szrj
2945*38fd1498Szrj row = SMODULO (ps_i->cycle, ps->ii);
2946*38fd1498Szrj if (! ps_i->prev_in_row)
2947*38fd1498Szrj {
2948*38fd1498Szrj gcc_assert (ps_i == ps->rows[row]);
2949*38fd1498Szrj ps->rows[row] = ps_i->next_in_row;
2950*38fd1498Szrj if (ps->rows[row])
2951*38fd1498Szrj ps->rows[row]->prev_in_row = NULL;
2952*38fd1498Szrj }
2953*38fd1498Szrj else
2954*38fd1498Szrj {
2955*38fd1498Szrj ps_i->prev_in_row->next_in_row = ps_i->next_in_row;
2956*38fd1498Szrj if (ps_i->next_in_row)
2957*38fd1498Szrj ps_i->next_in_row->prev_in_row = ps_i->prev_in_row;
2958*38fd1498Szrj }
2959*38fd1498Szrj
2960*38fd1498Szrj ps->rows_length[row] -= 1;
2961*38fd1498Szrj free (ps_i);
2962*38fd1498Szrj return;
2963*38fd1498Szrj }
2964*38fd1498Szrj
2965*38fd1498Szrj /* Unlike what literature describes for modulo scheduling (which focuses
2966*38fd1498Szrj on VLIW machines) the order of the instructions inside a cycle is
2967*38fd1498Szrj important. Given the bitmaps MUST_FOLLOW and MUST_PRECEDE we know
2968*38fd1498Szrj where the current instruction should go relative to the already
2969*38fd1498Szrj scheduled instructions in the given cycle. Go over these
2970*38fd1498Szrj instructions and find the first possible column to put it in. */
2971*38fd1498Szrj static bool
ps_insn_find_column(partial_schedule_ptr ps,ps_insn_ptr ps_i,sbitmap must_precede,sbitmap must_follow)2972*38fd1498Szrj ps_insn_find_column (partial_schedule_ptr ps, ps_insn_ptr ps_i,
2973*38fd1498Szrj sbitmap must_precede, sbitmap must_follow)
2974*38fd1498Szrj {
2975*38fd1498Szrj ps_insn_ptr next_ps_i;
2976*38fd1498Szrj ps_insn_ptr first_must_follow = NULL;
2977*38fd1498Szrj ps_insn_ptr last_must_precede = NULL;
2978*38fd1498Szrj ps_insn_ptr last_in_row = NULL;
2979*38fd1498Szrj int row;
2980*38fd1498Szrj
2981*38fd1498Szrj if (! ps_i)
2982*38fd1498Szrj return false;
2983*38fd1498Szrj
2984*38fd1498Szrj row = SMODULO (ps_i->cycle, ps->ii);
2985*38fd1498Szrj
2986*38fd1498Szrj /* Find the first must follow and the last must precede
2987*38fd1498Szrj and insert the node immediately after the must precede
2988*38fd1498Szrj but make sure that it there is no must follow after it. */
2989*38fd1498Szrj for (next_ps_i = ps->rows[row];
2990*38fd1498Szrj next_ps_i;
2991*38fd1498Szrj next_ps_i = next_ps_i->next_in_row)
2992*38fd1498Szrj {
2993*38fd1498Szrj if (must_follow
2994*38fd1498Szrj && bitmap_bit_p (must_follow, next_ps_i->id)
2995*38fd1498Szrj && ! first_must_follow)
2996*38fd1498Szrj first_must_follow = next_ps_i;
2997*38fd1498Szrj if (must_precede && bitmap_bit_p (must_precede, next_ps_i->id))
2998*38fd1498Szrj {
2999*38fd1498Szrj /* If we have already met a node that must follow, then
3000*38fd1498Szrj there is no possible column. */
3001*38fd1498Szrj if (first_must_follow)
3002*38fd1498Szrj return false;
3003*38fd1498Szrj else
3004*38fd1498Szrj last_must_precede = next_ps_i;
3005*38fd1498Szrj }
3006*38fd1498Szrj /* The closing branch must be the last in the row. */
3007*38fd1498Szrj if (must_precede
3008*38fd1498Szrj && bitmap_bit_p (must_precede, next_ps_i->id)
3009*38fd1498Szrj && JUMP_P (ps_rtl_insn (ps, next_ps_i->id)))
3010*38fd1498Szrj return false;
3011*38fd1498Szrj
3012*38fd1498Szrj last_in_row = next_ps_i;
3013*38fd1498Szrj }
3014*38fd1498Szrj
3015*38fd1498Szrj /* The closing branch is scheduled as well. Make sure there is no
3016*38fd1498Szrj dependent instruction after it as the branch should be the last
3017*38fd1498Szrj instruction in the row. */
3018*38fd1498Szrj if (JUMP_P (ps_rtl_insn (ps, ps_i->id)))
3019*38fd1498Szrj {
3020*38fd1498Szrj if (first_must_follow)
3021*38fd1498Szrj return false;
3022*38fd1498Szrj if (last_in_row)
3023*38fd1498Szrj {
3024*38fd1498Szrj /* Make the branch the last in the row. New instructions
3025*38fd1498Szrj will be inserted at the beginning of the row or after the
3026*38fd1498Szrj last must_precede instruction thus the branch is guaranteed
3027*38fd1498Szrj to remain the last instruction in the row. */
3028*38fd1498Szrj last_in_row->next_in_row = ps_i;
3029*38fd1498Szrj ps_i->prev_in_row = last_in_row;
3030*38fd1498Szrj ps_i->next_in_row = NULL;
3031*38fd1498Szrj }
3032*38fd1498Szrj else
3033*38fd1498Szrj ps->rows[row] = ps_i;
3034*38fd1498Szrj return true;
3035*38fd1498Szrj }
3036*38fd1498Szrj
3037*38fd1498Szrj /* Now insert the node after INSERT_AFTER_PSI. */
3038*38fd1498Szrj
3039*38fd1498Szrj if (! last_must_precede)
3040*38fd1498Szrj {
3041*38fd1498Szrj ps_i->next_in_row = ps->rows[row];
3042*38fd1498Szrj ps_i->prev_in_row = NULL;
3043*38fd1498Szrj if (ps_i->next_in_row)
3044*38fd1498Szrj ps_i->next_in_row->prev_in_row = ps_i;
3045*38fd1498Szrj ps->rows[row] = ps_i;
3046*38fd1498Szrj }
3047*38fd1498Szrj else
3048*38fd1498Szrj {
3049*38fd1498Szrj ps_i->next_in_row = last_must_precede->next_in_row;
3050*38fd1498Szrj last_must_precede->next_in_row = ps_i;
3051*38fd1498Szrj ps_i->prev_in_row = last_must_precede;
3052*38fd1498Szrj if (ps_i->next_in_row)
3053*38fd1498Szrj ps_i->next_in_row->prev_in_row = ps_i;
3054*38fd1498Szrj }
3055*38fd1498Szrj
3056*38fd1498Szrj return true;
3057*38fd1498Szrj }
3058*38fd1498Szrj
3059*38fd1498Szrj /* Advances the PS_INSN one column in its current row; returns false
3060*38fd1498Szrj in failure and true in success. Bit N is set in MUST_FOLLOW if
3061*38fd1498Szrj the node with cuid N must be come after the node pointed to by
3062*38fd1498Szrj PS_I when scheduled in the same cycle. */
3063*38fd1498Szrj static int
ps_insn_advance_column(partial_schedule_ptr ps,ps_insn_ptr ps_i,sbitmap must_follow)3064*38fd1498Szrj ps_insn_advance_column (partial_schedule_ptr ps, ps_insn_ptr ps_i,
3065*38fd1498Szrj sbitmap must_follow)
3066*38fd1498Szrj {
3067*38fd1498Szrj ps_insn_ptr prev, next;
3068*38fd1498Szrj int row;
3069*38fd1498Szrj
3070*38fd1498Szrj if (!ps || !ps_i)
3071*38fd1498Szrj return false;
3072*38fd1498Szrj
3073*38fd1498Szrj row = SMODULO (ps_i->cycle, ps->ii);
3074*38fd1498Szrj
3075*38fd1498Szrj if (! ps_i->next_in_row)
3076*38fd1498Szrj return false;
3077*38fd1498Szrj
3078*38fd1498Szrj /* Check if next_in_row is dependent on ps_i, both having same sched
3079*38fd1498Szrj times (typically ANTI_DEP). If so, ps_i cannot skip over it. */
3080*38fd1498Szrj if (must_follow && bitmap_bit_p (must_follow, ps_i->next_in_row->id))
3081*38fd1498Szrj return false;
3082*38fd1498Szrj
3083*38fd1498Szrj /* Advance PS_I over its next_in_row in the doubly linked list. */
3084*38fd1498Szrj prev = ps_i->prev_in_row;
3085*38fd1498Szrj next = ps_i->next_in_row;
3086*38fd1498Szrj
3087*38fd1498Szrj if (ps_i == ps->rows[row])
3088*38fd1498Szrj ps->rows[row] = next;
3089*38fd1498Szrj
3090*38fd1498Szrj ps_i->next_in_row = next->next_in_row;
3091*38fd1498Szrj
3092*38fd1498Szrj if (next->next_in_row)
3093*38fd1498Szrj next->next_in_row->prev_in_row = ps_i;
3094*38fd1498Szrj
3095*38fd1498Szrj next->next_in_row = ps_i;
3096*38fd1498Szrj ps_i->prev_in_row = next;
3097*38fd1498Szrj
3098*38fd1498Szrj next->prev_in_row = prev;
3099*38fd1498Szrj if (prev)
3100*38fd1498Szrj prev->next_in_row = next;
3101*38fd1498Szrj
3102*38fd1498Szrj return true;
3103*38fd1498Szrj }
3104*38fd1498Szrj
3105*38fd1498Szrj /* Inserts a DDG_NODE to the given partial schedule at the given cycle.
3106*38fd1498Szrj Returns 0 if this is not possible and a PS_INSN otherwise. Bit N is
3107*38fd1498Szrj set in MUST_PRECEDE/MUST_FOLLOW if the node with cuid N must be come
3108*38fd1498Szrj before/after (respectively) the node pointed to by PS_I when scheduled
3109*38fd1498Szrj in the same cycle. */
3110*38fd1498Szrj static ps_insn_ptr
add_node_to_ps(partial_schedule_ptr ps,int id,int cycle,sbitmap must_precede,sbitmap must_follow)3111*38fd1498Szrj add_node_to_ps (partial_schedule_ptr ps, int id, int cycle,
3112*38fd1498Szrj sbitmap must_precede, sbitmap must_follow)
3113*38fd1498Szrj {
3114*38fd1498Szrj ps_insn_ptr ps_i;
3115*38fd1498Szrj int row = SMODULO (cycle, ps->ii);
3116*38fd1498Szrj
3117*38fd1498Szrj if (ps->rows_length[row] >= issue_rate)
3118*38fd1498Szrj return NULL;
3119*38fd1498Szrj
3120*38fd1498Szrj ps_i = create_ps_insn (id, cycle);
3121*38fd1498Szrj
3122*38fd1498Szrj /* Finds and inserts PS_I according to MUST_FOLLOW and
3123*38fd1498Szrj MUST_PRECEDE. */
3124*38fd1498Szrj if (! ps_insn_find_column (ps, ps_i, must_precede, must_follow))
3125*38fd1498Szrj {
3126*38fd1498Szrj free (ps_i);
3127*38fd1498Szrj return NULL;
3128*38fd1498Szrj }
3129*38fd1498Szrj
3130*38fd1498Szrj ps->rows_length[row] += 1;
3131*38fd1498Szrj return ps_i;
3132*38fd1498Szrj }
3133*38fd1498Szrj
3134*38fd1498Szrj /* Advance time one cycle. Assumes DFA is being used. */
3135*38fd1498Szrj static void
advance_one_cycle(void)3136*38fd1498Szrj advance_one_cycle (void)
3137*38fd1498Szrj {
3138*38fd1498Szrj if (targetm.sched.dfa_pre_cycle_insn)
3139*38fd1498Szrj state_transition (curr_state,
3140*38fd1498Szrj targetm.sched.dfa_pre_cycle_insn ());
3141*38fd1498Szrj
3142*38fd1498Szrj state_transition (curr_state, NULL);
3143*38fd1498Szrj
3144*38fd1498Szrj if (targetm.sched.dfa_post_cycle_insn)
3145*38fd1498Szrj state_transition (curr_state,
3146*38fd1498Szrj targetm.sched.dfa_post_cycle_insn ());
3147*38fd1498Szrj }
3148*38fd1498Szrj
3149*38fd1498Szrj
3150*38fd1498Szrj
3151*38fd1498Szrj /* Checks if PS has resource conflicts according to DFA, starting from
3152*38fd1498Szrj FROM cycle to TO cycle; returns true if there are conflicts and false
3153*38fd1498Szrj if there are no conflicts. Assumes DFA is being used. */
3154*38fd1498Szrj static int
ps_has_conflicts(partial_schedule_ptr ps,int from,int to)3155*38fd1498Szrj ps_has_conflicts (partial_schedule_ptr ps, int from, int to)
3156*38fd1498Szrj {
3157*38fd1498Szrj int cycle;
3158*38fd1498Szrj
3159*38fd1498Szrj state_reset (curr_state);
3160*38fd1498Szrj
3161*38fd1498Szrj for (cycle = from; cycle <= to; cycle++)
3162*38fd1498Szrj {
3163*38fd1498Szrj ps_insn_ptr crr_insn;
3164*38fd1498Szrj /* Holds the remaining issue slots in the current row. */
3165*38fd1498Szrj int can_issue_more = issue_rate;
3166*38fd1498Szrj
3167*38fd1498Szrj /* Walk through the DFA for the current row. */
3168*38fd1498Szrj for (crr_insn = ps->rows[SMODULO (cycle, ps->ii)];
3169*38fd1498Szrj crr_insn;
3170*38fd1498Szrj crr_insn = crr_insn->next_in_row)
3171*38fd1498Szrj {
3172*38fd1498Szrj rtx_insn *insn = ps_rtl_insn (ps, crr_insn->id);
3173*38fd1498Szrj
3174*38fd1498Szrj if (!NONDEBUG_INSN_P (insn))
3175*38fd1498Szrj continue;
3176*38fd1498Szrj
3177*38fd1498Szrj /* Check if there is room for the current insn. */
3178*38fd1498Szrj if (!can_issue_more || state_dead_lock_p (curr_state))
3179*38fd1498Szrj return true;
3180*38fd1498Szrj
3181*38fd1498Szrj /* Update the DFA state and return with failure if the DFA found
3182*38fd1498Szrj resource conflicts. */
3183*38fd1498Szrj if (state_transition (curr_state, insn) >= 0)
3184*38fd1498Szrj return true;
3185*38fd1498Szrj
3186*38fd1498Szrj if (targetm.sched.variable_issue)
3187*38fd1498Szrj can_issue_more =
3188*38fd1498Szrj targetm.sched.variable_issue (sched_dump, sched_verbose,
3189*38fd1498Szrj insn, can_issue_more);
3190*38fd1498Szrj /* A naked CLOBBER or USE generates no instruction, so don't
3191*38fd1498Szrj let them consume issue slots. */
3192*38fd1498Szrj else if (GET_CODE (PATTERN (insn)) != USE
3193*38fd1498Szrj && GET_CODE (PATTERN (insn)) != CLOBBER)
3194*38fd1498Szrj can_issue_more--;
3195*38fd1498Szrj }
3196*38fd1498Szrj
3197*38fd1498Szrj /* Advance the DFA to the next cycle. */
3198*38fd1498Szrj advance_one_cycle ();
3199*38fd1498Szrj }
3200*38fd1498Szrj return false;
3201*38fd1498Szrj }
3202*38fd1498Szrj
3203*38fd1498Szrj /* Checks if the given node causes resource conflicts when added to PS at
3204*38fd1498Szrj cycle C. If not the node is added to PS and returned; otherwise zero
3205*38fd1498Szrj is returned. Bit N is set in MUST_PRECEDE/MUST_FOLLOW if the node with
3206*38fd1498Szrj cuid N must be come before/after (respectively) the node pointed to by
3207*38fd1498Szrj PS_I when scheduled in the same cycle. */
3208*38fd1498Szrj ps_insn_ptr
ps_add_node_check_conflicts(partial_schedule_ptr ps,int n,int c,sbitmap must_precede,sbitmap must_follow)3209*38fd1498Szrj ps_add_node_check_conflicts (partial_schedule_ptr ps, int n,
3210*38fd1498Szrj int c, sbitmap must_precede,
3211*38fd1498Szrj sbitmap must_follow)
3212*38fd1498Szrj {
3213*38fd1498Szrj int has_conflicts = 0;
3214*38fd1498Szrj ps_insn_ptr ps_i;
3215*38fd1498Szrj
3216*38fd1498Szrj /* First add the node to the PS, if this succeeds check for
3217*38fd1498Szrj conflicts, trying different issue slots in the same row. */
3218*38fd1498Szrj if (! (ps_i = add_node_to_ps (ps, n, c, must_precede, must_follow)))
3219*38fd1498Szrj return NULL; /* Failed to insert the node at the given cycle. */
3220*38fd1498Szrj
3221*38fd1498Szrj has_conflicts = ps_has_conflicts (ps, c, c)
3222*38fd1498Szrj || (ps->history > 0
3223*38fd1498Szrj && ps_has_conflicts (ps,
3224*38fd1498Szrj c - ps->history,
3225*38fd1498Szrj c + ps->history));
3226*38fd1498Szrj
3227*38fd1498Szrj /* Try different issue slots to find one that the given node can be
3228*38fd1498Szrj scheduled in without conflicts. */
3229*38fd1498Szrj while (has_conflicts)
3230*38fd1498Szrj {
3231*38fd1498Szrj if (! ps_insn_advance_column (ps, ps_i, must_follow))
3232*38fd1498Szrj break;
3233*38fd1498Szrj has_conflicts = ps_has_conflicts (ps, c, c)
3234*38fd1498Szrj || (ps->history > 0
3235*38fd1498Szrj && ps_has_conflicts (ps,
3236*38fd1498Szrj c - ps->history,
3237*38fd1498Szrj c + ps->history));
3238*38fd1498Szrj }
3239*38fd1498Szrj
3240*38fd1498Szrj if (has_conflicts)
3241*38fd1498Szrj {
3242*38fd1498Szrj remove_node_from_ps (ps, ps_i);
3243*38fd1498Szrj return NULL;
3244*38fd1498Szrj }
3245*38fd1498Szrj
3246*38fd1498Szrj ps->min_cycle = MIN (ps->min_cycle, c);
3247*38fd1498Szrj ps->max_cycle = MAX (ps->max_cycle, c);
3248*38fd1498Szrj return ps_i;
3249*38fd1498Szrj }
3250*38fd1498Szrj
3251*38fd1498Szrj /* Calculate the stage count of the partial schedule PS. The calculation
3252*38fd1498Szrj takes into account the rotation amount passed in ROTATION_AMOUNT. */
3253*38fd1498Szrj int
calculate_stage_count(partial_schedule_ptr ps,int rotation_amount)3254*38fd1498Szrj calculate_stage_count (partial_schedule_ptr ps, int rotation_amount)
3255*38fd1498Szrj {
3256*38fd1498Szrj int new_min_cycle = PS_MIN_CYCLE (ps) - rotation_amount;
3257*38fd1498Szrj int new_max_cycle = PS_MAX_CYCLE (ps) - rotation_amount;
3258*38fd1498Szrj int stage_count = CALC_STAGE_COUNT (-1, new_min_cycle, ps->ii);
3259*38fd1498Szrj
3260*38fd1498Szrj /* The calculation of stage count is done adding the number of stages
3261*38fd1498Szrj before cycle zero and after cycle zero. */
3262*38fd1498Szrj stage_count += CALC_STAGE_COUNT (new_max_cycle, 0, ps->ii);
3263*38fd1498Szrj
3264*38fd1498Szrj return stage_count;
3265*38fd1498Szrj }
3266*38fd1498Szrj
3267*38fd1498Szrj /* Rotate the rows of PS such that insns scheduled at time
3268*38fd1498Szrj START_CYCLE will appear in row 0. Updates max/min_cycles. */
3269*38fd1498Szrj void
rotate_partial_schedule(partial_schedule_ptr ps,int start_cycle)3270*38fd1498Szrj rotate_partial_schedule (partial_schedule_ptr ps, int start_cycle)
3271*38fd1498Szrj {
3272*38fd1498Szrj int i, row, backward_rotates;
3273*38fd1498Szrj int last_row = ps->ii - 1;
3274*38fd1498Szrj
3275*38fd1498Szrj if (start_cycle == 0)
3276*38fd1498Szrj return;
3277*38fd1498Szrj
3278*38fd1498Szrj backward_rotates = SMODULO (start_cycle, ps->ii);
3279*38fd1498Szrj
3280*38fd1498Szrj /* Revisit later and optimize this into a single loop. */
3281*38fd1498Szrj for (i = 0; i < backward_rotates; i++)
3282*38fd1498Szrj {
3283*38fd1498Szrj ps_insn_ptr first_row = ps->rows[0];
3284*38fd1498Szrj int first_row_length = ps->rows_length[0];
3285*38fd1498Szrj
3286*38fd1498Szrj for (row = 0; row < last_row; row++)
3287*38fd1498Szrj {
3288*38fd1498Szrj ps->rows[row] = ps->rows[row + 1];
3289*38fd1498Szrj ps->rows_length[row] = ps->rows_length[row + 1];
3290*38fd1498Szrj }
3291*38fd1498Szrj
3292*38fd1498Szrj ps->rows[last_row] = first_row;
3293*38fd1498Szrj ps->rows_length[last_row] = first_row_length;
3294*38fd1498Szrj }
3295*38fd1498Szrj
3296*38fd1498Szrj ps->max_cycle -= start_cycle;
3297*38fd1498Szrj ps->min_cycle -= start_cycle;
3298*38fd1498Szrj }
3299*38fd1498Szrj
3300*38fd1498Szrj #endif /* INSN_SCHEDULING */
3301*38fd1498Szrj
3302*38fd1498Szrj /* Run instruction scheduler. */
3303*38fd1498Szrj /* Perform SMS module scheduling. */
3304*38fd1498Szrj
3305*38fd1498Szrj namespace {
3306*38fd1498Szrj
3307*38fd1498Szrj const pass_data pass_data_sms =
3308*38fd1498Szrj {
3309*38fd1498Szrj RTL_PASS, /* type */
3310*38fd1498Szrj "sms", /* name */
3311*38fd1498Szrj OPTGROUP_NONE, /* optinfo_flags */
3312*38fd1498Szrj TV_SMS, /* tv_id */
3313*38fd1498Szrj 0, /* properties_required */
3314*38fd1498Szrj 0, /* properties_provided */
3315*38fd1498Szrj 0, /* properties_destroyed */
3316*38fd1498Szrj 0, /* todo_flags_start */
3317*38fd1498Szrj TODO_df_finish, /* todo_flags_finish */
3318*38fd1498Szrj };
3319*38fd1498Szrj
3320*38fd1498Szrj class pass_sms : public rtl_opt_pass
3321*38fd1498Szrj {
3322*38fd1498Szrj public:
pass_sms(gcc::context * ctxt)3323*38fd1498Szrj pass_sms (gcc::context *ctxt)
3324*38fd1498Szrj : rtl_opt_pass (pass_data_sms, ctxt)
3325*38fd1498Szrj {}
3326*38fd1498Szrj
3327*38fd1498Szrj /* opt_pass methods: */
gate(function *)3328*38fd1498Szrj virtual bool gate (function *)
3329*38fd1498Szrj {
3330*38fd1498Szrj return (optimize > 0 && flag_modulo_sched);
3331*38fd1498Szrj }
3332*38fd1498Szrj
3333*38fd1498Szrj virtual unsigned int execute (function *);
3334*38fd1498Szrj
3335*38fd1498Szrj }; // class pass_sms
3336*38fd1498Szrj
3337*38fd1498Szrj unsigned int
execute(function * fun ATTRIBUTE_UNUSED)3338*38fd1498Szrj pass_sms::execute (function *fun ATTRIBUTE_UNUSED)
3339*38fd1498Szrj {
3340*38fd1498Szrj #ifdef INSN_SCHEDULING
3341*38fd1498Szrj basic_block bb;
3342*38fd1498Szrj
3343*38fd1498Szrj /* Collect loop information to be used in SMS. */
3344*38fd1498Szrj cfg_layout_initialize (0);
3345*38fd1498Szrj sms_schedule ();
3346*38fd1498Szrj
3347*38fd1498Szrj /* Update the life information, because we add pseudos. */
3348*38fd1498Szrj max_regno = max_reg_num ();
3349*38fd1498Szrj
3350*38fd1498Szrj /* Finalize layout changes. */
3351*38fd1498Szrj FOR_EACH_BB_FN (bb, fun)
3352*38fd1498Szrj if (bb->next_bb != EXIT_BLOCK_PTR_FOR_FN (fun))
3353*38fd1498Szrj bb->aux = bb->next_bb;
3354*38fd1498Szrj free_dominance_info (CDI_DOMINATORS);
3355*38fd1498Szrj cfg_layout_finalize ();
3356*38fd1498Szrj #endif /* INSN_SCHEDULING */
3357*38fd1498Szrj return 0;
3358*38fd1498Szrj }
3359*38fd1498Szrj
3360*38fd1498Szrj } // anon namespace
3361*38fd1498Szrj
3362*38fd1498Szrj rtl_opt_pass *
make_pass_sms(gcc::context * ctxt)3363*38fd1498Szrj make_pass_sms (gcc::context *ctxt)
3364*38fd1498Szrj {
3365*38fd1498Szrj return new pass_sms (ctxt);
3366*38fd1498Szrj }
3367