xref: /netbsd-src/external/gpl3/gcc.old/dist/gcc/config/rs6000/cell.md (revision 230b95665bbd3a9d1a53658a36b1053f8382a519)
1;; Scheduling description for cell processor.
2;; Copyright (C) 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2009
3;; Free Software Foundation, Inc.
4;; Contributed by Sony Computer Entertainment, Inc.,
5
6
7;; This file is free software; you can redistribute it and/or modify it under
8;; the terms of the GNU General Public License as published by the Free
9;; Software Foundation; either version 3 of the License, or (at your option)
10;; any later version.
11
12;; This file is distributed in the hope that it will be useful, but WITHOUT
13;; ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
14;; FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
15;; for more details.
16
17;; You should have received a copy of the GNU General Public License
18;; along with GCC; see the file COPYING3.  If not see
19;; <http://www.gnu.org/licenses/>.
20
21;; Sources: BE BOOK4 (/sfs/enc/doc/PPU_BookIV_DD3.0_latest.pdf)
22
23;; BE Architecture *DD3.0 and DD3.1*
24;; This file simulates the PPU processor unit backend of the pipeline, manual P24.
25;; manual P27, stall and flush points
26;; IU, XU, VSU, dispatcher decodes and dispatch 2 insns per cycle in program
27;;  order, the grouped address are aligned by 8
28;; This file only simulates the single-thread situation
29;; XU executes all fixed point insns(3 units, a simple alu, a complex unit,
30;;   and load/store unit)
31;; VSU executes all scalar floating points insn(a float unit),
32;;   VMX insns(VMX unit, 4 sub units, simple, permute, complex, floating point)
33
34;; Dual issue combination
35
36;;	FXU	LSU	BR 	        VMX	               VMX
37;;                             (sx,cx,vsu_fp,fp_arith)    (perm,vsu_ls,fp_ls)
38;;FXU	X
39;;LSU		X               	X               	X
40;;BR			X
41;;VMX(sx,cx,vsu_fp,fp_arth)		X
42;;VMX(perm,vsu_ls, fp_ls)					X
43;;    X marks an illegal combination.
44
45;; Dual issue exceptions:
46;;(1) non-pipelined FXU instr in slot 0
47;;(2) non-pipelined FPU inst in slot 0
48;; CSI instr (context-synchronizing insn)
49;; Microcode insn
50
51;; BRU unit: bru(none register stall), bru_cr(cr register stall)
52;; VSU unit: vus(vmx simple), vup(vmx permute), vuc(vmx complex),
53;;  vuf(vmx float), fpu(floats). fpu_div is hypothetical, it is for
54;;  nonpipelined simulation
55;; microcoded insns will stall at least 7 cycles to get the first instr from ROM,
56;;  micro instructions are not dual issued.
57
58;; slot0 is older than slot1
59;; non-pipelined insns need to be in slot1 to avoid a 1-cycle stall
60
61;; There are different stall points
62;; IB2, only stall one thread if stall here, so try to stall here as much as
63;; we can
64;; condition(1) insert nop, OR and ORI instruction form
65;; condition(2) flush happens, in case of: RAW, WAW, D-ERAT miss, or
66;;   CR0-access while stdcx, or stwcx
67;; IS2 stall ;; Page91 for details
68;; VQ8 stall
69;; IS2 stall can be activated by VQ8 stall and trying to issue a vsu instr to
70;;  the vsu issue queue
71
72;;(define_automaton "cellxu")
73
74;;(define_cpu_unit "fxu_cell,lsu_cell,bru_cell,vsu1_cell,vsu2_cell" "cellxu")
75
76;; ndfa
;; Four automata: fixed point/load-store (cellxu), vector/scalar FP (cellvsu),
;; branch (cellbru) and the issue slots (cell_mis).
77(define_automaton "cellxu,cellvsu,cellbru,cell_mis")
78
;; Functional units, each assigned to the automaton named in the second string.
79(define_cpu_unit "fxu_cell,lsu_cell" "cellxu")
80(define_cpu_unit "bru_cell" "cellbru")
81(define_cpu_unit "vsu1_cell,vsu2_cell" "cellvsu")
82
;; The two issue slots; slot0 is the older one (see the notes at the top).
83(define_cpu_unit "slot0,slot1" "cell_mis")
84
;; slot0 can be reserved only when slot1 is not reserved.
85(absence_set "slot0" "slot1")
86
;; "nonpipeline" reserves the FXU, the LSU and both VSU units simultaneously;
;; "slot01" is either issue slot.
87(define_reservation "nonpipeline" "fxu_cell+lsu_cell+vsu1_cell+vsu2_cell")
88(define_reservation "slot01" "slot0|slot1")
89
90
91;; Load/store
92;; lmw, lswi and lswx are only generated when optimizing for space (MC);
93;;   those instructions are not simulated.
94(define_insn_reservation "cell-load" 2
95  (and (eq_attr "cpu" "cell")
96       (eq_attr "type" "load"))
97  "slot01,lsu_cell")
98
99;; ldux, ldu, lbzux, lbzu: the hardware breaks each one down into two
100;;  instructions (if with 32-byte alignment, CMC).
101(define_insn_reservation "cell-load-ux" 2
102  (and (eq_attr "cpu" "cell")
103       (eq_attr "type" "load_ux,load_u"))
104  "slot01,fxu_cell+lsu_cell")
105
106;; lha, lhax, lhau, lhaux, lwa, lwax, lwaux: MC, latency unknown
107;;   (11/7, 11/8, 11/12).
108(define_insn_reservation "cell-load-ext" 2
109  (and (eq_attr "cpu" "cell")
110       (eq_attr "type" "load_ext,load_ext_u,load_ext_ux"))
111  "slot01,fxu_cell+lsu_cell")
112
113;; lfs, lfsx, lfd, lfdx: 1 cycle
114(define_insn_reservation "cell-fpload" 1
115  (and (eq_attr "cpu" "cell")
116       (eq_attr "type" "fpload"))
117  "vsu2_cell+lsu_cell+slot01")
118
119;; lfsu,lfsux,lfdu,lfdux 1cycle(fpr) 2 cycle(gpr)
;; Only the update forms are matched here.  Plain "fpload" is handled by the
;; "cell-fpload" reservation, so listing it again in this condition was a dead
;; alternative; the other *-update reservations list only their _u/_ux types.
120(define_insn_reservation "cell-fpload-update" 1
121  (and (eq_attr "type" "fpload_u,fpload_ux")
122       (eq_attr "cpu" "cell"))
123  "fxu_cell+vsu2_cell+lsu_cell+slot01")
124
125(define_insn_reservation "cell-vecload" 2
126  (and (eq_attr "cpu" "cell")
127       (eq_attr "type" "vecload"))
128  "slot01,vsu2_cell+lsu_cell")
129
130;; st?, stw (MC)
131(define_insn_reservation "cell-store" 1
132  (and (eq_attr "cpu" "cell")
133       (eq_attr "type" "store"))
134  "lsu_cell+slot01")
135
136;; stdux, stdu: hardware breaks these into a store and an add; 2 for update reg
137(define_insn_reservation "cell-store-update" 1
138  (and (eq_attr "cpu" "cell")
139       (eq_attr "type" "store_ux,store_u"))
140  "fxu_cell+lsu_cell+slot01")
141
142(define_insn_reservation "cell-fpstore" 1
143  (and (eq_attr "cpu" "cell")
144       (eq_attr "type" "fpstore"))
145  "vsu2_cell+lsu_cell+slot01")
146
147(define_insn_reservation "cell-fpstore-update" 1
148  (and (eq_attr "cpu" "cell")
149       (eq_attr "type" "fpstore_ux,fpstore_u"))
150  "vsu2_cell+fxu_cell+lsu_cell+slot01")
151
152(define_insn_reservation "cell-vecstore" 1
153  (and (eq_attr "cpu" "cell")
154       (eq_attr "type" "vecstore"))
155  "vsu2_cell+lsu_cell+slot01")
156
157;; Simple integer insns: latency is 2 cycles
158(define_insn_reservation "cell-integer" 2
159  (and (eq_attr "type" "integer,insert_dword,shift,trap,\
160			var_shift_rotate,cntlz,exts,isel")
161       (eq_attr "cpu" "cell"))
162  "slot01,fxu_cell")
163
164;; Type "two" (two integer insns): latency is 4 cycles
165(define_insn_reservation "cell-two" 4
166  (and (eq_attr "cpu" "cell")
167       (eq_attr "type" "two"))
168  "slot01,fxu_cell,fxu_cell*2")
169
170;; Type "three" (three integer insns): latency is 6 cycles
171(define_insn_reservation "cell-three" 6
172  (and (eq_attr "cpu" "cell")
173       (eq_attr "type" "three"))
174  "slot01,fxu_cell,fxu_cell*4")
175
176;; rlwimi, alters cr0
177(define_insn_reservation "cell-insert" 2
178  (and (eq_attr "cpu" "cell")
179       (eq_attr "type" "insert_word"))
180  "slot01,fxu_cell")
181
182;; cmpi, cmpli, cmpla, add, addo, sub, subo, which alter cr0
183(define_insn_reservation "cell-cmp" 1
184  (and (eq_attr "cpu" "cell")
185       (eq_attr "type" "cmp"))
186  "fxu_cell+slot01")
187
188;; add, addo, sub, subo, alter cr0, rldcli, rlwinm (not microcoded)
189(define_insn_reservation "cell-fast-cmp" 2
190  (and (eq_attr "type" "fast_compare,delayed_compare,compare,\
191			    var_delayed_compare")
192       (and (eq_attr "cell_micro" "not")
193            (eq_attr "cpu" "cell")))
194  "slot01,fxu_cell")
195
196(define_insn_reservation "cell-cmp-microcoded" 9
197  (and (eq_attr "type" "fast_compare,delayed_compare,compare,\
198			    var_delayed_compare")
199       (and (eq_attr "cell_micro" "always")
200            (eq_attr "cpu" "cell")))
201  "slot0+slot1,fxu_cell,fxu_cell*7")
202
203;; mulld: issued in slot1, not pipelined
204(define_insn_reservation "cell-lmul" 15
205  (and (eq_attr "cpu" "cell")
206       (eq_attr "type" "lmul"))
207  "slot1,nonpipeline,nonpipeline*13")
208
209;; mulld. is microcoded and takes both slots
210(define_insn_reservation "cell-lmul-cmp" 22
211  (and (eq_attr "cpu" "cell")
212       (eq_attr "type" "lmul_compare"))
213  "slot0+slot1,nonpipeline,nonpipeline*20")
214
215;; mulli: 6 cycles
216(define_insn_reservation "cell-imul23" 6
217  (and (eq_attr "cpu" "cell")
218       (eq_attr "type" "imul2,imul3"))
219  "slot1,nonpipeline,nonpipeline*4")
220
221;; mullw: 9 cycles
222(define_insn_reservation "cell-imul" 9
223  (and (eq_attr "cpu" "cell")
224       (eq_attr "type" "imul"))
225  "slot1,nonpipeline,nonpipeline*7")
226
227;; Integer divides
228(define_insn_reservation "cell-idiv" 32
229  (and (eq_attr "cpu" "cell")
230       (eq_attr "type" "idiv"))
231  "slot1,nonpipeline,nonpipeline*30")
232
233(define_insn_reservation "cell-ldiv" 64
234  (and (eq_attr "cpu" "cell")
235       (eq_attr "type" "ldiv"))
236  "slot1,nonpipeline,nonpipeline*62")
237
238;; mflr and mfctr are pipelined
239(define_insn_reservation "cell-mfjmpr" 1
240  (and (eq_attr "cpu" "cell")
241       (eq_attr "type" "mfjmpr"))
242  "slot01+bru_cell")
243
244;; mtlr and mtctr;
245;; mtspr is fully pipelined
246(define_insn_reservation "cell-mtjmpr" 1
247  (and (eq_attr "cpu" "cell")
248       (eq_attr "type" "mtjmpr"))
249  "bru_cell+slot01")
250
251;; Branches
252;; b, ba, bl, bla: unconditional branches always predict correctly, latency n/a
253;; bcctr, bcctrl: latency 2, actually adjusted by be to 4
254(define_insn_reservation "cell-branch" 1
255  (and (eq_attr "cpu" "cell")
256       (eq_attr "type" "branch"))
257  "bru_cell+slot1")
258
259(define_insn_reservation "cell-branchreg" 1
260  (and (eq_attr "cpu" "cell")
261       (eq_attr "type" "jmpreg"))
262  "bru_cell+slot1")
263
264;; CR hazard
265;; page 90, special cases for CR hazard: only one instr can access cr per cycle;
266;; if an insn reads CR following a stwcx, the pipeline stalls till stwcx finishes
267(define_insn_reservation "cell-crlogical" 1
268  (and (eq_attr "cpu" "cell")
269       (eq_attr "type" "cr_logical,delayed_cr"))
270  "bru_cell+slot01")
271
272;; mfcrf and mfcr are about 34 cycles and nonpipelined
273(define_insn_reservation "cell-mfcr" 34
274  (and (eq_attr "cpu" "cell")
275       (eq_attr "type" "mfcrf,mfcr"))
276  "slot1,nonpipeline,nonpipeline*32")
277
278;; mtcrf (1 field)
279(define_insn_reservation "cell-mtcrf" 1
280  (and (eq_attr "cpu" "cell")
281       (eq_attr "type" "mtcr"))
282  "fxu_cell+slot01")
283
284;; Basic FP latency is 10 cycles, throughput is 1/cycle
285(define_insn_reservation "cell-fp" 10
286  (and (eq_attr "cpu" "cell")
287       (eq_attr "type" "fp,dmul"))
288  "slot01,vsu1_cell,vsu1_cell*8")
289
290(define_insn_reservation "cell-fpcompare" 1
291  (and (eq_attr "cpu" "cell")
292       (eq_attr "type" "fpcompare"))
293  "vsu1_cell+slot01")
294
295;; sdiv throughput 1/74, not pipelined but only in the FPU
296(define_insn_reservation "cell-sdiv" 74
297  (and (eq_attr "cpu" "cell")
298       (eq_attr "type" "sdiv,ddiv"))
299  "slot1,nonpipeline,nonpipeline*72")
300
301;; fsqrt throughput 1/84, not pipelined but only in the FPU
302(define_insn_reservation "cell-sqrt" 84
303  (and (eq_attr "cpu" "cell")
304       (eq_attr "type" "ssqrt,dsqrt"))
305  "slot1,nonpipeline,nonpipeline*82")
306
307;; VMX
308(define_insn_reservation "cell-vecsimple" 4
309  (and (eq_attr "cpu" "cell")
310       (eq_attr "type" "vecsimple"))
311  "slot01,vsu1_cell,vsu1_cell*2")
312
313;; mult, div, madd
314(define_insn_reservation "cell-veccomplex" 10
315  (and (eq_attr "cpu" "cell")
316       (eq_attr "type" "veccomplex"))
317  "slot01,vsu1_cell,vsu1_cell*8")
318
319;; TODO: add support for recording instructions
320(define_insn_reservation "cell-veccmp" 4
321  (and (eq_attr "cpu" "cell")
322       (eq_attr "type" "veccmp"))
323  "slot01,vsu1_cell,vsu1_cell*2")
324
325(define_insn_reservation "cell-vecfloat" 12
326  (and (eq_attr "cpu" "cell")
327       (eq_attr "type" "vecfloat"))
328  "slot01,vsu1_cell,vsu1_cell*10")
329
330(define_insn_reservation "cell-vecperm" 4
331  (and (eq_attr "cpu" "cell")
332       (eq_attr "type" "vecperm"))
333  "slot01,vsu2_cell,vsu2_cell*2")
334
335;; Syncs (new for 4.2)
336
337(define_insn_reservation "cell-sync" 11
338  (and (eq_attr "cpu" "cell")
339       (eq_attr "type" "sync"))
340  "slot01,lsu_cell,lsu_cell*9")
341
342(define_insn_reservation "cell-isync" 11
343  (and (eq_attr "cpu" "cell")
344       (eq_attr "type" "isync"))
345  "slot01,lsu_cell,lsu_cell*9")
346
347(define_insn_reservation "cell-load_l" 11
348  (and (eq_attr "cpu" "cell")
349       (eq_attr "type" "load_l"))
350  "slot01,lsu_cell,lsu_cell*9")
351
352(define_insn_reservation "cell-store_c" 11
353  (and (eq_attr "cpu" "cell")
354       (eq_attr "type" "store_c"))
355  "slot01,lsu_cell,lsu_cell*9")
356
357;; RAW register dependency
358
359;; addi r3, r3, 1
360;; lw r4,offset(r3)
361;; there is a 5 cycle delay for r3 bypassing
362;; there is a 5 cycle delay for a dependent load after a load
363(define_bypass 5 "cell-integer" "cell-load")
364(define_bypass 5 "cell-integer" "cell-load-ext")
365(define_bypass 5 "cell-load,cell-load-ext" "cell-load,cell-load-ext")
366
367;; there is a 6 cycle delay after a fp compare until you can use the cr.
368(define_bypass 6 "cell-fpcompare" "cell-branch,cell-branchreg,cell-mfcr,cell-crlogical")
369
370;; VXU float RAW
371(define_bypass 11 "cell-vecfloat" "cell-vecfloat")
372
373;; VXU and FPU
374(define_bypass 6 "cell-veccomplex" "cell-vecsimple")
;; NOTE(review): the disabled bypass below names "cell-veccompare"; the vector
;; compare reservation in this file is called "cell-veccmp".
375;;(define_bypass 6 "cell-veccompare" "cell-branch,cell-branchreg")
376(define_bypass 3 "cell-vecfloat" "cell-veccomplex")
377; this is not correct,
378;;  this is a stall in general and not dependent on result
379(define_bypass 13 "cell-vecstore" "cell-fpstore")
380; this is not correct, this can never be true, not dependent on result
381(define_bypass 7 "cell-fp" "cell-fpload")
382;; vsu1 should avoid writing to the same target register as vsu2 insn
383;;   within 12 cycles.
384
385;; WAW hazard
386
387;; the target of VSU estimate should not be reused within 10 dispatch groups
388;; the target of VSU float should not be reused within 8 dispatch groups
389;; the target of VSU complex should not be reused within 5 dispatch groups
390;; FP LOAD should not reuse an FPU Arithmetic target within 6 dispatch groups
391
392;; mtctr-bcctr/bcctrl, branch target ctr register shadow update at
393;;  ex4 stage(10 cycles)
394(define_bypass 10 "cell-mtjmpr" "cell-branchreg")
395
396;; Things that are not simulated:
397;; update instruction, update address gpr are not simulated
398;; vrefp, vrsqrtefp have latency(14), currently simulated as 12 cycle float
399;;  insns
400
401