;; Scheduling description for cell processor.
;; Copyright (C) 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2009
;; Free Software Foundation, Inc.
;; Contributed by Sony Computer Entertainment, Inc.,


;; This file is free software; you can redistribute it and/or modify it under
;; the terms of the GNU General Public License as published by the Free
;; Software Foundation; either version 3 of the License, or (at your option)
;; any later version.

;; This file is distributed in the hope that it will be useful, but WITHOUT
;; ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
;; FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
;; for more details.

;; You should have received a copy of the GNU General Public License
;; along with GCC; see the file COPYING3.  If not see
;; <http://www.gnu.org/licenses/>.

;; Sources: BE BOOK4 (/sfs/enc/doc/PPU_BookIV_DD3.0_latest.pdf)

;; BE Architecture *DD3.0 and DD3.1*
;; This file simulates the PPU processor unit backend of the pipeline,
;; manual P24.
;; manual P27, stall and flush points
;; IU, XU, VSU: the dispatcher decodes and dispatches 2 insns per cycle in
;; program order; the grouped addresses are aligned by 8.
;; This file only simulates the single-thread situation.
;; XU executes all fixed point insns (3 units: a simple alu, a complex unit,
;; and a load/store unit)
;; VSU executes all scalar floating point insns (a float unit) and
;; VMX insns (VMX unit, 4 sub units: simple, permute, complex, floating point)

;; Dual issue combination

;; NOTE(review): the column alignment of the X marks in this table was lost;
;; verify the column each X belongs to against the BE Book IV manual.
;; FXU LSU BR VMX VMX
;; (sx,cx,vsu_fp,fp_arith) (perm,vsu_ls,fp_ls)
;;FXU X
;;LSU X X X
;;BR X
;;VMX(sx,cx,vsu_fp,fp_arith) X
;;VMX(perm,vsu_ls,fp_ls) X
;; X marks an illegal combination.
;; Dual issue exceptions:
;;(1) non-pipelined FXU instr in slot 0
;;(2) non-pipelined FPU inst in slot 0
;;    CSI instr (context-synchronizing insn)
;;    Microcode insn

;; BRU unit: bru (no register stall), bru_cr (cr register stall)
;; VSU unit: vus (vmx simple), vup (vmx permute), vuc (vmx complex),
;; vuf (vmx float), fpu (floats).  fpu_div is hypothetical, it is for
;; nonpipelined simulation
;; micro insns will stall at least 7 cycles to get the first instr from ROM,
;; micro instructions are not dual issued.

;; slot0 is older than slot1
;; non-pipelined insns need to be in slot1 to avoid a 1 cycle stall

;; There are different stall points
;; IB2, only stalls one thread if the stall is here, so try to stall here as
;; much as we can
;; condition(1) insert nop, OR and ORI instruction form
;; condition(2) flush happens, in case of: RAW, WAW, D-ERAT miss, or
;;              CR0-access while stdcx, or stwcx
;; IS2 stall ;; Page91 for details
;; VQ8 stall
;; IS2 stall can be activated by VQ8 stall and trying to issue a vsu instr to
;; the vsu issue queue

;;(define_automaton "cellxu")

;;(define_cpu_unit "fxu_cell,lsu_cell,bru_cell,vsu1_cell,vsu2_cell" "cellxu")

;; ndfa
;; Four automata: fixed point (cellxu), vector/scalar (cellvsu),
;; branch (cellbru) and the issue slots (cell_mis).
(define_automaton "cellxu,cellvsu,cellbru,cell_mis")

;; Execution units, one declaration per automaton.
(define_cpu_unit "fxu_cell,lsu_cell"   "cellxu")
(define_cpu_unit "bru_cell"            "cellbru")
(define_cpu_unit "vsu1_cell,vsu2_cell" "cellvsu")

;; The two issue slots.
(define_cpu_unit "slot0,slot1" "cell_mis")

;; slot0 may only be reserved while slot1 is not reserved.
(absence_set "slot0" "slot1")

;; "nonpipeline" occupies fxu, lsu and both vsu units at the same time;
;; "slot01" accepts either issue slot.
(define_reservation "nonpipeline" "fxu_cell+lsu_cell+vsu1_cell+vsu2_cell")
(define_reservation "slot01"      "slot0|slot1")


;; Load/store
;; lmw, lswi, lswx are only generated when optimizing for space, MC,
;; these instrs are not simulated
(define_insn_reservation "cell-load" 2
  (and (eq_attr "type" "load")
       (eq_attr "cpu" "cell"))
  "slot01,lsu_cell")

;; ldux, ldu, lbzux, lbzu, hardware breaks it down to two instrs,
;; if with 32bytes alignment, CMC
(define_insn_reservation "cell-load-ux" 2
  (and (eq_attr "type" "load_ux,load_u")
       (eq_attr "cpu" "cell"))
  "slot01,fxu_cell+lsu_cell")

;; lha, lhax, lhau, lhaux, lwa, lwax, lwaux, MC, latency unknown
;;   11/7, 11/8, 11/12
(define_insn_reservation "cell-load-ext" 2
  (and (eq_attr "type" "load_ext,load_ext_u,load_ext_ux")
       (eq_attr "cpu" "cell"))
  "slot01,fxu_cell+lsu_cell")

;;lfs,lfsx,lfd,lfdx, 1 cycle
(define_insn_reservation "cell-fpload" 1
  (and (eq_attr "type" "fpload")
       (eq_attr "cpu" "cell"))
  "vsu2_cell+lsu_cell+slot01")

;; lfsu,lfsux,lfdu,lfdux 1cycle(fpr) 2 cycle(gpr)
;; Only the update forms are listed here.  Plain "fpload" is claimed by
;; cell-fpload above; naming it here as well would make two reservation
;; conditions true for the same insn, and which reservation is then used
;; is not defined.
(define_insn_reservation "cell-fpload-update" 1
  (and (eq_attr "type" "fpload_u,fpload_ux")
       (eq_attr "cpu" "cell"))
  "fxu_cell+vsu2_cell+lsu_cell+slot01")

(define_insn_reservation "cell-vecload" 2
  (and (eq_attr "type" "vecload")
       (eq_attr "cpu" "cell"))
  "slot01,vsu2_cell+lsu_cell")

;;st? stw(MC)
(define_insn_reservation "cell-store" 1
  (and (eq_attr "type" "store")
       (eq_attr "cpu" "cell"))
  "lsu_cell+slot01")

;;stdux, stdu, (hardware breaks into store and add) 2 for update reg
(define_insn_reservation "cell-store-update" 1
  (and (eq_attr "type" "store_ux,store_u")
       (eq_attr "cpu" "cell"))
  "fxu_cell+lsu_cell+slot01")

(define_insn_reservation "cell-fpstore" 1
  (and (eq_attr "type" "fpstore")
       (eq_attr "cpu" "cell"))
  "vsu2_cell+lsu_cell+slot01")

(define_insn_reservation "cell-fpstore-update" 1
  (and (eq_attr "type" "fpstore_ux,fpstore_u")
       (eq_attr "cpu" "cell"))
  "vsu2_cell+fxu_cell+lsu_cell+slot01")

(define_insn_reservation "cell-vecstore" 1
  (and (eq_attr "type" "vecstore")
       (eq_attr "cpu" "cell"))
  "vsu2_cell+lsu_cell+slot01")

;; Integer latency is 2 cycles
(define_insn_reservation "cell-integer" 2
  (and (eq_attr "type" "integer,insert_dword,shift,trap,\
                        var_shift_rotate,cntlz,exts,isel")
       (eq_attr "cpu" "cell"))
  "slot01,fxu_cell")

;; Two integer latency is 4 cycles
(define_insn_reservation "cell-two" 4
  (and (eq_attr "type" "two")
       (eq_attr "cpu" "cell"))
  "slot01,fxu_cell,fxu_cell*2")

;; Three integer latency is 6 cycles
(define_insn_reservation "cell-three" 6
  (and (eq_attr "type" "three")
       (eq_attr "cpu" "cell"))
  "slot01,fxu_cell,fxu_cell*4")

;; rlwimi, alter cr0
(define_insn_reservation "cell-insert" 2
  (and (eq_attr "type" "insert_word")
       (eq_attr "cpu" "cell"))
  "slot01,fxu_cell")

;; cmpi, cmpli, cmpla, add, addo, sub, subo, alter cr0
(define_insn_reservation "cell-cmp" 1
  (and (eq_attr "type" "cmp")
       (eq_attr "cpu" "cell"))
  "fxu_cell+slot01")

;; add, addo, sub, subo, alter cr0, rldcli, rlwinm
;; Non-microcoded form (cell_micro "not"): 2 cycle latency, either slot.
(define_insn_reservation "cell-fast-cmp" 2
  (and (and (eq_attr "type" "fast_compare,delayed_compare,compare,\
                             var_delayed_compare")
            (eq_attr "cpu" "cell"))
       (eq_attr "cell_micro" "not"))
  "slot01,fxu_cell")

;; Microcoded form (cell_micro "always") of the same types: takes both
;; issue slots and holds fxu_cell for 8 cycles.
(define_insn_reservation "cell-cmp-microcoded" 9
  (and (and (eq_attr "type" "fast_compare,delayed_compare,compare,\
                             var_delayed_compare")
            (eq_attr "cpu" "cell"))
       (eq_attr "cell_micro" "always"))
  "slot0+slot1,fxu_cell,fxu_cell*7")

;; mulld
(define_insn_reservation "cell-lmul" 15
  (and (eq_attr "type" "lmul")
       (eq_attr "cpu" "cell"))
  "slot1,nonpipeline,nonpipeline*13")

;; mulld.
;; is microcoded
;; (lmul_compare: issued with both slots taken, non-pipelined)
(define_insn_reservation "cell-lmul-cmp" 22
  (and (eq_attr "type" "lmul_compare") (eq_attr "cpu" "cell"))
  "slot0+slot1,nonpipeline,nonpipeline*20")

;; mulli, 6 cycles
(define_insn_reservation "cell-imul23" 6
  (and (eq_attr "type" "imul2,imul3") (eq_attr "cpu" "cell"))
  "slot1,nonpipeline,nonpipeline*4")

;; mullw, 9
(define_insn_reservation "cell-imul" 9
  (and (eq_attr "type" "imul") (eq_attr "cpu" "cell"))
  "slot1,nonpipeline,nonpipeline*7")

;; divide
(define_insn_reservation "cell-idiv" 32
  (and (eq_attr "type" "idiv") (eq_attr "cpu" "cell"))
  "slot1,nonpipeline,nonpipeline*30")

(define_insn_reservation "cell-ldiv" 64
  (and (eq_attr "type" "ldiv") (eq_attr "cpu" "cell"))
  "slot1,nonpipeline,nonpipeline*62")

;;mflr and mfctr are pipelined
(define_insn_reservation "cell-mfjmpr" 1
  (and (eq_attr "type" "mfjmpr") (eq_attr "cpu" "cell"))
  "slot01+bru_cell")

;;mtlr and mtctr,
;;mtspr fully pipelined
(define_insn_reservation "cell-mtjmpr" 1
  (and (eq_attr "type" "mtjmpr") (eq_attr "cpu" "cell"))
  "bru_cell+slot01")

;; Branches
;; b, ba, bl, bla, unconditional branch always predicts correctly n/a latency
;; bcctr, bcctrl, latency 2, actually adjust by be to 4
(define_insn_reservation "cell-branch" 1
  (and (eq_attr "type" "branch") (eq_attr "cpu" "cell"))
  "bru_cell+slot1")

(define_insn_reservation "cell-branchreg" 1
  (and (eq_attr "type" "jmpreg") (eq_attr "cpu" "cell"))
  "bru_cell+slot1")

;; cr hazard
;; page 90, special cases for CR hazard, only one instr can access cr per cycle
;; if an insn reads CR following a stwcx, the pipeline stalls till stwcx
;; finishes
(define_insn_reservation "cell-crlogical" 1
  (and (eq_attr "type" "cr_logical,delayed_cr") (eq_attr "cpu" "cell"))
  "bru_cell+slot01")

;; mfcrf and mfcr is about 34 cycles and nonpipelined
(define_insn_reservation "cell-mfcr" 34
  (and (eq_attr "type" "mfcrf,mfcr") (eq_attr "cpu" "cell"))
  "slot1,nonpipeline,nonpipeline*32")

;; mtcrf (1 field)
(define_insn_reservation "cell-mtcrf" 1
  (and (eq_attr "type" "mtcr") (eq_attr "cpu" "cell"))
  "fxu_cell+slot01")

; Basic FP latency is 10 cycles, throughput is 1/cycle
(define_insn_reservation "cell-fp" 10
  (and (eq_attr "type" "fp,dmul") (eq_attr "cpu" "cell"))
  "slot01,vsu1_cell,vsu1_cell*8")

(define_insn_reservation "cell-fpcompare" 1
  (and (eq_attr "type" "fpcompare") (eq_attr "cpu" "cell"))
  "vsu1_cell+slot01")

;; sdiv throughput 1/74, not pipelined but only in the FPU
(define_insn_reservation "cell-sdiv" 74
  (and (eq_attr "type" "sdiv,ddiv") (eq_attr "cpu" "cell"))
  "slot1,nonpipeline,nonpipeline*72")

;; fsqrt throughput 1/84, not pipelined but only in the FPU
(define_insn_reservation "cell-sqrt" 84
  (and (eq_attr "type" "ssqrt,dsqrt") (eq_attr "cpu" "cell"))
  "slot1,nonpipeline,nonpipeline*82")

; VMX
(define_insn_reservation "cell-vecsimple" 4
  (and (eq_attr "type" "vecsimple") (eq_attr "cpu" "cell"))
  "slot01,vsu1_cell,vsu1_cell*2")

;; mult, div, madd
(define_insn_reservation "cell-veccomplex" 10
  (and (eq_attr "type" "veccomplex") (eq_attr "cpu" "cell"))
  "slot01,vsu1_cell,vsu1_cell*8")

;; TODO: add support for recording instructions
(define_insn_reservation "cell-veccmp" 4
  (and (eq_attr "type" "veccmp") (eq_attr "cpu" "cell"))
  "slot01,vsu1_cell,vsu1_cell*2")

(define_insn_reservation "cell-vecfloat" 12
  (and (eq_attr "type" "vecfloat") (eq_attr "cpu" "cell"))
  "slot01,vsu1_cell,vsu1_cell*10")

(define_insn_reservation "cell-vecperm" 4
  (and (eq_attr "type" "vecperm") (eq_attr "cpu" "cell"))
  "slot01,vsu2_cell,vsu2_cell*2")

;; New for 4.2, syncs
;; All four share one shape: latency 11, lsu_cell held for 10 cycles.

(define_insn_reservation "cell-sync" 11
  (and (eq_attr "type" "sync") (eq_attr "cpu" "cell"))
  "slot01,lsu_cell,lsu_cell*9")

(define_insn_reservation "cell-isync" 11
  (and (eq_attr "type" "isync") (eq_attr "cpu" "cell"))
  "slot01,lsu_cell,lsu_cell*9")

(define_insn_reservation "cell-load_l" 11
  (and (eq_attr "type" "load_l") (eq_attr "cpu" "cell"))
  "slot01,lsu_cell,lsu_cell*9")

(define_insn_reservation "cell-store_c" 11
  (and (eq_attr "type" "store_c") (eq_attr "cpu" "cell"))
  "slot01,lsu_cell,lsu_cell*9")

;; RAW register dependency

;; addi r3, r3, 1
;; lw r4,offset(r3)
;; there is a 5 cycle delay for r3 bypassing
;; there is a 5 cycle delay for a dependent load after a load
(define_bypass 5 "cell-integer" "cell-load")
(define_bypass 5 "cell-integer" "cell-load-ext")
(define_bypass 5 "cell-load,cell-load-ext" "cell-load,cell-load-ext")

;; there is a 6 cycle delay after a fp compare until you can use the cr.
(define_bypass 6 "cell-fpcompare" "cell-branch,cell-branchreg,cell-mfcr,cell-crlogical")

;; VXU float RAW
(define_bypass 11 "cell-vecfloat" "cell-vecfloat")

;; VXU and FPU
(define_bypass 6 "cell-veccomplex" "cell-vecsimple")
;;(define_bypass 6 "cell-veccompare" "cell-branch,cell-branchreg")
(define_bypass 3 "cell-vecfloat" "cell-veccomplex")
; this is not correct,
;;  this is a stall in general and not dependent on result
(define_bypass 13 "cell-vecstore" "cell-fpstore")
; this is not correct, this can never be true, not dependent on result
(define_bypass 7 "cell-fp" "cell-fpload")
;; vsu1 should avoid writing to the same target register as vsu2 insn
;;   within 12 cycles.
;; WAW hazard

;; the target of VSU estimate should not be reused within 10 dispatch groups
;; the target of VSU float should not be reused within 8 dispatch groups
;; the target of VSU complex should not be reused within 5 dispatch groups
;; FP LOAD should not reuse an FPU Arithmetic target within 6 dispatch groups

;; mtctr-bcctr/bcctrl, branch target ctr register shadow update at
;;  ex4 stage(10 cycles)
(define_bypass 10
  "cell-mtjmpr"
  "cell-branchreg")

;;Things that are not simulated:
;; update instruction, update address gpr are not simulated
;; vrefp, vrsqrtefp have latency(14), currently simulated as 12 cycle float
;; insns