;; ARM 1136J[F]-S Pipeline Description
;; Copyright (C) 2003, 2007 Free Software Foundation, Inc.
;; Written by CodeSourcery, LLC.
;;
;; This file is part of GCC.
;;
;; GCC is free software; you can redistribute it and/or modify it
;; under the terms of the GNU General Public License as published by
;; the Free Software Foundation; either version 3, or (at your option)
;; any later version.
;;
;; GCC is distributed in the hope that it will be useful, but
;; WITHOUT ANY WARRANTY; without even the implied warranty of
;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;; General Public License for more details.
;;
;; You should have received a copy of the GNU General Public License
;; along with GCC; see the file COPYING3.  If not see
;; <http://www.gnu.org/licenses/>.

;; These descriptions are based on the information contained in the
;; ARM1136JF-S Technical Reference Manual, Copyright (c) 2003 ARM
;; Limited.
;;

;; This automaton provides a pipeline description for the ARM
;; 1136J-S and 1136JF-S cores.
;;
;; The model given here assumes that the condition for all conditional
;; instructions is "true", i.e., that all of the instructions are
;; actually executed.

(define_automaton "arm1136jfs")

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Pipelines
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; There are three distinct pipelines (page 1-26 and following), all
;; fed by a common decode pipeline:
;;
;; - A 4-stage decode pipeline, shared by all three.  It has fetch (1),
;;   fetch (2), decode, and issue stages.  Since this is always involved,
;;   we do not model it in the scheduler.
;;
;; - A 4-stage ALU pipeline.  It has shifter, ALU (main integer operations),
;;   and saturation stages.  The fourth stage is writeback; see below.
;;
;; - A 4-stage multiply-accumulate pipeline.  It has three stages, called
;;   MAC1 through MAC3, and a fourth writeback stage.
;;
;;   The 4th-stage writeback is shared between the ALU and MAC pipelines,
;;   which operate in lockstep.  Results from either pipeline will be
;;   moved into the writeback stage.  Because the two pipelines operate
;;   in lockstep, we schedule them as a single "execute" pipeline.
;;
;; - A 4-stage LSU pipeline.  It has address generation, data cache (1),
;;   data cache (2), and writeback stages.  (Note that this pipeline,
;;   including the writeback stage, is independent from the ALU & MAC pipes.)

;; Combined ALU/MAC "execute" pipeline:
;;   e_1 = Sh/Mac1, e_2 = ALU/Mac2, e_3 = SAT/Mac3, e_wb = shared writeback
(define_cpu_unit "e_1,e_2,e_3,e_wb" "arm1136jfs")

;; Load/Store pipeline, with its own writeback unit (l_wb)
(define_cpu_unit "l_a,l_dc1,l_dc2,l_wb" "arm1136jfs")

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; ALU Instructions
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; ALU instructions occupy each of the four units of the execute
;; pipeline (e_1, e_2, e_3, e_wb) in turn.  The results are available
;; after the alu stage has finished, hence the latency of 2 below.
;;
;; If the destination register is the PC, the pipelines are stalled
;; for several cycles.  That case is not modelled here.

;; ALU operations with no shifted operand
(define_insn_reservation "11_alu_op" 2
  (and (eq_attr "tune" "arm1136js,arm1136jfs")
       (eq_attr "type" "alu"))
  "e_1,e_2,e_3,e_wb")

;; ALU operations with a shift-by-constant operand
(define_insn_reservation "11_alu_shift_op" 2
  (and (eq_attr "tune" "arm1136js,arm1136jfs")
       (eq_attr "type" "alu_shift"))
  "e_1,e_2,e_3,e_wb")

;; ALU operations with a shift-by-register operand
;; These really stall in the decoder, in order to read
;; the shift value in a second cycle.  Pretend we take two cycles in
;; the shift stage.
(define_insn_reservation "11_alu_shift_reg_op" 3
  (and (eq_attr "tune" "arm1136js,arm1136jfs")
       (eq_attr "type" "alu_shift_reg"))
  "e_1*2,e_2,e_3,e_wb")

;; alu_ops can start sooner, if there is no shifter dependency.
;; Producers without a register shift forward with a latency of 1 ...
(define_bypass 1 "11_alu_op,11_alu_shift_op"
  "11_alu_op")
(define_bypass 1 "11_alu_op,11_alu_shift_op"
  "11_alu_shift_op"
  "arm_no_early_alu_shift_value_dep")
(define_bypass 1 "11_alu_op,11_alu_shift_op"
  "11_alu_shift_reg_op"
  "arm_no_early_alu_shift_dep")

;; ... while shift-by-register producers forward with a latency of 2.
(define_bypass 2 "11_alu_shift_reg_op"
  "11_alu_op")
(define_bypass 2 "11_alu_shift_reg_op"
  "11_alu_shift_op"
  "arm_no_early_alu_shift_value_dep")
(define_bypass 2 "11_alu_shift_reg_op"
  "11_alu_shift_reg_op"
  "arm_no_early_alu_shift_dep")

;; The same latencies apply when the consumer is a multiply, subject
;; to the arm_no_early_mul_dep guard.
(define_bypass 1 "11_alu_op,11_alu_shift_op"
  "11_mult1,11_mult2,11_mult3,11_mult4,11_mult5,11_mult6,11_mult7"
  "arm_no_early_mul_dep")
(define_bypass 2 "11_alu_shift_reg_op"
  "11_mult1,11_mult2,11_mult3,11_mult4,11_mult5,11_mult6,11_mult7"
  "arm_no_early_mul_dep")

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Multiplication Instructions
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Multiplication instructions loop in the first two execute stages until
;; the instruction has been passed through the multiplier array enough
;; times.

;; Multiply and multiply-accumulate results are available after four stages.
(define_insn_reservation "11_mult1" 4
  (and (eq_attr "tune" "arm1136js,arm1136jfs")
       (eq_attr "insn" "mul,mla"))
  "e_1*2,e_2,e_3,e_wb")

;; The *S variants set the condition flags, which requires three more cycles.
;; NOTE(review): the 11_mult2 reservation nevertheless uses the same
;; latency (4) as 11_mult1 -- confirm whether the extra cycles should be
;; modelled.
(define_insn_reservation "11_mult2" 4
  (and (eq_attr "tune" "arm1136js,arm1136jfs")
       (eq_attr "insn" "muls,mlas"))
  "e_1*2,e_2,e_3,e_wb")

;; Results of 11_mult1/11_mult2 are forwarded with a latency of 3 to
;; multiplies, ALU operations and stores, subject to the listed guards.
(define_bypass 3 "11_mult1,11_mult2"
  "11_mult1,11_mult2,11_mult3,11_mult4,11_mult5,11_mult6,11_mult7"
  "arm_no_early_mul_dep")
(define_bypass 3 "11_mult1,11_mult2"
  "11_alu_op")
(define_bypass 3 "11_mult1,11_mult2"
  "11_alu_shift_op"
  "arm_no_early_alu_shift_value_dep")
(define_bypass 3 "11_mult1,11_mult2"
  "11_alu_shift_reg_op"
  "arm_no_early_alu_shift_dep")
(define_bypass 3 "11_mult1,11_mult2"
  "11_store1"
  "arm_no_early_store_addr_dep")

;; Signed and unsigned multiply long results are available across two cycles;
;; the less significant word is available one cycle before the more significant
;; word.  Here we conservatively wait until both are available, which is
;; after three iterations and the memory cycle.  The same is also true of
;; the two multiply-accumulate instructions.
(define_insn_reservation "11_mult3" 5
  (and (eq_attr "tune" "arm1136js,arm1136jfs")
       (eq_attr "insn" "smull,umull,smlal,umlal"))
  "e_1*3,e_2,e_3,e_wb*2")

;; The *S variants set the condition flags, which requires three more cycles.
;; NOTE(review): the 11_mult4 reservation nevertheless uses the same
;; latency (5) as 11_mult3 -- confirm whether the extra cycles should be
;; modelled.
(define_insn_reservation "11_mult4" 5
  (and (eq_attr "tune" "arm1136js,arm1136jfs")
       (eq_attr "insn" "smulls,umulls,smlals,umlals"))
  "e_1*3,e_2,e_3,e_wb*2")

;; Results of 11_mult3/11_mult4 are forwarded with a latency of 4 to
;; multiplies, ALU operations and stores, subject to the listed guards.
(define_bypass 4 "11_mult3,11_mult4"
  "11_mult1,11_mult2,11_mult3,11_mult4,11_mult5,11_mult6,11_mult7"
  "arm_no_early_mul_dep")
(define_bypass 4 "11_mult3,11_mult4"
  "11_alu_op")
(define_bypass 4 "11_mult3,11_mult4"
  "11_alu_shift_op"
  "arm_no_early_alu_shift_value_dep")
(define_bypass 4 "11_mult3,11_mult4"
  "11_alu_shift_reg_op"
  "arm_no_early_alu_shift_dep")
(define_bypass 4 "11_mult3,11_mult4"
  "11_store1"
  "arm_no_early_store_addr_dep")

;; Various 16x16->32 multiplies and multiply-accumulates, using combinations
;; of high and low halves of the argument registers.  They take a single
;; pass through the pipeline and make the result available after three
;; cycles.
(define_insn_reservation "11_mult5" 3
  (and (eq_attr "tune" "arm1136js,arm1136jfs")
       (eq_attr "insn" "smulxy,smlaxy,smulwy,smlawy,smuad,smuadx,smlad,smladx,smusd,smusdx,smlsd,smlsdx"))
  "e_1,e_2,e_3,e_wb")

;; Results of 11_mult5 are forwarded with a latency of 2 to multiplies,
;; ALU operations and stores, subject to the listed guards.
(define_bypass 2 "11_mult5"
  "11_mult1,11_mult2,11_mult3,11_mult4,11_mult5,11_mult6,11_mult7"
  "arm_no_early_mul_dep")
(define_bypass 2 "11_mult5"
  "11_alu_op")
(define_bypass 2 "11_mult5"
  "11_alu_shift_op"
  "arm_no_early_alu_shift_value_dep")
(define_bypass 2 "11_mult5"
  "11_alu_shift_reg_op"
  "arm_no_early_alu_shift_dep")
(define_bypass 2 "11_mult5"
  "11_store1"
  "arm_no_early_store_addr_dep")

;; The same idea, then the 32-bit result is added to a 64-bit quantity.
(define_insn_reservation "11_mult6" 4
  (and (eq_attr "tune" "arm1136js,arm1136jfs")
       (eq_attr "insn" "smlalxy"))
  "e_1*2,e_2,e_3,e_wb*2")

;; Signed 32x32 multiply, then the most significant 32 bits are extracted
;; and are available after the memory stage.
(define_insn_reservation "11_mult7" 4
  (and (eq_attr "tune" "arm1136js,arm1136jfs")
       (eq_attr "insn" "smmul,smmulr"))
  "e_1*2,e_2,e_3,e_wb")

;; Results of 11_mult6/11_mult7 are forwarded with a latency of 3 to
;; multiplies, ALU operations and stores, subject to the listed guards.
(define_bypass 3 "11_mult6,11_mult7"
  "11_mult1,11_mult2,11_mult3,11_mult4,11_mult5,11_mult6,11_mult7"
  "arm_no_early_mul_dep")
(define_bypass 3 "11_mult6,11_mult7"
  "11_alu_op")
(define_bypass 3 "11_mult6,11_mult7"
  "11_alu_shift_op"
  "arm_no_early_alu_shift_value_dep")
(define_bypass 3 "11_mult6,11_mult7"
  "11_alu_shift_reg_op"
  "arm_no_early_alu_shift_dep")
(define_bypass 3 "11_mult6,11_mult7"
  "11_store1"
  "arm_no_early_store_addr_dep")

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Branch Instructions
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; These vary greatly depending on their arguments and the results of
;; branch prediction.  Cycle count ranges from zero (unconditional branch,
;; folded dynamic prediction) to seven (incorrect predictions, etc).  We
;; assume an optimal case for now, because the cost of a cache miss
;; overwhelms the cost of everything else anyhow.

(define_insn_reservation "11_branches" 0
  (and (eq_attr "tune" "arm1136js,arm1136jfs")
       (eq_attr "type" "branch"))
  "nothing")

;; Call latencies are not predictable.  A semi-arbitrary very large
;; number is used as "positive infinity" so that everything should be
;; finished by the time of return.
(define_insn_reservation "11_call" 32
  (and (eq_attr "tune" "arm1136js,arm1136jfs")
       (eq_attr "type" "call"))
  "nothing")

;; Branches are predicted.  A correctly predicted branch will be no
;; cost, but we're conservative here, and use the timings a
;; late-register would give us.
(define_bypass 1 "11_alu_op,11_alu_shift_op"
  "11_branches")
(define_bypass 2 "11_alu_shift_reg_op"
  "11_branches")
(define_bypass 2 "11_load1,11_load2"
  "11_branches")
(define_bypass 3 "11_load34"
  "11_branches")

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Load/Store Instructions
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; The models for load/store instructions do not distinguish operations
;; that perform a base register writeback from those that do not.
;; These models assume that all memory references hit in dcache.  Also,
;; if the PC is one of the registers involved, there are additional stalls
;; not modelled here.  Addressing modes are also not modelled.

(define_insn_reservation "11_load1" 3
  (and (eq_attr "tune" "arm1136js,arm1136jfs")
       (eq_attr "type" "load1"))
  "l_a+e_1,l_dc1,l_dc2,l_wb")

;; Load byte results are not available until the writeback stage, where
;; the correct byte is extracted.

(define_insn_reservation "11_loadb" 4
  (and (eq_attr "tune" "arm1136js,arm1136jfs")
       (eq_attr "type" "load_byte"))
  "l_a+e_1,l_dc1,l_dc2,l_wb")

(define_insn_reservation "11_store1" 0
  (and (eq_attr "tune" "arm1136js,arm1136jfs")
       (eq_attr "type" "store1"))
  "l_a+e_1,l_dc1,l_dc2,l_wb")

;; Load/store double words into adjacent registers.  The timing and
;; latencies are different depending on whether the address is 64-bit
;; aligned.  This model assumes that it is.
(define_insn_reservation "11_load2" 3
  (and (eq_attr "tune" "arm1136js,arm1136jfs")
       (eq_attr "type" "load2"))
  "l_a+e_1,l_dc1,l_dc2,l_wb")

(define_insn_reservation "11_store2" 0
  (and (eq_attr "tune" "arm1136js,arm1136jfs")
       (eq_attr "type" "store2"))
  "l_a+e_1,l_dc1,l_dc2,l_wb")

;; Load/store multiple registers.  Two registers are stored per cycle.
;; Actual timing depends on how many registers are affected, so we
;; optimistically schedule a low latency.
(define_insn_reservation "11_load34" 4
  (and (eq_attr "tune" "arm1136js,arm1136jfs")
       (eq_attr "type" "load3,load4"))
  "l_a+e_1,l_dc1*2,l_dc2,l_wb")

(define_insn_reservation "11_store34" 0
  (and (eq_attr "tune" "arm1136js,arm1136jfs")
       (eq_attr "type" "store3,store4"))
  "l_a+e_1,l_dc1*2,l_dc2,l_wb")

;; A store can start immediately after an alu op, if that alu op does
;; not provide part of the address to access.
(define_bypass 1 "11_alu_op,11_alu_shift_op"
  "11_store1"
  "arm_no_early_store_addr_dep")
(define_bypass 2 "11_alu_shift_reg_op"
  "11_store1"
  "arm_no_early_store_addr_dep")

;; An alu op can start sooner after a load, if that alu op does not
;; have an early register dependency on the load.
;; Word loads forward with a latency of 2 ...
(define_bypass 2 "11_load1"
  "11_alu_op")
(define_bypass 2 "11_load1"
  "11_alu_shift_op"
  "arm_no_early_alu_shift_value_dep")
(define_bypass 2 "11_load1"
  "11_alu_shift_reg_op"
  "arm_no_early_alu_shift_dep")

;; ... and byte loads with a latency of 3.
(define_bypass 3 "11_loadb"
  "11_alu_op")
(define_bypass 3 "11_loadb"
  "11_alu_shift_op"
  "arm_no_early_alu_shift_value_dep")
(define_bypass 3 "11_loadb"
  "11_alu_shift_reg_op"
  "arm_no_early_alu_shift_dep")

;; A mul op can start sooner after a load, if that mul op does not
;; have an early multiply dependency
(define_bypass 2 "11_load1"
  "11_mult1,11_mult2,11_mult3,11_mult4,11_mult5,11_mult6,11_mult7"
  "arm_no_early_mul_dep")
(define_bypass 3 "11_load34"
  "11_mult1,11_mult2,11_mult3,11_mult4,11_mult5,11_mult6,11_mult7"
  "arm_no_early_mul_dep")
(define_bypass 3 "11_loadb"
  "11_mult1,11_mult2,11_mult3,11_mult4,11_mult5,11_mult6,11_mult7"
  "arm_no_early_mul_dep")

;; A store can start sooner after a load, if that load does not
;; produce part of the address to access
(define_bypass 2 "11_load1"
  "11_store1"
  "arm_no_early_store_addr_dep")
(define_bypass 3 "11_loadb"
  "11_store1"
  "arm_no_early_store_addr_dep")